diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 35ccb71ab5..317124f8c8 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: get_docs_changes: uses: ./.github/workflows/get_docs_changes.yml @@ -17,9 +21,10 @@ jobs: needs: get_docs_changes if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' strategy: - fail-fast: false + fail-fast: true matrix: - os: ["ubuntu-latest", "macos-latest", "windows-latest"] + os: + - ubuntu-latest python-version: ["3.8.x", "3.9.x", "3.10.x", "3.11.x"] defaults: @@ -75,4 +80,4 @@ jobs: - name: Check matrix job results if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') run: | - echo "One or more matrix job tests failed or were cancelled. You may need to re-run them." && exit 1 + echo "One or more matrix job tests failed or were cancelled. You may need to re-run them." && exit 1 diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index bbed326344..02513618d6 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -7,6 +7,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: get_docs_changes: uses: ./.github/workflows/get_docs_changes.yml diff --git a/.github/workflows/test_build_images.yml b/.github/workflows/test_build_images.yml index 9668e23cb3..489d776f40 100644 --- a/.github/workflows/test_build_images.yml +++ b/.github/workflows/test_build_images.yml @@ -7,6 +7,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: get_docs_changes: uses: ./.github/workflows/get_docs_changes.yml diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 2160025ea0..2d96d2eb95 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -1,4 +1,3 @@ - name: test common on: @@ -8,6 +7,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: RUNTIME__LOG_LEVEL: ERROR @@ -92,6 +95,19 @@ jobs: name: Run smoke tests with minimum deps Windows shell: cmd + - name: Install pyarrow + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk + + - run: | + poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow + if: runner.os != 'Windows' + name: Run pipeline tests with pyarrow but no pandas installed + - run: | + poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow + if: runner.os == 'Windows' + name: Run pipeline tests with pyarrow but no pandas installed Windows + shell: cmd + - name: Install pipeline dependencies run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline diff --git a/.github/workflows/test_dbt_cloud.yml b/.github/workflows/test_dbt_cloud.yml index 2d06ac96ba..a123e051e8 100644 --- a/.github/workflows/test_dbt_cloud.yml +++ b/.github/workflows/test_dbt_cloud.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: # all 
credentials must be present to be passed to dbt cloud DBT_CLOUD__ACCOUNT_ID: ${{ secrets.DBT_CLOUD__ACCOUNT_ID }} diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index 5ae791c979..1c425f14e9 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index e9e17edefe..b94bdc6ee2 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index d77e35f088..acb5f35dfd 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 00027768a5..d11f7155d4 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_databricks.yml b/.github/workflows/test_destination_databricks.yml index f301a1b9ed..2a2fa8e10d 100644 --- a/.github/workflows/test_destination_databricks.yml +++ b/.github/workflows/test_destination_databricks.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index d1da25c067..f96c64219d 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_qdrant.yml b/.github/workflows/test_destination_qdrant.yml index 758c18b56b..3237801dbf 100644 --- a/.github/workflows/test_destination_qdrant.yml +++ b/.github/workflows/test_destination_qdrant.yml @@ -7,6 +7,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} @@ -28,7 +32,8 @@ jobs: strategy: fail-fast: false matrix: - os: 
["ubuntu-latest", "macos-latest", "windows-latest"] + os: + - ubuntu-latest defaults: run: shell: bash @@ -64,13 +69,7 @@ jobs: run: poetry install --no-interaction -E qdrant -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ - if: runner.os != 'Windows' - name: Run tests Linux/MAC - - run: | - poetry run pytest tests/load/ - if: runner.os == 'Windows' - name: Run tests Windows - shell: cmd + name: Run tests Linux matrix_job_required_check: name: Qdrant loader tests diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index 979ea3e917..1ef290682c 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index ecd890d32a..774c83314f 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -7,6 +7,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} @@ -24,7 +28,7 @@ jobs: run_loader: name: Tests Synapse loader needs: get_docs_changes - if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' strategy: fail-fast: false matrix: diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index c60d870b05..a635e2865c 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index d73c109894..7a862c5800 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -8,6 +8,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} @@ -54,7 +58,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant --with docs,sentry-sdk --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery --with docs,sentry-sdk --without airflow - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > docs/website/docs/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index a02957b69d..11377095d0 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -10,6 +10,10 @@ on: - devel workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + 
env: DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index afd0a00d4a..895ad08229 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,6 +12,15 @@ Thank you for considering contributing to **dlt**! We appreciate your help in ma 6. [Publishing (Maintainers Only)](#publishing-maintainers-only) 7. [Resources](#resources) +## Before You Begin + +- **Proposing significant changes or enhancements**: If you're thinking about making significant changes, make sure to [submit an issue](https://github.com/dlt-hub/dlt/issues/new/choose) first. This ensures your efforts align with the project's direction and that you don't invest time on a feature that may not be merged. + +- **Fixing bugs**: + - **Check existing issues**: search [open issues](https://github.com/dlt-hub/dlt/issues) to see if the bug you've found is already reported. + - If **not reported**, [create a new issue](https://github.com/dlt-hub/dlt/issues/new/choose). You're more than welcome to fix it and submit a pull request with your solution. Thank you! + - If the bug is **already reported**, please leave a comment on that issue stating you're working on fixing it. This helps keep everyone updated and avoids duplicate efforts. + ## Getting Started To get started, follow these steps: diff --git a/Makefile b/Makefile index 5aa2b2786c..ebf633d1eb 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,8 @@ dev: has-poetry poetry install --all-extras --with airflow --with docs --with providers --with pipeline --with sentry-sdk lint: - ./check-package.sh + ./tools/check-package.sh + poetry run python ./tools/check-lockfile.py poetry run mypy --config-file mypy.ini dlt tests poetry run flake8 --max-line-length=200 dlt poetry run flake8 --max-line-length=200 tests --exclude tests/reflection/module_cases @@ -60,8 +61,9 @@ format: # poetry run isort ./ test-and-lint-snippets: - poetry run mypy --config-file mypy.ini docs/website docs/examples - poetry run flake8 --max-line-length=200 docs/website docs/examples + cd docs/tools && poetry run python check_embedded_snippets.py full + poetry run mypy --config-file mypy.ini docs/website docs/examples docs/tools --exclude docs/tools/lint_setup + poetry run flake8 --max-line-length=200 docs/website docs/examples docs/tools cd docs/website/docs && poetry run pytest --ignore=node_modules lint-security: diff --git a/README.md b/README.md index 60c5c2f385..5cb681c570 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Load chess game data from chess.com API and save it in DuckDB: ```python import dlt from dlt.sources.helpers import requests + # Create a dlt pipeline that will load # chess player data to the DuckDB destination pipeline = dlt.pipeline( @@ -48,12 +49,14 @@ pipeline = dlt.pipeline( destination='duckdb', dataset_name='player_data' ) + # Grab some player data from Chess.com API data = [] for player in ['magnuscarlsen', 'rpragchess']: response = requests.get(f'https://api.chess.com/pub/player/{player}') response.raise_for_status() data.append(response.json()) + # Extract, normalize, and load the data pipeline.run(data, table_name='player') ``` diff --git a/dlt/__init__.py b/dlt/__init__.py index e2a6b1a3a7..eee105e47e 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -29,6 +29,8 @@ from dlt import sources from dlt.extract.decorators import source, resource, transformer, defer +from dlt.destinations.decorators import destination + from dlt.pipeline import ( pipeline as _pipeline, run, @@ -62,6 +64,7 @@ "resource", "transformer", "defer", + 
"destination", "pipeline", "run", "attach", diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 9894227046..2332c0286c 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -443,6 +443,12 @@ def main() -> int: pipe_cmd.add_argument( "--list-pipelines", "-l", default=False, action="store_true", help="List local pipelines" ) + pipe_cmd.add_argument( + "--hot-reload", + default=False, + action="store_true", + help="Reload streamlit app (for core development)", + ) pipe_cmd.add_argument("pipeline_name", nargs="?", help="Pipeline name") pipe_cmd.add_argument("--pipelines-dir", help="Pipelines working directory", default=None) pipe_cmd.add_argument( diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index 9981fa8493..0eb73ad7a8 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -1,5 +1,5 @@ import yaml -from typing import Any, Sequence, Tuple +from typing import Any, Optional, Sequence, Tuple import dlt from dlt.cli.exceptions import CliCommandException @@ -15,6 +15,7 @@ from dlt.cli import echo as fmt + DLT_PIPELINE_COMMAND_DOCS_URL = "https://dlthub.com/docs/reference/command-line-interface" @@ -25,6 +26,7 @@ def pipeline_command( verbosity: int, dataset_name: str = None, destination: TDestinationReferenceArg = None, + hot_reload: Optional[bool] = False, **command_kwargs: Any, ) -> None: if operation == "list": @@ -48,7 +50,8 @@ def pipeline_command( raise fmt.warning(str(e)) if not fmt.confirm( - "Do you want to attempt to restore the pipeline state from destination?", default=False + "Do you want to attempt to restore the pipeline state from destination?", + default=False, ): return destination = destination or fmt.text_input( @@ -58,7 +61,10 @@ def pipeline_command( f"Enter dataset name for pipeline {fmt.bold(pipeline_name)}" ) p = dlt.pipeline( - pipeline_name, pipelines_dir, destination=destination, dataset_name=dataset_name + pipeline_name, + pipelines_dir, + destination=destination, + dataset_name=dataset_name, ) p.sync_destination() if p.first_run: @@ -101,13 +107,29 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: if operation == "show": from dlt.common.runtime import signals - from dlt.helpers import streamlit_helper + from dlt.helpers.streamlit_app import index with signals.delayed_signals(): + streamlit_cmd = [ + "streamlit", + "run", + index.__file__, + "--client.showSidebarNavigation", + "false", + ] + + if hot_reload: + streamlit_cmd.append("--server.runOnSave") + streamlit_cmd.append("true") + + streamlit_cmd.append("--") + streamlit_cmd.append(pipeline_name) + if pipelines_dir: + streamlit_cmd.append("--pipelines-dir") + streamlit_cmd.append(pipelines_dir) + venv = Venv.restore_current() - for line in iter_stdout( - venv, "streamlit", "run", streamlit_helper.__file__, pipeline_name - ): + for line in iter_stdout(venv, *streamlit_cmd): fmt.echo(line) if operation == "info": @@ -255,7 +277,12 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: tables = remove_defaults({"tables": package_info.schema_update}) # type: ignore fmt.echo(fmt.bold("Schema update:")) fmt.echo( - yaml.dump(tables, allow_unicode=True, default_flow_style=False, sort_keys=False) + yaml.dump( + tables, + allow_unicode=True, + default_flow_style=False, + sort_keys=False, + ) ) if operation == "schema": @@ -288,20 +315,33 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.echo( "About to drop the following data in dataset %s in destination %s:" - % (fmt.bold(drop.info["dataset_name"]), 
fmt.bold(p.destination.destination_name)) + % ( + fmt.bold(drop.info["dataset_name"]), + fmt.bold(p.destination.destination_name), + ) ) fmt.echo("%s: %s" % (fmt.style("Selected schema", fg="green"), drop.info["schema_name"])) fmt.echo( - "%s: %s" % (fmt.style("Selected resource(s)", fg="green"), drop.info["resource_names"]) + "%s: %s" + % ( + fmt.style("Selected resource(s)", fg="green"), + drop.info["resource_names"], + ) ) fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"])) fmt.echo( "%s: %s" - % (fmt.style("Resource(s) state to reset", fg="green"), drop.info["resource_states"]) + % ( + fmt.style("Resource(s) state to reset", fg="green"), + drop.info["resource_states"], + ) ) fmt.echo( "%s: %s" - % (fmt.style("Source state path(s) to reset", fg="green"), drop.info["state_paths"]) + % ( + fmt.style("Source state path(s) to reset", fg="green"), + drop.info["state_paths"], + ) ) # for k, v in drop.info.items(): # fmt.echo("%s: %s" % (fmt.style(k, fg="green"), v)) diff --git a/dlt/common/configuration/__init__.py b/dlt/common/configuration/__init__.py index b7d868ff8b..8de57f7799 100644 --- a/dlt/common/configuration/__init__.py +++ b/dlt/common/configuration/__init__.py @@ -1,7 +1,7 @@ from .specs.base_configuration import configspec, is_valid_hint, is_secret_hint, resolve_type from .specs import known_sections from .resolve import resolve_configuration, inject_section -from .inject import with_config, last_config, get_fun_spec +from .inject import with_config, last_config, get_fun_spec, create_resolved_partial from .exceptions import ( ConfigFieldMissingException, diff --git a/dlt/common/configuration/container.py b/dlt/common/configuration/container.py index ad20765489..441b0e21bc 100644 --- a/dlt/common/configuration/container.py +++ b/dlt/common/configuration/container.py @@ -1,7 +1,7 @@ -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext, AbstractContextManager import re import threading -from typing import ClassVar, Dict, Iterator, Tuple, Type, TypeVar +from typing import ClassVar, Dict, Iterator, Tuple, Type, TypeVar, Any from dlt.common.configuration.specs.base_configuration import ContainerInjectableContext from dlt.common.configuration.exceptions import ( @@ -34,6 +34,9 @@ class Container: thread_contexts: Dict[int, Dict[Type[ContainerInjectableContext], ContainerInjectableContext]] """A thread aware mapping of injection context """ + _context_container_locks: Dict[str, threading.Lock] + """Locks for container types on threads.""" + main_context: Dict[Type[ContainerInjectableContext], ContainerInjectableContext] """Injection context for the main thread""" @@ -41,6 +44,7 @@ def __new__(cls: Type["Container"]) -> "Container": if not cls._INSTANCE: cls._INSTANCE = super().__new__(cls) cls._INSTANCE.thread_contexts = {} + cls._INSTANCE._context_container_locks = {} cls._INSTANCE.main_context = cls._INSTANCE.thread_contexts[ Container._MAIN_THREAD_ID ] = {} @@ -84,22 +88,22 @@ def _thread_context( self, spec: Type[TConfiguration] ) -> Dict[Type[ContainerInjectableContext], ContainerInjectableContext]: if spec.global_affinity: - context = self.main_context + return self.main_context else: # thread pool names used in dlt contain originating thread id. 
use this id over pool id if m := re.match(r"dlt-pool-(\d+)-", threading.currentThread().getName()): thread_id = int(m.group(1)) else: thread_id = threading.get_ident() + # return main context for main thread if thread_id == Container._MAIN_THREAD_ID: return self.main_context # we may add a new empty thread context so lock here with Container._LOCK: - context = self.thread_contexts.get(thread_id) - if context is None: + if (context := self.thread_contexts.get(thread_id)) is None: context = self.thread_contexts[thread_id] = {} - return context + return context def _thread_getitem( self, spec: Type[TConfiguration] @@ -127,29 +131,44 @@ def _thread_delitem( del context[spec] @contextmanager - def injectable_context(self, config: TConfiguration) -> Iterator[TConfiguration]: + def injectable_context( + self, config: TConfiguration, lock_context: bool = False + ) -> Iterator[TConfiguration]: """A context manager that will insert `config` into the container and restore the previous value when it gets out of scope.""" + config.resolve() spec = type(config) previous_config: ContainerInjectableContext = None - context, previous_config = self._thread_getitem(spec) - - # set new config and yield context - self._thread_setitem(context, spec, config) - try: - yield config - finally: - # before setting the previous config for given spec, check if there was no overlapping modification - context, current_config = self._thread_getitem(spec) - if current_config is config: - # config is injected for spec so restore previous - if previous_config is None: - self._thread_delitem(context, spec) + context = self._thread_context(spec) + lock: AbstractContextManager[Any] + + # if there is a lock_id, we need a lock for the lock_id in the scope of the current context + if lock_context: + lock_key = f"{id(context)}" + if (lock := self._context_container_locks.get(lock_key)) is None: + with Container._LOCK: + self._context_container_locks[lock_key] = lock = threading.Lock() + else: + lock = nullcontext() + + with lock: + # remember context and set item + previous_config = context.get(spec) + self._thread_setitem(context, spec, config) + try: + yield config + finally: + # before setting the previous config for given spec, check if there was no overlapping modification + context, current_config = self._thread_getitem(spec) + if current_config is config: + # config is injected for spec so restore previous + if previous_config is None: + self._thread_delitem(context, spec) + else: + self._thread_setitem(context, spec, previous_config) else: - self._thread_setitem(context, spec, previous_config) - else: - # value was modified in the meantime and not restored - raise ContainerInjectableContextMangled(spec, context[spec], config) + # value was modified in the meantime and not restored + raise ContainerInjectableContextMangled(spec, context[spec], config) @staticmethod def thread_pool_prefix() -> str: diff --git a/dlt/common/configuration/inject.py b/dlt/common/configuration/inject.py index a22f299ae8..6699826ec8 100644 --- a/dlt/common/configuration/inject.py +++ b/dlt/common/configuration/inject.py @@ -1,12 +1,15 @@ import inspect + from functools import wraps -from typing import Callable, Dict, Type, Any, Optional, Tuple, TypeVar, overload +from typing import Callable, Dict, Type, Any, Optional, Tuple, TypeVar, overload, cast from inspect import Signature, Parameter +from contextlib import nullcontext from dlt.common.typing import DictStrAny, StrAny, TFun, AnyFun from dlt.common.configuration.resolve import 
resolve_configuration, inject_section from dlt.common.configuration.specs.base_configuration import BaseConfiguration from dlt.common.configuration.specs.config_section_context import ConfigSectionContext + from dlt.common.reflection.spec import spec_from_signature @@ -32,6 +35,9 @@ def with_config( auto_pipeline_section: bool = False, include_defaults: bool = True, accept_partial: bool = False, + initial_config: BaseConfiguration = None, + base: Type[BaseConfiguration] = BaseConfiguration, + lock_context_on_injection: bool = True, ) -> TFun: ... @@ -45,6 +51,9 @@ def with_config( auto_pipeline_section: bool = False, include_defaults: bool = True, accept_partial: bool = False, + initial_config: Optional[BaseConfiguration] = None, + base: Type[BaseConfiguration] = BaseConfiguration, + lock_context_on_injection: bool = True, ) -> Callable[[TFun], TFun]: ... @@ -58,6 +67,8 @@ def with_config( include_defaults: bool = True, accept_partial: bool = False, initial_config: Optional[BaseConfiguration] = None, + base: Type[BaseConfiguration] = BaseConfiguration, + lock_context_on_injection: bool = True, ) -> Callable[[TFun], TFun]: """Injects values into decorated function arguments following the specification in `spec` or by deriving one from function's signature. @@ -71,10 +82,12 @@ def with_config( prefer_existing_sections: (bool, optional): When joining existing section context, the existing context will be preferred to the one in `sections`. Default: False auto_pipeline_section (bool, optional): If True, a top level pipeline section will be added if `pipeline_name` argument is present . Defaults to False. include_defaults (bool, optional): If True then arguments with default values will be included in synthesized spec. If False only the required arguments marked with `dlt.secrets.value` and `dlt.config.value` are included - + base (Type[BaseConfiguration], optional): A base class for synthesized spec. Defaults to BaseConfiguration. + lock_context_on_injection (bool, optional): If True, the thread context will be locked during injection to prevent race conditions. Defaults to True. 
Returns: Callable[[TFun], TFun]: A decorated function """ + section_f: Callable[[StrAny], str] = None # section may be a function from function arguments to section if callable(sections): @@ -83,20 +96,25 @@ def with_config( def decorator(f: TFun) -> TFun: SPEC: Type[BaseConfiguration] = None sig: Signature = inspect.signature(f) + signature_fields: Dict[str, Any] kwargs_arg = next( (p for p in sig.parameters.values() if p.kind == Parameter.VAR_KEYWORD), None ) - spec_arg: Parameter = None - pipeline_name_arg: Parameter = None - if spec is None: - SPEC = spec_from_signature(f, sig, include_defaults) + SPEC, signature_fields = spec_from_signature(f, sig, include_defaults, base=base) else: SPEC = spec + signature_fields = SPEC.get_resolvable_fields() - if SPEC is None: + # if no signature fields were added we will not wrap `f` for injection + if len(signature_fields) == 0: + # always register new function + _FUNC_SPECS[id(f)] = SPEC return f + spec_arg: Parameter = None + pipeline_name_arg: Parameter = None + for p in sig.parameters.values(): # for all positional parameters that do not have default value, set default # if hasattr(SPEC, p.name) and p.default == Parameter.empty: @@ -109,49 +127,52 @@ def decorator(f: TFun) -> TFun: pipeline_name_arg = p pipeline_name_arg_default = None if p.default == Parameter.empty else p.default - @wraps(f) - def _wrap(*args: Any, **kwargs: Any) -> Any: + def resolve_config(bound_args: inspect.BoundArguments) -> BaseConfiguration: + """Resolve arguments using the provided spec""" # bind parameters to signature - bound_args = sig.bind(*args, **kwargs) # for calls containing resolved spec in the kwargs, we do not need to resolve again config: BaseConfiguration = None - if _LAST_DLT_CONFIG in kwargs: - config = last_config(**kwargs) + + # if section derivation function was provided then call it + if section_f: + curr_sections: Tuple[str, ...] = (section_f(bound_args.arguments),) + # sections may be a string + elif isinstance(sections, str): + curr_sections = (sections,) else: - # if section derivation function was provided then call it - if section_f: - curr_sections: Tuple[str, ...] 
= (section_f(bound_args.arguments),) - # sections may be a string - elif isinstance(sections, str): - curr_sections = (sections,) - else: - curr_sections = sections - - # if one of arguments is spec the use it as initial value - if initial_config: - config = initial_config - elif spec_arg: - config = bound_args.arguments.get(spec_arg.name, None) - # resolve SPEC, also provide section_context with pipeline_name - if pipeline_name_arg: - curr_pipeline_name = bound_args.arguments.get( - pipeline_name_arg.name, pipeline_name_arg_default - ) - else: - curr_pipeline_name = None - section_context = ConfigSectionContext( - pipeline_name=curr_pipeline_name, - sections=curr_sections, - merge_style=sections_merge_style, + curr_sections = sections + + # if one of arguments is spec the use it as initial value + if initial_config: + config = initial_config + elif spec_arg: + config = bound_args.arguments.get(spec_arg.name, None) + # resolve SPEC, also provide section_context with pipeline_name + if pipeline_name_arg: + curr_pipeline_name = bound_args.arguments.get( + pipeline_name_arg.name, pipeline_name_arg_default ) - # this may be called from many threads so section_context is thread affine - with inject_section(section_context): - # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") - config = resolve_configuration( - config or SPEC(), - explicit_value=bound_args.arguments, - accept_partial=accept_partial, - ) + else: + curr_pipeline_name = None + section_context = ConfigSectionContext( + pipeline_name=curr_pipeline_name, + sections=curr_sections, + merge_style=sections_merge_style, + ) + + # this may be called from many threads so section_context is thread affine + with inject_section(section_context, lock_context=lock_context_on_injection): + # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") + return resolve_configuration( + config or SPEC(), + explicit_value=bound_args.arguments, + accept_partial=accept_partial, + ) + + def update_bound_args( + bound_args: inspect.BoundArguments, config: BaseConfiguration, args: Any, kwargs: Any + ) -> None: + # overwrite or add resolved params resolved_params = dict(config) # overwrite or add resolved params for p in sig.parameters.values(): @@ -167,12 +188,56 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: bound_args.arguments[kwargs_arg.name].update(resolved_params) bound_args.arguments[kwargs_arg.name][_LAST_DLT_CONFIG] = config bound_args.arguments[kwargs_arg.name][_ORIGINAL_ARGS] = (args, kwargs) + + def with_partially_resolved_config(config: Optional[BaseConfiguration] = None) -> Any: + # creates a pre-resolved partial of the decorated function + empty_bound_args = sig.bind_partial() + if not config: + config = resolve_config(empty_bound_args) + + def wrapped(*args: Any, **kwargs: Any) -> Any: + nonlocal config + + # Do we need an exception here? + if spec_arg and spec_arg.name in kwargs: + from dlt.common import logger + + logger.warning( + "Spec argument is provided in kwargs, ignoring it for resolved partial" + " function." 
+ ) + + # we can still overwrite the config + if _LAST_DLT_CONFIG in kwargs: + config = last_config(**kwargs) + + # call the function with the pre-resolved config + bound_args = sig.bind(*args, **kwargs) + update_bound_args(bound_args, config, args, kwargs) + return f(*bound_args.args, **bound_args.kwargs) + + return wrapped + + @wraps(f) + def _wrap(*args: Any, **kwargs: Any) -> Any: + # Resolve config + config: BaseConfiguration = None + bound_args = sig.bind(*args, **kwargs) + if _LAST_DLT_CONFIG in kwargs: + config = last_config(**kwargs) + else: + config = resolve_config(bound_args) + # call the function with resolved config + update_bound_args(bound_args, config, args, kwargs) return f(*bound_args.args, **bound_args.kwargs) # register the spec for a wrapped function _FUNC_SPECS[id(_wrap)] = SPEC + # add a method to create a pre-resolved partial + setattr(_wrap, "__RESOLVED_PARTIAL_FUNC__", with_partially_resolved_config) # noqa: B010 + return _wrap # type: ignore # See if we're being called as @with_config or @with_config(). @@ -197,3 +262,10 @@ def last_config(**kwargs: Any) -> Any: def get_orig_args(**kwargs: Any) -> Tuple[Tuple[Any], DictStrAny]: return kwargs[_ORIGINAL_ARGS] # type: ignore + + +def create_resolved_partial(f: AnyFun, config: Optional[BaseConfiguration] = None) -> AnyFun: + """Create a pre-resolved partial of the with_config decorated function""" + if partial_func := getattr(f, "__RESOLVED_PARTIAL_FUNC__", None): + return cast(AnyFun, partial_func(config)) + return f diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index db69cd9572..b398f0463a 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -92,13 +92,14 @@ def initialize_credentials(hint: Any, initial_value: Any) -> CredentialsConfigur def inject_section( - section_context: ConfigSectionContext, merge_existing: bool = True + section_context: ConfigSectionContext, merge_existing: bool = True, lock_context: bool = False ) -> ContextManager[ConfigSectionContext]: """Context manager that sets section specified in `section_context` to be used during configuration resolution. Optionally merges the context already in the container with the one provided Args: section_context (ConfigSectionContext): Instance providing a pipeline name and section context merge_existing (bool, optional): Merges existing section context with `section_context` in the arguments by executing `merge_style` function on `section_context`. Defaults to True. + lock_context (bool, optional): Instruct to threadlock the current thread to prevent race conditions in context injection. Default Merge Style: Gets `pipeline_name` and `sections` from existing context if they are not provided in `section_context` argument. 
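A minimal sketch of how the pre-resolved partial added above could be used. The source function `fetch_rows`, its section name, and its config fields are hypothetical; only `with_config` and `create_resolved_partial` come from this diff, and the call assumes the required values are present in config/secrets.

```python
import dlt
from dlt.common.configuration import with_config, create_resolved_partial


@with_config(sections=("sources", "example"))
def fetch_rows(api_url: str = dlt.config.value, api_key: str = dlt.secrets.value) -> list:
    # api_url and api_key are injected from the config providers on each call
    return [{"url": api_url, "authorized": bool(api_key)}]


# resolve the configuration once and reuse the partial,
# skipping per-call config resolution on hot code paths
fetch_rows_resolved = create_resolved_partial(fetch_rows)
rows = fetch_rows_resolved()
```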
@@ -112,7 +113,7 @@ def inject_section( if merge_existing: section_context.merge(existing_context) - return container.injectable_context(section_context) + return container.injectable_context(section_context, lock_context=lock_context) def _maybe_parse_native_value( diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 84f59fa894..62abf42f27 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -2,6 +2,7 @@ import inspect import contextlib import dataclasses + from collections.abc import Mapping as C_Mapping from typing import ( Callable, diff --git a/dlt/common/data_types/type_helpers.py b/dlt/common/data_types/type_helpers.py index 659b4951df..61a0aa1dbf 100644 --- a/dlt/common/data_types/type_helpers.py +++ b/dlt/common/data_types/type_helpers.py @@ -7,7 +7,7 @@ from enum import Enum from dlt.common import pendulum, json, Decimal, Wei -from dlt.common.json import custom_pua_remove +from dlt.common.json import custom_pua_remove, json from dlt.common.json._simplejson import custom_encode as json_custom_encode from dlt.common.arithmetics import InvalidOperation from dlt.common.data_types.typing import TDataType @@ -105,6 +105,14 @@ def coerce_value(to_type: TDataType, from_type: TDataType, value: Any) -> Any: return int(value.value) return value + if to_type == "complex": + # try to coerce from text + if from_type == "text": + try: + return json.loads(value) + except Exception: + raise ValueError(value) + if to_type == "text": if from_type == "complex": return complex_to_str(value) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index a78a31fdf3..36a9cc3b6e 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -19,7 +19,7 @@ ] ALL_SUPPORTED_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) # file formats used internally by dlt -INTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = {"puae-jsonl", "sql", "reference", "arrow"} +INTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = {"sql", "reference", "arrow"} # file formats that may be chosen by the user EXTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = ( set(get_args(TLoaderFileFormat)) - INTERNAL_LOADER_FILE_FORMATS @@ -55,6 +55,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): insert_values_writer_type: str = "default" supports_multiple_statements: bool = True supports_clone_table: bool = False + max_table_nesting: Optional[int] = None # destination can overwrite max table nesting """Destination supports CREATE TABLE ... CLONE ... 
statements""" # do not allow to create default value, destination caps must be always explicitly inserted into container diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 5e698347e5..258efd80be 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -260,6 +260,27 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]: return [] +class DoNothingJob(LoadJob): + """The most lazy class of dlt""" + + def __init__(self, file_path: str) -> None: + super().__init__(FileStorage.get_file_name_from_file_path(file_path)) + + def state(self) -> TLoadJobState: + # this job is always done + return "completed" + + def exception(self) -> str: + # this part of code should be never reached + raise NotImplementedError() + + +class DoNothingFollowupJob(DoNothingJob, FollowupJob): + """The second most lazy class of dlt""" + + pass + + class JobClientBase(ABC): capabilities: ClassVar[DestinationCapabilitiesContext] = None diff --git a/dlt/common/libs/pandas.py b/dlt/common/libs/pandas.py index 93e6b764bc..7a94dcf6e2 100644 --- a/dlt/common/libs/pandas.py +++ b/dlt/common/libs/pandas.py @@ -1,7 +1,14 @@ +from typing import Any from dlt.common.exceptions import MissingDependencyException try: import pandas - from pandas.io.sql import _wrap_result except ModuleNotFoundError: raise MissingDependencyException("DLT Pandas Helpers", ["pandas"]) + + +def pandas_to_arrow(df: pandas.DataFrame) -> Any: + """Converts pandas to arrow or raises an exception if pyarrow is not installed""" + from dlt.common.libs.pyarrow import pyarrow as pa + + return pa.Table.from_pandas(df) diff --git a/dlt/common/libs/pandas_sql.py b/dlt/common/libs/pandas_sql.py new file mode 100644 index 0000000000..e9e2a7da11 --- /dev/null +++ b/dlt/common/libs/pandas_sql.py @@ -0,0 +1,7 @@ +from dlt.common.exceptions import MissingDependencyException + + +try: + from pandas.io.sql import _wrap_result +except ModuleNotFoundError: + raise MissingDependencyException("dlt pandas helper for sql", ["pandas"]) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 183c27954b..c1fbfbff85 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -18,7 +18,9 @@ import pyarrow.compute except ModuleNotFoundError: raise MissingDependencyException( - "dlt parquet Helpers", [f"{version.DLT_PKG_NAME}[parquet]"], "dlt Helpers for for parquet." 
+ "dlt pyarrow helpers", + [f"{version.DLT_PKG_NAME}[parquet]"], + "Install pyarrow to be allow to load arrow tables, panda frames and to use parquet files.", ) diff --git a/dlt/common/normalizers/configuration.py b/dlt/common/normalizers/configuration.py index 6957417f9d..adeefe2237 100644 --- a/dlt/common/normalizers/configuration.py +++ b/dlt/common/normalizers/configuration.py @@ -5,7 +5,7 @@ from dlt.common.configuration.specs import BaseConfiguration from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.normalizers.typing import TJSONNormalizer -from dlt.common.typing import StrAny +from dlt.common.typing import DictStrAny @configspec @@ -14,7 +14,7 @@ class NormalizersConfiguration(BaseConfiguration): __section__: str = "schema" naming: Optional[str] = None - json_normalizer: Optional[StrAny] = None + json_normalizer: Optional[DictStrAny] = None destination_capabilities: Optional[DestinationCapabilitiesContext] = None # injectable def on_resolved(self) -> None: @@ -22,6 +22,16 @@ def on_resolved(self) -> None: if self.naming is None: if self.destination_capabilities: self.naming = self.destination_capabilities.naming_convention + # if max_table_nesting is set, we need to set the max_table_nesting in the json_normalizer + if ( + self.destination_capabilities + and self.destination_capabilities.max_table_nesting is not None + ): + self.json_normalizer = self.json_normalizer or {} + self.json_normalizer.setdefault("config", {}) + self.json_normalizer["config"][ + "max_nesting" + ] = self.destination_capabilities.max_table_nesting if TYPE_CHECKING: diff --git a/dlt/common/normalizers/utils.py b/dlt/common/normalizers/utils.py index dde78edede..645bad2bea 100644 --- a/dlt/common/normalizers/utils.py +++ b/dlt/common/normalizers/utils.py @@ -34,9 +34,11 @@ def import_normalizers( """ # add defaults to normalizer_config normalizers_config["names"] = names = normalizers_config["names"] or "snake_case" - normalizers_config["json"] = item_normalizer = normalizers_config["json"] or { - "module": "dlt.common.normalizers.json.relational" - } + # set default json normalizer module + normalizers_config["json"] = item_normalizer = normalizers_config.get("json") or {} + if "module" not in item_normalizer: + item_normalizer["module"] = "dlt.common.normalizers.json.relational" + try: if "." in names: # TODO: bump schema engine version and migrate schema. 
also change the name in TNormalizersConfig from names to naming diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index df221ec703..3cbaafefbe 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -3,6 +3,7 @@ import datetime # noqa: 251 import humanize import contextlib + from typing import ( Any, Callable, @@ -40,11 +41,15 @@ from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo +from dlt.common.storages.load_package import PackageStorage + from dlt.common.time import ensure_pendulum_datetime, precise_time from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize from dlt.common.jsonpath import delete_matches, TAnyJsonPath from dlt.common.data_writers.writers import DataWriterMetrics, TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts +from dlt.common.versioned_state import TVersionedState +from dlt.common.storages.load_package import TLoadPackageState class _StepInfo(NamedTuple): @@ -454,7 +459,7 @@ class TPipelineLocalState(TypedDict, total=False): """Hash of state that was recently synced with destination""" -class TPipelineState(TypedDict, total=False): +class TPipelineState(TVersionedState, total=False): """Schema for a pipeline state that is stored within the pipeline working directory""" pipeline_name: str @@ -469,9 +474,6 @@ class TPipelineState(TypedDict, total=False): staging_type: Optional[str] # properties starting with _ are not automatically applied to pipeline object when state is restored - _state_version: int - _version_hash: str - _state_engine_version: int _local: TPipelineLocalState """A section of state that is not synchronized with the destination and does not participate in change merging and version control""" diff --git a/dlt/common/reflection/spec.py b/dlt/common/reflection/spec.py index 0a486088c8..5c39199f63 100644 --- a/dlt/common/reflection/spec.py +++ b/dlt/common/reflection/spec.py @@ -1,6 +1,6 @@ import re import inspect -from typing import Dict, List, Type, Any, Optional, NewType +from typing import Dict, List, Tuple, Type, Any, Optional, NewType from inspect import Signature, Parameter from dlt.common.typing import AnyType, AnyFun, TSecretValue @@ -26,15 +26,31 @@ def _first_up(s: str) -> str: def spec_from_signature( - f: AnyFun, sig: Signature, include_defaults: bool = True -) -> Type[BaseConfiguration]: + f: AnyFun, + sig: Signature, + include_defaults: bool = True, + base: Type[BaseConfiguration] = BaseConfiguration, +) -> Tuple[Type[BaseConfiguration], Dict[str, Any]]: + """Creates a SPEC on base `base` for a function `f` with signature `sig`. + + All the arguments in `sig` that are valid SPEC hints and have defaults will be part of the SPEC. + Special markers for required SPEC fields `dlt.secrets.value` and `dlt.config.value` are parsed using + module source code, which is a hack and will not work for modules not imported from a file. + + The name of a SPEC type is inferred from the qualname of `f`; the type refers to the module of `f` and is unique + within that module. NOTE: the SPECs are cached in the module by using the name as an id. + + The return value is a tuple of the SPEC and the SPEC fields created from `sig`.
+ """ name = _get_spec_name_from_f(f) module = inspect.getmodule(f) + base_fields = base.get_resolvable_fields() # check if spec for that function exists spec_id = name # f"SPEC_{name}_kw_only_{kw_only}" if hasattr(module, spec_id): - return getattr(module, spec_id) # type: ignore + MOD_SPEC: Type[BaseConfiguration] = getattr(module, spec_id) + return MOD_SPEC, MOD_SPEC.get_resolvable_fields() # find all the arguments that have following defaults literal_defaults: Dict[str, str] = None @@ -59,7 +75,8 @@ def dlt_config_literal_to_type(arg_name: str) -> AnyType: return None # synthesize configuration from the signature - fields: Dict[str, Any] = {} + new_fields: Dict[str, Any] = {} + sig_base_fields: Dict[str, Any] = {} annotations: Dict[str, Any] = {} for p in sig.parameters.values(): @@ -69,6 +86,10 @@ def dlt_config_literal_to_type(arg_name: str) -> AnyType: "cls", ]: field_type = AnyType if p.annotation == Parameter.empty else p.annotation + # keep the base fields if sig not annotated + if p.name in base_fields and field_type is AnyType and p.default is None: + sig_base_fields[p.name] = base_fields[p.name] + continue # only valid hints and parameters with defaults are eligible if is_valid_hint(field_type) and p.default != Parameter.empty: # try to get type from default @@ -99,18 +120,17 @@ def dlt_config_literal_to_type(arg_name: str) -> AnyType: # set annotations annotations[p.name] = field_type # set field with default value - fields[p.name] = p.default + new_fields[p.name] = p.default - if not fields: - return None + signature_fields = {**sig_base_fields, **new_fields} # new type goes to the module where sig was declared - fields["__module__"] = module.__name__ + new_fields["__module__"] = module.__name__ # set annotations so they are present in __dict__ - fields["__annotations__"] = annotations + new_fields["__annotations__"] = annotations # synthesize type - T: Type[BaseConfiguration] = type(name, (BaseConfiguration,), fields) + T: Type[BaseConfiguration] = type(name, (base,), new_fields) SPEC = configspec()(T) # add to the module setattr(module, spec_id, SPEC) - return SPEC + return SPEC, signature_fields diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 7f73bcbf36..96341ab8b4 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -98,7 +98,9 @@ def __init__( self.to_engine = to_engine super().__init__( f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}," - f" stopped at {from_engine}" + f" stopped at {from_engine}. You possibly tried to run an older dlt" + " version against a destination you have previously loaded data to with a newer dlt" + " version." 
) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 4c81c8af72..92598fff44 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -82,7 +82,9 @@ class Schema: _imported_version_hash: str # version hash of recently imported schema _schema_description: str # optional schema description _schema_tables: TSchemaTables - _settings: TSchemaSettings # schema settings to hold default hints, preferred types and other settings + _settings: ( + TSchemaSettings # schema settings to hold default hints, preferred types and other settings + ) # list of preferred types: map regex on columns into types _compiled_preferred_types: List[Tuple[REPattern, TDataType]] @@ -551,14 +553,20 @@ def get_table_columns( if utils.is_complete_column(v) } - def data_tables(self, include_incomplete: bool = False) -> List[TTableSchema]: + def data_tables( + self, seen_data_only: bool = False, include_incomplete: bool = False + ) -> List[TTableSchema]: """Gets list of all tables, that hold the loaded data. Excludes dlt tables. Excludes incomplete tables (ie. without columns)""" return [ t for t in self._schema_tables.values() if not t["name"].startswith(self._dlt_tables_prefix) and ( - include_incomplete or len(self.get_table_columns(t["name"], include_incomplete)) > 0 + ( + include_incomplete + or len(self.get_table_columns(t["name"], include_incomplete)) > 0 + ) + and (not seen_data_only or utils.has_table_seen_data(t)) ) ] diff --git a/dlt/common/storages/exceptions.py b/dlt/common/storages/exceptions.py index 22d6dfaf79..f4288719c1 100644 --- a/dlt/common/storages/exceptions.py +++ b/dlt/common/storages/exceptions.py @@ -116,3 +116,11 @@ def __init__(self, schema_name: str, storage_path: str, stored_name: str) -> Non f"A schema file name '{schema_name}' in {storage_path} does not correspond to the name" f" of schema in the file {stored_name}" ) + + +class CurrentLoadPackageStateNotAvailable(StorageException): + def __init__(self) -> None: + super().__init__( + "State of the current load package is not available. Current load package state is" + " only available in a function decorated with @dlt.destination during loading." 
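A minimal sketch of how a custom destination might use the load package state helpers introduced in `load_package.py` below. The sink name and the stored keys are hypothetical, and the exact `dlt.destination` decorator signature is not shown in this diff; only `destination_state` and `commit_load_package_state` come from the added code, and they are only usable while a package is being loaded.

```python
import dlt
from dlt.common.storages.load_package import destination_state, commit_load_package_state


@dlt.destination
def my_sink(items, table) -> None:
    # destination_state() returns the private "destination_state" section
    # of the current load package state (raises outside of a load)
    state = destination_state()
    tables_seen = state.setdefault("tables_seen", [])
    if table["name"] not in tables_seen:
        tables_seen.append(table["name"])
    # persist the mutated state back into the load package folder
    commit_load_package_state()
```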
+ ) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 63409aa878..bb66e28671 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -1,6 +1,8 @@ import contextlib import os from copy import deepcopy +import threading + import datetime # noqa: 251 import humanize from pathlib import Path @@ -17,23 +19,92 @@ Set, get_args, cast, + Any, + Tuple, + TYPE_CHECKING, + TypedDict, ) from dlt.common import pendulum, json + +from dlt.common.configuration import configspec +from dlt.common.configuration.specs import ContainerInjectableContext +from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated +from dlt.common.configuration.container import Container + from dlt.common.data_writers import DataWriter, new_file_id from dlt.common.destination import TLoaderFileFormat from dlt.common.exceptions import TerminalValueError from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns from dlt.common.storages import FileStorage -from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import DictStrAny, StrAny, SupportsHumanize +from dlt.common.storages.exceptions import LoadPackageNotFound, CurrentLoadPackageStateNotAvailable +from dlt.common.typing import DictStrAny, SupportsHumanize from dlt.common.utils import flatten_list_or_items +from dlt.common.versioned_state import ( + generate_state_version_hash, + bump_state_version_if_modified, + TVersionedState, + default_versioned_state, +) +from typing_extensions import NotRequired + + +class TLoadPackageState(TVersionedState, total=False): + created_at: str + """Timestamp when the loadpackage was created""" + + """A section of state that does not participate in change merging and version control""" + destination_state: NotRequired[Dict[str, Any]] + """private space for destinations to store state relevant only to the load package""" + + +class TLoadPackage(TypedDict, total=False): + load_id: str + """Load id""" + state: TLoadPackageState + """State of the load package""" + + +# allows to upgrade state when restored with a new version of state logic/schema +LOADPACKAGE_STATE_ENGINE_VERSION = 1 + + +def generate_loadpackage_state_version_hash(state: TLoadPackageState) -> str: + return generate_state_version_hash(state) + + +def bump_loadpackage_state_version_if_modified(state: TLoadPackageState) -> Tuple[int, str, str]: + return bump_state_version_if_modified(state) + + +def migrate_load_package_state( + state: DictStrAny, from_engine: int, to_engine: int +) -> TLoadPackageState: + # TODO: if you start adding new versions, we need proper tests for these migrations! 
+ # NOTE: do not touch destinations state, it is not versioned + if from_engine == to_engine: + return cast(TLoadPackageState, state) + + # check state engine + if from_engine != to_engine: + raise Exception("No upgrade path for loadpackage state") + + state["_state_engine_version"] = from_engine + return cast(TLoadPackageState, state) + + +def default_load_package_state() -> TLoadPackageState: + return { + **default_versioned_state(), + "_state_engine_version": LOADPACKAGE_STATE_ENGINE_VERSION, + } + # folders to manage load jobs in a single load package TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) -TLoadPackageState = Literal["new", "extracted", "normalized", "loaded", "aborted"] +TLoadPackageStatus = Literal["new", "extracted", "normalized", "loaded", "aborted"] class ParsedLoadJobFileName(NamedTuple): @@ -125,7 +196,7 @@ def __str__(self) -> str: class _LoadPackageInfo(NamedTuple): load_id: str package_path: str - state: TLoadPackageState + state: TLoadPackageStatus schema: Schema schema_update: TSchemaTables completed_at: datetime.datetime @@ -201,8 +272,11 @@ class PackageStorage: PACKAGE_COMPLETED_FILE_NAME = ( # completed package marker file, currently only to store data with os.stat "package_completed.json" ) + LOAD_PACKAGE_STATE_FILE_NAME = ( # internal state of the load package, will not be synced to the destination + "load_package_state.json" + ) - def __init__(self, storage: FileStorage, initial_state: TLoadPackageState) -> None: + def __init__(self, storage: FileStorage, initial_state: TLoadPackageStatus) -> None: """Creates storage that manages load packages with root at `storage` and initial package state `initial_state`""" self.storage = storage self.initial_state = initial_state @@ -334,8 +408,13 @@ def create_package(self, load_id: str) -> None: self.storage.create_folder(os.path.join(load_id, PackageStorage.COMPLETED_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.FAILED_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.STARTED_JOBS_FOLDER)) + # ensure created timestamp is set in state when load package is created + state = self.get_load_package_state(load_id) + if not state.get("created_at"): + state["created_at"] = pendulum.now().to_iso8601_string() + self.save_load_package_state(load_id, state) - def complete_loading_package(self, load_id: str, load_state: TLoadPackageState) -> str: + def complete_loading_package(self, load_id: str, load_state: TLoadPackageStatus) -> str: """Completes loading the package by writing marker file with`package_state. 
Returns path to the completed package""" load_path = self.get_package_path(load_id) # save marker file @@ -381,6 +460,36 @@ def save_schema_updates(self, load_id: str, schema_update: TSchemaTables) -> Non ) as f: json.dump(schema_update, f) + # + # Loadpackage state + # + def get_load_package_state(self, load_id: str) -> TLoadPackageState: + package_path = self.get_package_path(load_id) + if not self.storage.has_folder(package_path): + raise LoadPackageNotFound(load_id) + try: + state_dump = self.storage.load(self.get_load_package_state_path(load_id)) + state = json.loads(state_dump) + return migrate_load_package_state( + state, state["_state_engine_version"], LOADPACKAGE_STATE_ENGINE_VERSION + ) + except FileNotFoundError: + return default_load_package_state() + + def save_load_package_state(self, load_id: str, state: TLoadPackageState) -> None: + package_path = self.get_package_path(load_id) + if not self.storage.has_folder(package_path): + raise LoadPackageNotFound(load_id) + bump_loadpackage_state_version_if_modified(state) + self.storage.save( + self.get_load_package_state_path(load_id), + json.dumps(state), + ) + + def get_load_package_state_path(self, load_id: str) -> str: + package_path = self.get_package_path(load_id) + return os.path.join(package_path, PackageStorage.LOAD_PACKAGE_STATE_FILE_NAME) + # # Get package info # @@ -514,3 +623,59 @@ def filter_jobs_for_table( all_jobs: Iterable[LoadJobInfo], table_name: str ) -> Sequence[LoadJobInfo]: return [job for job in all_jobs if job.job_file_info.table_name == table_name] + + +@configspec +class LoadPackageStateInjectableContext(ContainerInjectableContext): + storage: PackageStorage + load_id: str + can_create_default: ClassVar[bool] = False + global_affinity: ClassVar[bool] = False + + def commit(self) -> None: + with self.state_save_lock: + self.storage.save_load_package_state(self.load_id, self.state) + + def on_resolved(self) -> None: + self.state_save_lock = threading.Lock() + self.state = self.storage.get_load_package_state(self.load_id) + + if TYPE_CHECKING: + + def __init__(self, load_id: str, storage: PackageStorage) -> None: ... + + +def load_package() -> TLoadPackage: + """Get full load package state present in current context. Across all threads this will be the same in memory dict.""" + container = Container() + # get injected state if present. injected load package state is typically "managed" so changes will be persisted + # if you need to save the load package state during a load, you need to call commit_load_package_state + try: + state_ctx = container[LoadPackageStateInjectableContext] + except ContextDefaultCannotBeCreated: + raise CurrentLoadPackageStateNotAvailable() + return TLoadPackage(state=state_ctx.state, load_id=state_ctx.load_id) + + +def commit_load_package_state() -> None: + """Commit load package state present in current context. This is thread safe.""" + container = Container() + try: + state_ctx = container[LoadPackageStateInjectableContext] + except ContextDefaultCannotBeCreated: + raise CurrentLoadPackageStateNotAvailable() + state_ctx.commit() + + +def destination_state() -> DictStrAny: + """Get segment of load package state that is specific to the current destination.""" + lp = load_package() + return lp["state"].setdefault("destination_state", {}) + + +def clear_destination_state(commit: bool = True) -> None: + """Clear segment of load package state that is specific to the current destination. 
Optionally commit to load package.""" + lp = load_package() + lp["state"].pop("destination_state", None) + if commit: + commit_load_package_state() diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index a83502cb9b..ffd55e7f29 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -1,6 +1,7 @@ from os.path import join from typing import Iterable, Optional, Sequence +from dlt.common.typing import DictStrAny from dlt.common import json from dlt.common.configuration import known_sections from dlt.common.configuration.inject import with_config @@ -18,6 +19,7 @@ PackageStorage, ParsedLoadJobFileName, TJobState, + TLoadPackageState, ) from dlt.common.storages.exceptions import JobWithUnsupportedWriterException, LoadPackageNotFound @@ -38,6 +40,11 @@ def __init__( supported_file_formats: Iterable[TLoaderFileFormat], config: LoadStorageConfiguration = config.value, ) -> None: + # puae-jsonl jobs have the extension .jsonl, so cater for this here + if supported_file_formats and "puae-jsonl" in supported_file_formats: + supported_file_formats = list(supported_file_formats) + supported_file_formats.append("jsonl") + if not LoadStorage.ALL_SUPPORTED_FILE_FORMATS.issuperset(supported_file_formats): raise TerminalValueError(supported_file_formats) if preferred_file_format and preferred_file_format not in supported_file_formats: @@ -79,7 +86,7 @@ def _get_data_item_path_template(self, load_id: str, _: str, table_name: str) -> def list_new_jobs(self, load_id: str) -> Sequence[str]: """Lists all jobs in new jobs folder of normalized package storage and checks if file formats are supported""" new_jobs = self.normalized_packages.list_new_jobs(load_id) - # # make sure all jobs have supported writers + # make sure all jobs have supported writers wrong_job = next( ( j @@ -184,3 +191,10 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: return self.loaded_packages.get_load_package_info(load_id) except LoadPackageNotFound: return self.normalized_packages.get_load_package_info(load_id) + + def get_load_package_state(self, load_id: str) -> TLoadPackageState: + """Gets state of normlized or loaded package with given load_id, all jobs and their statuses.""" + try: + return self.loaded_packages.get_load_package_state(load_id) + except LoadPackageNotFound: + return self.normalized_packages.get_load_package_state(load_id) diff --git a/dlt/common/storages/normalize_storage.py b/dlt/common/storages/normalize_storage.py index 8a247c2021..2b90b7c088 100644 --- a/dlt/common/storages/normalize_storage.py +++ b/dlt/common/storages/normalize_storage.py @@ -51,7 +51,9 @@ def list_files_to_normalize_sorted(self) -> Sequence[str]: [ file for file in files - if not file.endswith(PackageStorage.SCHEMA_FILE_NAME) and os.path.isfile(file) + if not file.endswith(PackageStorage.SCHEMA_FILE_NAME) + and os.path.isfile(file) + and not file.endswith(PackageStorage.LOAD_PACKAGE_STATE_FILE_NAME) ] ) diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 6bf1356aeb..4b54d6a29e 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -1,5 +1,6 @@ import contextlib import functools +import inspect from typing import Callable, Any, Type from typing_extensions import get_type_hints, get_args @@ -38,11 +39,10 @@ def validate_dict( filter_f (TFilterFunc, optional): A function to filter keys in `doc`. It should return `True` for keys to be kept. Defaults to a function that keeps all keys. 
validator_f (TCustomValidator, optional): A function to perform additional validation - for types not covered by this function. It should return `True` if the validation passes. + for types not covered by this function. It should return `True` if the validation passes + or raise DictValidationException on validation error. For types it cannot validate, it + should return False to allow chaining. Defaults to a function that rejects all such types. - filter_required (TFilterFunc, optional): A function to filter out required fields, useful - for testing historic versions of dict that might now have certain fields yet. - Raises: DictValidationException: If there are missing required fields, unexpected fields, type mismatches or unvalidated types in `doc` compared to `spec`. @@ -162,8 +162,23 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: elif t is Any: # pass everything with any type pass + elif inspect.isclass(t) and isinstance(pv, t): + # allow instances of classes + pass else: + type_name = getattr(t, "__name__", str(t)) + pv_type_name = getattr(type(pv), "__name__", str(type(pv))) + # try to apply special validator if not validator_f(path, pk, pv, t): + # type `t` cannot be validated by validator_f + if inspect.isclass(t): + if not isinstance(pv, t): + raise DictValidationException( + f"In {path}: field {pk} expect class {type_name} but got instance of" + f" {pv_type_name}", + path, + pk, + ) # TODO: when Python 3.9 and earlier support is # dropped, just __name__ can be used type_name = getattr(t, "__name__", str(t)) diff --git a/dlt/common/versioned_state.py b/dlt/common/versioned_state.py new file mode 100644 index 0000000000..a051a6660c --- /dev/null +++ b/dlt/common/versioned_state.py @@ -0,0 +1,45 @@ +import base64 +import hashlib +from copy import copy + +import datetime # noqa: 251 +from dlt.common import json +from typing import TypedDict, Dict, Any, List, Tuple, cast + + +class TVersionedState(TypedDict, total=False): + _state_version: int + _version_hash: str + _state_engine_version: int + + +def generate_state_version_hash(state: TVersionedState, exclude_attrs: List[str] = None) -> str: + # generates hash out of stored schema content, excluding hash itself, version and local state + state_copy = copy(state) + exclude_attrs = exclude_attrs or [] + exclude_attrs.extend(["_state_version", "_state_engine_version", "_version_hash"]) + for attr in exclude_attrs: + state_copy.pop(attr, None) # type: ignore + content = json.typed_dumpb(state_copy, sort_keys=True) # type: ignore + h = hashlib.sha3_256(content) + return base64.b64encode(h.digest()).decode("ascii") + + +def bump_state_version_if_modified( + state: TVersionedState, exclude_attrs: List[str] = None +) -> Tuple[int, str, str]: + """Bumps the `state` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" + hash_ = generate_state_version_hash(state, exclude_attrs) + previous_hash = state.get("_version_hash") + if not previous_hash: + # if hash was not set, set it without bumping the version, that's the initial state + pass + elif hash_ != previous_hash: + state["_state_version"] += 1 + + state["_version_hash"] = hash_ + return state["_state_version"], hash_, previous_hash + + +def default_versioned_state() -> TVersionedState: + return {"_state_version": 0, "_state_engine_version": 1} diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index c0a0b419c1..4a10deffc0 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -10,6 +10,7 @@ 
from dlt.destinations.impl.qdrant.factory import qdrant from dlt.destinations.impl.motherduck.factory import motherduck from dlt.destinations.impl.weaviate.factory import weaviate +from dlt.destinations.impl.destination.factory import destination from dlt.destinations.impl.synapse.factory import synapse from dlt.destinations.impl.databricks.factory import databricks @@ -29,4 +30,5 @@ "weaviate", "synapse", "databricks", + "destination", ] diff --git a/dlt/destinations/decorators.py b/dlt/destinations/decorators.py new file mode 100644 index 0000000000..62d059c4a6 --- /dev/null +++ b/dlt/destinations/decorators.py @@ -0,0 +1,96 @@ +import functools + +from typing import Any, Type, Optional, Callable, Union, cast +from typing_extensions import Concatenate +from dlt.common.typing import AnyFun + +from functools import wraps + +from dlt.common import logger +from dlt.destinations.impl.destination.factory import destination as _destination +from dlt.destinations.impl.destination.configuration import ( + TDestinationCallableParams, + CustomDestinationClientConfiguration, +) +from dlt.common.destination import TLoaderFileFormat +from dlt.common.destination.reference import Destination +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema + + +def destination( + func: Optional[AnyFun] = None, + /, + loader_file_format: TLoaderFileFormat = None, + batch_size: int = 10, + name: str = None, + naming_convention: str = "direct", + skip_dlt_columns_and_tables: bool = True, + max_table_nesting: int = 0, + spec: Type[CustomDestinationClientConfiguration] = None, +) -> Callable[ + [Callable[Concatenate[Union[TDataItems, str], TTableSchema, TDestinationCallableParams], Any]], + Callable[TDestinationCallableParams, _destination], +]: + """A decorator that transforms a function that takes two positional arguments "table" and "items" and any number of keyword arguments with defaults + into a callable that will create a custom destination. The function does not return anything, the keyword arguments can be configuration and secrets values. + + #### Example Usage with Configuration and Secrets: + + >>> @dlt.destination(batch_size=100, loader_file_format="parquet") + >>> def my_destination(items, table, api_url: str = dlt.config.value, api_secret = dlt.secrets.value): + >>> print(table["name"]) + >>> print(items) + >>> + >>> p = dlt.pipeline("chess_pipeline", destination=my_destination) + + Here all incoming data will be sent to the destination function with the items in the requested format and the dlt table schema. + The config and secret values will be resolved from the path destination.my_destination.api_url and destination.my_destination.api_secret. + + #### Args: + batch_size: defines how many items per function call are batched together and sent as an array. If you set a batch-size of 0, instead of passing in actual dataitems, you will receive one call per load job with the path of the file as the items argument. You can then open and process that file in any way you like. + loader_file_format: defines in which format files are stored in the load package before being sent to the destination function, this can be puae-jsonl or parquet. + name: defines the name of the destination that get's created by the destination decorator, defaults to the name of the function + naming_convention: defines the name of the destination that gets created by the destination decorator. This controls how table and column names are normalized. 
The default is direct, which will keep all names the same. + max_table_nesting: defines how deep the normalizer will go to normalize complex fields on your data to create subtables. This overwrites any settings on your source and is set to zero to not create any nested tables by default. + skip_dlt_columns_and_tables: defines whether internal tables and columns will be fed into the custom destination function. This is set to True by default. + spec: defines a configuration spec that will be used to inject arguments into the decorated functions. Arguments not in the spec will not be injected. + + Returns: + A callable that can be used to create a dlt custom destination instance + """ + + def decorator( + destination_callable: Callable[ + Concatenate[Union[TDataItems, str], TTableSchema, TDestinationCallableParams], Any + ] + ) -> Callable[TDestinationCallableParams, _destination]: + @wraps(destination_callable) + def wrapper( + *args: TDestinationCallableParams.args, **kwargs: TDestinationCallableParams.kwargs + ) -> _destination: + if args: + logger.warning( + "Ignoring positional arguments for destination callable %s", + destination_callable, + ) + return _destination( + spec=spec, + destination_callable=destination_callable, + loader_file_format=loader_file_format, + batch_size=batch_size, + destination_name=name, + naming_convention=naming_convention, + skip_dlt_columns_and_tables=skip_dlt_columns_and_tables, + max_table_nesting=max_table_nesting, + **kwargs, # type: ignore + ) + + return wrapper + + if func is None: + # we're called with parens. + return decorator + + # we're called as @destination without parens. + return decorator(func) # type: ignore diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 9d79d8bf55..b323832418 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -37,7 +37,7 @@ from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition, TTableFormat from dlt.common.schema.utils import table_schema_has_type, get_table_format from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import LoadJob, FollowupJob +from dlt.common.destination.reference import LoadJob, DoNothingFollowupJob, DoNothingJob from dlt.common.destination.reference import TLoadJobState, NewLoadJob, SupportsStagingDestination from dlt.common.storages import FileStorage from dlt.common.data_writers.escape import escape_bigquery_identifier @@ -149,27 +149,6 @@ def __init__(self) -> None: DLTAthenaFormatter._INSTANCE = self -class DoNothingJob(LoadJob): - """The most lazy class of dlt""" - - def __init__(self, file_path: str) -> None: - super().__init__(FileStorage.get_file_name_from_file_path(file_path)) - - def state(self) -> TLoadJobState: - # this job is always done - return "completed" - - def exception(self) -> str: - # this part of code should be never reached - raise NotImplementedError() - - -class DoNothingFollowupJob(DoNothingJob, FollowupJob): - """The second most lazy class of dlt""" - - pass - - class AthenaSQLClient(SqlClientBase[Connection]): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() dbapi: ClassVar[DBApi] = pyathena diff --git a/dlt/destinations/impl/destination/__init__.py b/dlt/destinations/impl/destination/__init__.py new file mode 100644 index 0000000000..560c9d4eda --- /dev/null +++ b/dlt/destinations/impl/destination/__init__.py @@ -0,0 +1,17 @@ +from typing import Optional +from dlt.common.destination
import DestinationCapabilitiesContext +from dlt.common.data_writers import TLoaderFileFormat + + +def capabilities( + preferred_loader_file_format: TLoaderFileFormat = "puae-jsonl", + naming_convention: str = "direct", + max_table_nesting: Optional[int] = 0, +) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format) + caps.supported_loader_file_formats = ["puae-jsonl", "parquet"] + caps.supports_ddl_transactions = False + caps.supports_transactions = False + caps.naming_convention = naming_convention + caps.max_table_nesting = max_table_nesting + return caps diff --git a/dlt/destinations/impl/destination/configuration.py b/dlt/destinations/impl/destination/configuration.py new file mode 100644 index 0000000000..f123ba69b3 --- /dev/null +++ b/dlt/destinations/impl/destination/configuration.py @@ -0,0 +1,34 @@ +from typing import TYPE_CHECKING, Optional, Final, Callable, Union, Any +from typing_extensions import ParamSpec + +from dlt.common.configuration import configspec +from dlt.common.destination import TLoaderFileFormat +from dlt.common.destination.reference import ( + DestinationClientConfiguration, +) +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema +from dlt.common.destination import Destination + +TDestinationCallable = Callable[[Union[TDataItems, str], TTableSchema], None] +TDestinationCallableParams = ParamSpec("TDestinationCallableParams") + + +@configspec +class CustomDestinationClientConfiguration(DestinationClientConfiguration): + destination_type: Final[str] = "destination" # type: ignore + destination_callable: Optional[Union[str, TDestinationCallable]] = None # noqa: A003 + loader_file_format: TLoaderFileFormat = "puae-jsonl" + batch_size: int = 10 + skip_dlt_columns_and_tables: bool = True + max_table_nesting: int = 0 + + if TYPE_CHECKING: + + def __init__( + self, + *, + loader_file_format: TLoaderFileFormat = "puae-jsonl", + batch_size: int = 10, + destination_callable: Union[TDestinationCallable, str] = None, + ) -> None: ... 
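The capabilities and configuration spec above are what the @dlt.destination decorator from dlt/destinations/decorators.py (earlier in this diff) resolves against. A minimal usage sketch, assuming a local JSONL file as the sink target; the file prefix, pipeline name and table name are illustrative, while the decorator arguments and the (items, table) callable signature come from the decorator docstring:

import json as _json

import dlt
from dlt.common.typing import TDataItems
from dlt.common.schema import TTableSchema


@dlt.destination(batch_size=25, loader_file_format="puae-jsonl", name="local_sink")
def local_sink(items: TDataItems, table: TTableSchema, prefix: str = "./out") -> None:
    # each call receives at most `batch_size` rows plus the dlt table schema;
    # `prefix` could instead default to dlt.config.value so it is resolved from
    # destination.local_sink.prefix, as described in the decorator docstring
    with open(f"{prefix}_{table['name']}.jsonl", "a", encoding="utf-8") as f:
        for row in items:
            f.write(_json.dumps(row, default=str) + "\n")


pipeline = dlt.pipeline("local_sink_pipeline", destination=local_sink)
pipeline.run([{"id": 1}, {"id": 2}], table_name="events")

With batch_size=0 the callable would instead receive the path of each load-package file, which is the mode DestinationLoadJob (next file in this diff) uses to hand over whole files.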
diff --git a/dlt/destinations/impl/destination/destination.py b/dlt/destinations/impl/destination/destination.py new file mode 100644 index 0000000000..4a3cabde34 --- /dev/null +++ b/dlt/destinations/impl/destination/destination.py @@ -0,0 +1,212 @@ +from abc import ABC, abstractmethod +from types import TracebackType +from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict, List +from copy import deepcopy + +from dlt.common.destination.reference import LoadJob +from dlt.destinations.job_impl import EmptyLoadJob +from dlt.common.typing import TDataItems, AnyFun +from dlt.common import json +from dlt.pipeline.current import ( + destination_state, + commit_load_package_state, +) +from dlt.common.configuration import create_resolved_partial + +from dlt.common.schema import Schema, TTableSchema, TSchemaTables +from dlt.common.schema.typing import TTableSchema +from dlt.common.storages import FileStorage +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.reference import ( + TLoadJobState, + LoadJob, + DoNothingJob, + JobClientBase, +) + +from dlt.destinations.impl.destination import capabilities +from dlt.destinations.impl.destination.configuration import ( + CustomDestinationClientConfiguration, + TDestinationCallable, +) + + +class DestinationLoadJob(LoadJob, ABC): + def __init__( + self, + table: TTableSchema, + file_path: str, + config: CustomDestinationClientConfiguration, + schema: Schema, + destination_state: Dict[str, int], + destination_callable: TDestinationCallable, + skipped_columns: List[str], + ) -> None: + super().__init__(FileStorage.get_file_name_from_file_path(file_path)) + self._file_path = file_path + self._config = config + self._table = table + self._schema = schema + # we create pre_resolved callable here + self._callable = destination_callable + self._state: TLoadJobState = "running" + self._storage_id = f"{self._parsed_file_name.table_name}.{self._parsed_file_name.file_id}" + self.skipped_columns = skipped_columns + try: + if self._config.batch_size == 0: + # on batch size zero we only call the callable with the filename + self.call_callable_with_items(self._file_path) + else: + current_index = destination_state.get(self._storage_id, 0) + for batch in self.run(current_index): + self.call_callable_with_items(batch) + current_index += len(batch) + destination_state[self._storage_id] = current_index + + self._state = "completed" + except Exception as e: + self._state = "retry" + raise e + finally: + # save progress + commit_load_package_state() + + @abstractmethod + def run(self, start_index: int) -> Iterable[TDataItems]: + pass + + def call_callable_with_items(self, items: TDataItems) -> None: + if not items: + return + # call callable + self._callable(items, self._table) + + def state(self) -> TLoadJobState: + return self._state + + def exception(self) -> str: + raise NotImplementedError() + + +class DestinationParquetLoadJob(DestinationLoadJob): + def run(self, start_index: int) -> Iterable[TDataItems]: + # stream items + from dlt.common.libs.pyarrow import pyarrow + + # guard against changed batch size after restart of loadjob + assert ( + start_index % self._config.batch_size + ) == 0, "Batch size was changed during processing of one load package" + + # on record batches we cannot drop columns, we need to + # select the ones we want to keep + keep_columns = list(self._table["columns"].keys()) + start_batch = start_index / self._config.batch_size + with pyarrow.parquet.ParquetFile(self._file_path) 
as reader: + for record_batch in reader.iter_batches( + batch_size=self._config.batch_size, columns=keep_columns + ): + if start_batch > 0: + start_batch -= 1 + continue + yield record_batch + + +class DestinationJsonlLoadJob(DestinationLoadJob): + def run(self, start_index: int) -> Iterable[TDataItems]: + current_batch: TDataItems = [] + + # stream items + with FileStorage.open_zipsafe_ro(self._file_path) as f: + encoded_json = json.typed_loads(f.read()) + + for item in encoded_json: + # find correct start position + if start_index > 0: + start_index -= 1 + continue + # skip internal columns + for column in self.skipped_columns: + item.pop(column, None) + current_batch.append(item) + if len(current_batch) == self._config.batch_size: + yield current_batch + current_batch = [] + yield current_batch + + +class DestinationClient(JobClientBase): + """Sink Client""" + + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def __init__(self, schema: Schema, config: CustomDestinationClientConfiguration) -> None: + super().__init__(schema, config) + self.config: CustomDestinationClientConfiguration = config + # create pre-resolved callable to avoid multiple config resolutions during execution of the jobs + self.destination_callable = create_resolved_partial( + cast(AnyFun, self.config.destination_callable), self.config + ) + + def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: + pass + + def is_storage_initialized(self) -> bool: + return True + + def drop_storage(self) -> None: + pass + + def update_stored_schema( + self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + ) -> Optional[TSchemaTables]: + return super().update_stored_schema(only_tables, expected_update) + + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + # skip internal tables and remove columns from schema if so configured + skipped_columns: List[str] = [] + if self.config.skip_dlt_columns_and_tables: + if table["name"].startswith(self.schema._dlt_tables_prefix): + return DoNothingJob(file_path) + table = deepcopy(table) + for column in list(table["columns"].keys()): + if column.startswith(self.schema._dlt_tables_prefix): + table["columns"].pop(column) + skipped_columns.append(column) + + # save our state in destination name scope + load_state = destination_state() + if file_path.endswith("parquet"): + return DestinationParquetLoadJob( + table, + file_path, + self.config, + self.schema, + load_state, + self.destination_callable, + skipped_columns, + ) + if file_path.endswith("jsonl"): + return DestinationJsonlLoadJob( + table, + file_path, + self.config, + self.schema, + load_state, + self.destination_callable, + skipped_columns, + ) + return None + + def restore_file_load(self, file_path: str) -> LoadJob: + return EmptyLoadJob.from_file_path(file_path, "completed") + + def complete_load(self, load_id: str) -> None: ... 
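DestinationLoadJob above checkpoints its progress in the load package's destination state under a "{table_name}.{file_id}" key and commits after every delivered batch, so a retried job skips work it already handed to the callable. A standalone sketch of that bookkeeping, using assumed names rather than the dlt classes themselves:

from typing import Dict, Iterable, List


def replay_batches(
    rows: List[dict], batch_size: int, state: Dict[str, int], storage_id: str
) -> Iterable[List[dict]]:
    # resume from the last committed position for this file
    start_index = state.get(storage_id, 0)
    for i in range(start_index, len(rows), batch_size):
        batch = rows[i : i + batch_size]
        yield batch
        # commit progress only after the batch was successfully delivered
        state[storage_id] = i + len(batch)


state: Dict[str, int] = {}
rows = [{"id": n} for n in range(10)]
# first attempt delivers two batches of four rows, then "fails"
first_attempt = replay_batches(rows, 4, state, "events.0001")
next(first_attempt)
next(first_attempt)
# the retry resumes at index 8 instead of re-sending rows 0-7
assert list(replay_batches(rows, 4, state, "events.0001")) == [[{"id": 8}, {"id": 9}]]

The assertion in DestinationParquetLoadJob.run (start_index % batch_size == 0) guards exactly this scheme: if the batch size changes between attempts, the saved index no longer lines up with whole batches.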
+ + def __enter__(self) -> "DestinationClient": + return self + + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: + pass diff --git a/dlt/destinations/impl/destination/factory.py b/dlt/destinations/impl/destination/factory.py new file mode 100644 index 0000000000..7cca8f2202 --- /dev/null +++ b/dlt/destinations/impl/destination/factory.py @@ -0,0 +1,144 @@ +import typing as t +import inspect +from importlib import import_module + +from types import ModuleType +from dlt.common.typing import AnyFun + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.exceptions import DestinationTransientException +from dlt.common.configuration import known_sections, with_config, get_fun_spec +from dlt.common.configuration.exceptions import ConfigurationValueError +from dlt.common import logger + +from dlt.destinations.impl.destination.configuration import ( + CustomDestinationClientConfiguration, + TDestinationCallable, +) +from dlt.destinations.impl.destination import capabilities +from dlt.common.data_writers import TLoaderFileFormat +from dlt.common.utils import get_callable_name, is_inner_callable + +if t.TYPE_CHECKING: + from dlt.destinations.impl.destination.destination import DestinationClient + + +class DestinationInfo(t.NamedTuple): + """Runtime information on a discovered destination""" + + SPEC: t.Type[CustomDestinationClientConfiguration] + f: AnyFun + module: ModuleType + + +_DESTINATIONS: t.Dict[str, DestinationInfo] = {} +"""A registry of all the decorated destinations""" + + +class destination(Destination[CustomDestinationClientConfiguration, "DestinationClient"]): + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities( + preferred_loader_file_format=self.config_params.get("loader_file_format", "puae-jsonl"), + naming_convention=self.config_params.get("naming_convention", "direct"), + max_table_nesting=self.config_params.get("max_table_nesting", None), + ) + + @property + def spec(self) -> t.Type[CustomDestinationClientConfiguration]: + """A spec of destination configuration resolved from the sink function signature""" + return self._spec + + @property + def client_class(self) -> t.Type["DestinationClient"]: + from dlt.destinations.impl.destination.destination import DestinationClient + + return DestinationClient + + def __init__( + self, + destination_callable: t.Union[TDestinationCallable, str] = None, # noqa: A003 + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + loader_file_format: TLoaderFileFormat = None, + batch_size: int = 10, + naming_convention: str = "direct", + spec: t.Type[CustomDestinationClientConfiguration] = None, + **kwargs: t.Any, + ) -> None: + if spec and not issubclass(spec, CustomDestinationClientConfiguration): + raise ValueError( + "A SPEC for a sink destination must use CustomDestinationClientConfiguration as a" + " base." 
+ ) + # resolve callable + if callable(destination_callable): + pass + elif destination_callable: + try: + module_path, attr_name = destination_callable.rsplit(".", 1) + dest_module = import_module(module_path) + except ModuleNotFoundError as e: + raise ConfigurationValueError( + f"Could not find callable module at {module_path}" + ) from e + try: + destination_callable = getattr(dest_module, attr_name) + except AttributeError as e: + raise ConfigurationValueError( + f"Could not find callable function at {destination_callable}" + ) from e + + # provide dummy callable for cases where no callable is provided + # this is needed for cli commands to work + if not destination_callable: + logger.warning( + "No destination callable provided, providing dummy callable which will fail on" + " load." + ) + + def dummy_callable(*args: t.Any, **kwargs: t.Any) -> None: + raise DestinationTransientException( + "You tried to load to a custom destination without a valid callable." + ) + + destination_callable = dummy_callable + + elif not callable(destination_callable): + raise ConfigurationValueError("Resolved Sink destination callable is not a callable.") + + # resolve destination name + if destination_name is None: + destination_name = get_callable_name(destination_callable) + func_module = inspect.getmodule(destination_callable) + + # build destination spec + destination_sections = (known_sections.DESTINATION, destination_name) + conf_callable = with_config( + destination_callable, + spec=spec, + sections=destination_sections, + include_defaults=True, + base=None if spec else CustomDestinationClientConfiguration, + ) + + # save destination in registry + resolved_spec = t.cast( + t.Type[CustomDestinationClientConfiguration], get_fun_spec(conf_callable) + ) + # register only standalone destinations, no inner + if not is_inner_callable(destination_callable): + _DESTINATIONS[destination_callable.__qualname__] = DestinationInfo( + resolved_spec, destination_callable, func_module + ) + + # remember spec + self._spec = resolved_spec or spec + super().__init__( + destination_name=destination_name, + environment=environment, + loader_file_format=loader_file_format, + batch_size=batch_size, + naming_convention=naming_convention, + destination_callable=conf_callable, + **kwargs, + ) diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index 695f1a0972..9d872a238e 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -221,7 +221,7 @@ def _get_columns(self) -> List[str]: return [c[0] for c in self.native_cursor.description] def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: - from dlt.common.libs.pandas import _wrap_result + from dlt.common.libs.pandas_sql import _wrap_result columns = self._get_columns() if chunk_size is None: diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index 78e246cd46..03b2e59539 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -4,6 +4,7 @@ from dlt.extract.decorators import source, resource, transformer, defer from dlt.extract.incremental import Incremental from dlt.extract.wrappers import wrap_additional_type +from dlt.extract.extractors import materialize_schema_item __all__ = [ "DltResource", @@ -17,4 +18,5 @@ "defer", "Incremental", "wrap_additional_type", + "materialize_schema_item", ] diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 2ff813a2de..3b3d0704ea 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -244,6 +244,48 @@ def 
_compute_metrics(self, load_id: str, source: DltSource) -> ExtractMetrics: "hints": clean_hints, } + def _write_empty_files( + self, source: DltSource, extractors: Dict[TLoaderFileFormat, Extractor] + ) -> None: + schema = source.schema + json_extractor = extractors["puae-jsonl"] + resources_with_items = set().union(*[e.resources_with_items for e in extractors.values()]) + # find REPLACE resources that did not yield any pipe items and create empty jobs for them + # NOTE: do not include tables that have never seen data + data_tables = {t["name"]: t for t in schema.data_tables(seen_data_only=True)} + tables_by_resources = utils.group_tables_by_resource(data_tables) + for resource in source.resources.selected.values(): + if resource.write_disposition != "replace" or resource.name in resources_with_items: + continue + if resource.name not in tables_by_resources: + continue + for table in tables_by_resources[resource.name]: + # we only need to write empty files for the top tables + if not table.get("parent", None): + json_extractor.write_empty_items_file(table["name"]) + + # collect resources that received empty materialized lists and had no items + resources_with_empty = ( + set() + .union(*[e.resources_with_empty for e in extractors.values()]) + .difference(resources_with_items) + ) + # get all possible tables + data_tables = {t["name"]: t for t in schema.data_tables()} + tables_by_resources = utils.group_tables_by_resource(data_tables) + for resource_name in resources_with_empty: + if resource := source.resources.selected.get(resource_name): + if tables := tables_by_resources.get(resource_name): + # write empty tables + for table in tables: + # we only need to write empty files for the top tables + if not table.get("parent", None): + json_extractor.write_empty_items_file(table["name"]) + else: + table_name = json_extractor._get_static_table_name(resource, None) + if table_name: + json_extractor.write_empty_items_file(table_name) + def _extract_single_source( self, load_id: str, @@ -255,14 +297,11 @@ def _extract_single_source( ) -> None: schema = source.schema collector = self.collector - resources_with_items: Set[str] = set() extractors: Dict[TLoaderFileFormat, Extractor] = { "puae-jsonl": JsonLExtractor( - load_id, self.extract_storage, schema, resources_with_items, collector=collector - ), - "arrow": ArrowExtractor( - load_id, self.extract_storage, schema, resources_with_items, collector=collector + load_id, self.extract_storage, schema, collector=collector ), + "arrow": ArrowExtractor(load_id, self.extract_storage, schema, collector=collector), } last_item_format: Optional[TLoaderFileFormat] = None @@ -294,23 +333,7 @@ def _extract_single_source( extractors[item_format].write_items(resource, pipe_item.item, pipe_item.meta) last_item_format = item_format - # find defined resources that did not yield any pipeitems and create empty jobs for them - # NOTE: do not include incomplete tables.
those tables have never seen data so we do not need to reset them - data_tables = {t["name"]: t for t in schema.data_tables(include_incomplete=False)} - tables_by_resources = utils.group_tables_by_resource(data_tables) - for resource in source.resources.selected.values(): - if ( - resource.write_disposition != "replace" - or resource.name in resources_with_items - ): - continue - if resource.name not in tables_by_resources: - continue - for table in tables_by_resources[resource.name]: - # we only need to write empty files for the top tables - if not table.get("parent", None): - extractors["puae-jsonl"].write_empty_items_file(table["name"]) - + self._write_empty_files(source, extractors) if left_gens > 0: # go to 100% collector.update("Resources", left_gens) diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 84abb4f3a8..52ecd66920 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -1,5 +1,5 @@ from copy import copy -from typing import Set, Dict, Any, Optional, Set +from typing import Set, Dict, Any, Optional, List from dlt.common import logger from dlt.common.configuration.inject import with_config @@ -29,13 +29,25 @@ from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem except MissingDependencyException: pyarrow = None + pa = None try: - from dlt.common.libs.pandas import pandas + from dlt.common.libs.pandas import pandas, pandas_to_arrow except MissingDependencyException: pandas = None +class MaterializedEmptyList(List[Any]): + """A list variant that will materialize tables even if empty list was yielded""" + + pass + + +def materialize_schema_item() -> MaterializedEmptyList: + """Yield this to materialize schema in the destination, even if there's no data.""" + return MaterializedEmptyList() + + class Extractor: file_format: TLoaderFileFormat @@ -49,7 +61,6 @@ def __init__( load_id: str, storage: ExtractStorage, schema: Schema, - resources_with_items: Set[str], collector: Collector = NULL_COLLECTOR, *, _caps: DestinationCapabilitiesContext = None, @@ -57,7 +68,10 @@ def __init__( self.schema = schema self.naming = schema.naming self.collector = collector - self.resources_with_items = resources_with_items + self.resources_with_items: Set[str] = set() + """Tracks resources that received items""" + self.resources_with_empty: Set[str] = set() + """Track resources that received empty materialized list""" self.load_id = load_id self._table_contracts: Dict[str, TSchemaContractDict] = {} self._filtered_tables: Set[str] = set() @@ -130,6 +144,9 @@ def _write_item( self.collector.update(table_name, inc=new_rows_count) if new_rows_count > 0: self.resources_with_items.add(resource_name) + else: + if isinstance(items, MaterializedEmptyList): + self.resources_with_empty.add(resource_name) def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None: if not isinstance(items, list): @@ -224,7 +241,7 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No for tbl in ( ( # 1. 
Convert pandas frame(s) to arrow Table - pa.Table.from_pandas(item) + pandas_to_arrow(item) if (pandas and isinstance(item, pandas.DataFrame)) else item ) @@ -295,7 +312,6 @@ def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTa # issue warnings when overriding computed with arrow for col_name, column in arrow_table["columns"].items(): if src_column := computed_table["columns"].get(col_name): - print(src_column) for hint_name, hint in column.items(): if (src_hint := src_column.get(hint_name)) is not None: if src_hint != hint: diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index f298e414a1..54ce00a806 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -82,6 +82,8 @@ def make_hints( ) if not table_name: new_template.pop("name") + if not write_disposition and "write_disposition" in new_template: + new_template.pop("write_disposition") # remember original columns if columns is not None: new_template["original_columns"] = columns @@ -197,10 +199,11 @@ def apply_hints( """ if not self._hints: # if there is no template yet, create and set a new one. + default_wd = None if parent_table_name else DEFAULT_WRITE_DISPOSITION t = make_hints( table_name, parent_table_name, - write_disposition, + write_disposition or default_wd, columns, primary_key, merge_key, diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 54e8b3d447..e74e87d094 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -6,6 +6,7 @@ import inspect from functools import wraps + import dlt from dlt.common.exceptions import MissingDependencyException from dlt.common import pendulum, logger @@ -163,11 +164,12 @@ def _make_transforms(self) -> None: self._transformers[dt] = kls( self.resource_name, self.cursor_path, + self.initial_value, self.start_value, self.end_value, - self._cached_state, self.last_value_func, self._primary_key, + set(self._cached_state["unique_hashes"]), ) @classmethod @@ -453,14 +455,28 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: return rows transformer = self._get_transformer(rows) - if isinstance(rows, list): - return [ + rows = [ item for item in (self._transform_item(transformer, row) for row in rows) if item is not None ] - return self._transform_item(transformer, rows) + else: + rows = self._transform_item(transformer, rows) + + # write back state + self._cached_state["last_value"] = transformer.last_value + if not transformer.deduplication_disabled: + # compute hashes for new last rows + unique_hashes = set( + transformer.compute_unique_value(row, self.primary_key) + for row in transformer.last_rows + ) + # add directly computed hashes + unique_hashes.update(transformer.unique_hashes) + self._cached_state["unique_hashes"] = list(unique_hashes) + + return rows Incremental.EMPTY = Incremental[Any]("") diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index e20617cf63..29b20de7b8 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -1,24 +1,23 @@ from datetime import datetime, date # noqa: I251 -from typing import Any, Optional, Tuple, List +from typing import Any, Optional, Set, Tuple, List from dlt.common.exceptions import MissingDependencyException from dlt.common.utils import digest128 from dlt.common.json import json from dlt.common import pendulum -from dlt.common.typing import TDataItem, TDataItems -from dlt.common.jsonpath import TJsonPath, find_values, 
JSONPathFields, compile_path +from dlt.common.typing import TDataItem +from dlt.common.jsonpath import find_values, JSONPathFields, compile_path from dlt.extract.incremental.exceptions import ( IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) -from dlt.extract.incremental.typing import IncrementalColumnState, TCursorValue, LastValueFunc +from dlt.extract.incremental.typing import TCursorValue, LastValueFunc from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate from dlt.common.schema.typing import TColumnNames try: from dlt.common.libs import pyarrow - from dlt.common.libs.pandas import pandas from dlt.common.libs.numpy import numpy from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem from dlt.common.libs.pyarrow import from_arrow_scalar, to_arrow_scalar @@ -26,6 +25,11 @@ pa = None pyarrow = None numpy = None + +# NOTE: always import pandas independently from pyarrow +try: + from dlt.common.libs.pandas import pandas, pandas_to_arrow +except MissingDependencyException: pandas = None @@ -34,19 +38,24 @@ def __init__( self, resource_name: str, cursor_path: str, + initial_value: Optional[TCursorValue], start_value: Optional[TCursorValue], end_value: Optional[TCursorValue], - incremental_state: IncrementalColumnState, last_value_func: LastValueFunc[TCursorValue], primary_key: Optional[TTableHintTemplate[TColumnNames]], + unique_hashes: Set[str], ) -> None: self.resource_name = resource_name self.cursor_path = cursor_path + self.initial_value = initial_value self.start_value = start_value + self.last_value = start_value self.end_value = end_value - self.incremental_state = incremental_state + self.last_rows: List[TDataItem] = [] self.last_value_func = last_value_func self.primary_key = primary_key + self.unique_hashes = unique_hashes + self.start_unique_hashes = set(unique_hashes) # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) @@ -59,20 +68,17 @@ def __init__( self.cursor_path = self._compiled_cursor_path.fields[0] self._compiled_cursor_path = None - def __call__( - self, - row: TDataItem, - ) -> Tuple[bool, bool, bool]: ... - - -class JsonIncremental(IncrementalTransform): - def unique_value( + def compute_unique_value( self, row: TDataItem, primary_key: Optional[TTableHintTemplate[TColumnNames]], - resource_name: str, ) -> str: try: + assert not self.deduplication_disabled, ( + f"{self.resource_name}: Attempt to compute unique values when deduplication is" + " disabled" + ) + if primary_key: return digest128(json.dumps(resolve_column_value(primary_key, row), sort_keys=True)) elif primary_key is None: @@ -80,8 +86,20 @@ def unique_value( else: return None except KeyError as k_err: - raise IncrementalPrimaryKeyMissing(resource_name, k_err.args[0], row) + raise IncrementalPrimaryKeyMissing(self.resource_name, k_err.args[0], row) + def __call__( + self, + row: TDataItem, + ) -> Tuple[bool, bool, bool]: ... + + @property + def deduplication_disabled(self) -> bool: + """Skip deduplication when length of the key is 0""" + return isinstance(self.primary_key, (list, tuple)) and len(self.primary_key) == 0 + + +class JsonIncremental(IncrementalTransform): def find_cursor_value(self, row: TDataItem) -> Any: """Finds value in row at cursor defined by self.cursor_path. 
@@ -113,7 +131,8 @@ def __call__( return row, False, False row_value = self.find_cursor_value(row) - last_value = self.incremental_state["last_value"] + last_value = self.last_value + last_value_func = self.last_value_func # For datetime cursor, ensure the value is a timezone aware datetime. # The object saved in state will always be a tz aware pendulum datetime so this ensures values are comparable @@ -128,41 +147,45 @@ def __call__( # Check whether end_value has been reached # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None and ( - self.last_value_func((row_value, self.end_value)) != self.end_value - or self.last_value_func((row_value,)) == self.end_value + last_value_func((row_value, self.end_value)) != self.end_value + or last_value_func((row_value,)) == self.end_value ): return None, False, True check_values = (row_value,) + ((last_value,) if last_value is not None else ()) - new_value = self.last_value_func(check_values) + new_value = last_value_func(check_values) + # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: - processed_row_value = self.last_value_func((row_value,)) - # we store row id for all records with the current "last_value" in state and use it to deduplicate - - if processed_row_value == last_value: - unique_value = self.unique_value(row, self.primary_key, self.resource_name) - # if unique value exists then use it to deduplicate - if unique_value: - if unique_value in self.incremental_state["unique_hashes"]: - return None, False, False - # add new hash only if the record row id is same as current last value - self.incremental_state["unique_hashes"].append(unique_value) - return row, False, False - # skip the record that is not a last_value or new_value: that record was already processed + # use func to compute row_value into last_value compatible + processed_row_value = last_value_func((row_value,)) + # skip the record that is not a start_value or new_value: that record was already processed check_values = (row_value,) + ( (self.start_value,) if self.start_value is not None else () ) - new_value = self.last_value_func(check_values) + new_value = last_value_func(check_values) # Include rows == start_value but exclude "lower" - if new_value == self.start_value and processed_row_value != self.start_value: - return None, True, False - else: - return row, False, False + # new_value is "less" or equal to start_value (the initial max) + if new_value == self.start_value: + # if equal there's still a chance that item gets in + if processed_row_value == self.start_value: + if not self.deduplication_disabled: + unique_value = self.compute_unique_value(row, self.primary_key) + # if unique value exists then use it to deduplicate + if unique_value in self.start_unique_hashes: + return None, True, False + else: + # smaller than start value: gets out + return None, True, False + + # we store row id for all records with the current "last_value" in state and use it to deduplicate + if processed_row_value == last_value: + # add new hash only if the record row id is same as current last value + self.last_rows.append(row) else: - self.incremental_state["last_value"] = new_value - unique_value = self.unique_value(row, self.primary_key, self.resource_name) - if unique_value: - self.incremental_state["unique_hashes"] = [unique_value] + self.last_value = new_value + # store rows with "max" values to compute hashes after processing full batch + self.last_rows = [row] + 
self.unique_hashes = set() return row, False, False @@ -170,21 +193,25 @@ def __call__( class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" - def unique_values( - self, item: "TAnyArrowItem", unique_columns: List[str], resource_name: str + def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str]) -> List[str]: + if not unique_columns: + return [] + rows = item.select(unique_columns).to_pylist() + return [self.compute_unique_value(row, self.primary_key) for row in rows] + + def compute_unique_values_with_index( + self, item: "TAnyArrowItem", unique_columns: List[str] ) -> List[Tuple[int, str]]: if not unique_columns: return [] - item = item indices = item[self._dlt_index].to_pylist() rows = item.select(unique_columns).to_pylist() return [ - (index, digest128(json.dumps(row, sort_keys=True))) for index, row in zip(indices, rows) + (index, self.compute_unique_value(row, self.primary_key)) + for index, row in zip(indices, rows) ] - def _deduplicate( - self, tbl: "pa.Table", unique_columns: Optional[List[str]], aggregate: str, cursor_path: str - ) -> "pa.Table": + def _add_unique_index(self, tbl: "pa.Table") -> "pa.Table": """Creates unique index if necessary.""" # create unique index if necessary if self._dlt_index not in tbl.schema.names: @@ -197,7 +224,7 @@ def __call__( ) -> Tuple[TDataItem, bool, bool]: is_pandas = pandas is not None and isinstance(tbl, pandas.DataFrame) if is_pandas: - tbl = pa.Table.from_pandas(tbl) + tbl = pandas_to_arrow(tbl) primary_key = self.primary_key(tbl) if callable(self.primary_key) else self.primary_key if primary_key: @@ -215,24 +242,18 @@ def __call__( self._dlt_index = primary_key elif primary_key is None: unique_columns = tbl.schema.names - else: # deduplicating is disabled - unique_columns = None start_out_of_range = end_out_of_range = False if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - last_value = self.incremental_state["last_value"] - if self.last_value_func is max: compute = pa.compute.max - aggregate = "max" end_compare = pa.compute.less last_value_compare = pa.compute.greater_equal new_value_compare = pa.compute.greater elif self.last_value_func is min: compute = pa.compute.min - aggregate = "min" end_compare = pa.compute.greater last_value_compare = pa.compute.less_equal new_value_compare = pa.compute.less @@ -267,64 +288,56 @@ def __call__( # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() - if last_value is not None: - if self.start_value is not None: - # Remove rows lower than the last start value - keep_filter = last_value_compare( - tbl[cursor_path], to_arrow_scalar(self.start_value, cursor_data_type) + if self.start_value is not None: + start_value_scalar = to_arrow_scalar(self.start_value, cursor_data_type) + # Remove rows lower or equal than the last start value + keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) + start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) + tbl = tbl.filter(keep_filter) + if not self.deduplication_disabled: + # Deduplicate after filtering old values + tbl = self._add_unique_index(tbl) + # Remove already processed rows where the cursor is equal to the start value + eq_rows = tbl.filter(pa.compute.equal(tbl[cursor_path], start_value_scalar)) + # compute index, unique hash mapping + unique_values_index = self.compute_unique_values_with_index(eq_rows, unique_columns) + unique_values_index = [ + (i, uq_val) + for i, uq_val in unique_values_index + if uq_val in self.start_unique_hashes + ] + # find rows with unique ids that were stored from previous run + remove_idx = pa.array(i for i, _ in unique_values_index) + # Filter the table + tbl = tbl.filter( + pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx)) ) - start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) - tbl = tbl.filter(keep_filter) - - # Deduplicate after filtering old values - last_value_scalar = to_arrow_scalar(last_value, cursor_data_type) - tbl = self._deduplicate(tbl, unique_columns, aggregate, cursor_path) - # Remove already processed rows where the cursor is equal to the last value - eq_rows = tbl.filter(pa.compute.equal(tbl[cursor_path], last_value_scalar)) - # compute index, unique hash mapping - unique_values = self.unique_values(eq_rows, unique_columns, self.resource_name) - unique_values = [ - (i, uq_val) - for i, uq_val in unique_values - if uq_val in self.incremental_state["unique_hashes"] - ] - remove_idx = pa.array(i for i, _ in unique_values) - # Filter the table - tbl = tbl.filter(pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx))) - - if ( - new_value_compare(row_value_scalar, last_value_scalar).as_py() - and row_value != last_value - ): # Last value has changed - self.incremental_state["last_value"] = row_value + + if ( + self.last_value is None + or new_value_compare( + row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type) + ).as_py() + ): # Last value has changed + self.last_value = row_value + if not self.deduplication_disabled: # Compute unique hashes for all rows equal to row value - self.incremental_state["unique_hashes"] = [ - uq_val - for _, uq_val in self.unique_values( + self.unique_hashes = set( + self.compute_unique_values( tbl.filter(pa.compute.equal(tbl[cursor_path], row_value_scalar)), unique_columns, - self.resource_name, - ) - ] - else: - # last value is unchanged, add the hashes - self.incremental_state["unique_hashes"] = list( - set( - self.incremental_state["unique_hashes"] - + [uq_val for _, uq_val in unique_values] ) ) - else: - tbl = self._deduplicate(tbl, unique_columns, aggregate, cursor_path) - self.incremental_state["last_value"] = row_value - self.incremental_state["unique_hashes"] = [ - uq_val - for _, uq_val in self.unique_values( - tbl.filter(pa.compute.equal(tbl[cursor_path], row_value_scalar)), - 
unique_columns, - self.resource_name, + elif self.last_value == row_value and not self.deduplication_disabled: + # last value is unchanged, add the hashes + self.unique_hashes.update( + set( + self.compute_unique_values( + tbl.filter(pa.compute.equal(tbl[cursor_path], row_value_scalar)), + unique_columns, + ) ) - ] + ) if len(tbl) == 0: return None, start_out_of_range, end_out_of_range diff --git a/dlt/extract/wrappers.py b/dlt/extract/wrappers.py index 7ffb6b4fc6..e761fcdeab 100644 --- a/dlt/extract/wrappers.py +++ b/dlt/extract/wrappers.py @@ -6,11 +6,17 @@ try: from dlt.common.libs.pandas import pandas + + PandaFrame = pandas.DataFrame +except MissingDependencyException: + PandaFrame = NoneType + +try: from dlt.common.libs.pyarrow import pyarrow - PandaFrame, ArrowTable, ArrowRecords = pandas.DataFrame, pyarrow.Table, pyarrow.RecordBatch + ArrowTable, ArrowRecords = pyarrow.Table, pyarrow.RecordBatch except MissingDependencyException: - PandaFrame, ArrowTable, ArrowRecords = NoneType, NoneType, NoneType + ArrowTable, ArrowRecords = NoneType, NoneType def wrap_additional_type(data: Any) -> Any: diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 9a6616e9ea..e01cf790d2 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -1,7 +1,7 @@ import functools import os from tempfile import gettempdir -from typing import Any, Callable, List, Literal, Optional, Sequence, Tuple +from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple from tenacity import ( retry_if_exception, @@ -103,6 +103,7 @@ def __init__( """ super().__init__(group_id=pipeline_name, **kwargs) + self._used_names: Dict[str, Any] = {} self.use_task_logger = use_task_logger self.log_progress_period = log_progress_period self.buffer_max_items = buffer_max_items @@ -132,6 +133,33 @@ def __init__( if ConfigProvidersContext in Container(): del Container()[ConfigProvidersContext] + def _task_name(self, pipeline: Pipeline, data: Any) -> str: + """Generate a task name. + + Args: + pipeline (Pipeline): The pipeline to run. + data (Any): The data to run the pipeline with. + + Returns: + str: The name of the task. 
+ """ + task_name = pipeline.pipeline_name + + if isinstance(data, DltSource): + resource_names = list(data.selected_resources.keys()) + task_name = data.name + "_" + "-".join(resource_names[:4]) + + if len(resource_names) > 4: + task_name += f"-{len(resource_names)-4}-more" + + num = self._used_names.setdefault(task_name, 0) + self._used_names[task_name] = num + 1 + + if num: + task_name += f"-{num + 1}" + + return task_name + def run( self, pipeline: Pipeline, @@ -175,7 +203,7 @@ def run( schema_contract=schema_contract, pipeline_name=pipeline_name, ) - return PythonOperator(task_id=_task_name(pipeline, data), python_callable=f, **kwargs) + return PythonOperator(task_id=self._task_name(pipeline, data), python_callable=f, **kwargs) def _run( self, @@ -363,7 +391,7 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator pipeline_name=name, ) return PythonOperator( - task_id=_task_name(pipeline, data), python_callable=f, **kwargs + task_id=self._task_name(pipeline, data), python_callable=f, **kwargs ) if decompose == "none": @@ -393,7 +421,7 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator tasks = [] sources = data.decompose("scc") - t_name = _task_name(pipeline, data) + t_name = self._task_name(pipeline, data) start = make_task(pipeline, sources[0]) # parallel tasks @@ -434,16 +462,18 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator start = make_task( pipeline, sources[0], - naming.normalize_identifier(_task_name(pipeline, sources[0])), + naming.normalize_identifier(self._task_name(pipeline, sources[0])), ) # parallel tasks for source in sources[1:]: # name pipeline the same as task - new_pipeline_name = naming.normalize_identifier(_task_name(pipeline, source)) + new_pipeline_name = naming.normalize_identifier( + self._task_name(pipeline, source) + ) tasks.append(make_task(pipeline, source, new_pipeline_name)) - t_name = _task_name(pipeline, data) + t_name = self._task_name(pipeline, data) end = DummyOperator(task_id=f"{t_name}_end") if tasks: @@ -468,25 +498,3 @@ def airflow_get_execution_dates() -> Tuple[pendulum.DateTime, Optional[pendulum. return context["data_interval_start"], context["data_interval_end"] except Exception: return None, None - - -def _task_name(pipeline: Pipeline, data: Any) -> str: - """Generate a task name. - - Args: - pipeline (Pipeline): The pipeline to run. - data (Any): The data to run the pipeline with. - - Returns: - str: The name of the task. 
- """ - task_name = pipeline.pipeline_name - - if isinstance(data, DltSource): - resource_names = list(data.selected_resources.keys()) - task_name = data.name + "_" + "-".join(resource_names[:4]) - - if len(resource_names) > 4: - task_name += f"-{len(resource_names)-4}-more" - - return task_name diff --git a/dlt/helpers/streamlit_app/__init__.py b/dlt/helpers/streamlit_app/__init__.py new file mode 100644 index 0000000000..b304195a5a --- /dev/null +++ b/dlt/helpers/streamlit_app/__init__.py @@ -0,0 +1,11 @@ +from dlt.common.exceptions import MissingDependencyException + +# FIXME: Remove this after implementing package installer +try: + import streamlit +except ModuleNotFoundError: + raise MissingDependencyException( + "DLT Streamlit Helpers", + ["streamlit"], + "DLT Helpers for Streamlit should be run within a streamlit app.", + ) diff --git a/dlt/helpers/streamlit_app/blocks/__init__.py b/dlt/helpers/streamlit_app/blocks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dlt/helpers/streamlit_app/blocks/load_info.py b/dlt/helpers/streamlit_app/blocks/load_info.py new file mode 100644 index 0000000000..134b5ad5a4 --- /dev/null +++ b/dlt/helpers/streamlit_app/blocks/load_info.py @@ -0,0 +1,40 @@ +import dlt +import humanize +import streamlit as st + +from dlt.common import pendulum +from dlt.helpers.streamlit_app.utils import query_data_live +from dlt.helpers.streamlit_app.widgets import stat + + +def last_load_info(pipeline: dlt.Pipeline) -> None: + loads_df = query_data_live( + pipeline, + f"SELECT load_id, inserted_at FROM {pipeline.default_schema.loads_table_name} WHERE" + " status = 0 ORDER BY inserted_at DESC LIMIT 101 ", + ) + + if loads_df is None: + st.error( + "Load info is not available", + icon="🚨", + ) + else: + loads_no = loads_df.shape[0] + if loads_df.shape[0] > 0: + rel_time = ( + humanize.naturaldelta( + pendulum.now() - pendulum.from_timestamp(loads_df.iloc[0, 1].timestamp()) + ) + + " ago" + ) + last_load_id = loads_df.iloc[0, 0] + if loads_no > 100: + loads_no = "> " + str(loads_no) + else: + rel_time = "---" + last_load_id = "---" + + stat("Last load time", rel_time, border_left_width=4) + stat("Last load id", last_load_id) + stat("Total number of loads", loads_no) diff --git a/dlt/helpers/streamlit_app/blocks/menu.py b/dlt/helpers/streamlit_app/blocks/menu.py new file mode 100644 index 0000000000..b6d0b5f7aa --- /dev/null +++ b/dlt/helpers/streamlit_app/blocks/menu.py @@ -0,0 +1,14 @@ +import dlt +import streamlit as st + +from dlt.helpers.streamlit_app.utils import HERE +from dlt.helpers.streamlit_app.widgets import logo, mode_selector +from dlt.helpers.streamlit_app.widgets import pipeline_summary + + +def menu(pipeline: dlt.Pipeline) -> None: + mode_selector() + logo() + st.page_link(f"{HERE}/pages/dashboard.py", label="Explore data", icon="🕹️") + st.page_link(f"{HERE}/pages/load_info.py", label="Load info", icon="💾") + pipeline_summary(pipeline) diff --git a/dlt/helpers/streamlit_app/blocks/query.py b/dlt/helpers/streamlit_app/blocks/query.py new file mode 100644 index 0000000000..a03e9a0cd9 --- /dev/null +++ b/dlt/helpers/streamlit_app/blocks/query.py @@ -0,0 +1,57 @@ +from typing import Optional +import dlt +import streamlit as st + +from dlt.common.exceptions import MissingDependencyException +from dlt.helpers.streamlit_app.utils import query_data + + +def maybe_run_query( + pipeline: dlt.Pipeline, + show_charts: bool = True, + example_query: Optional[str] = "", +) -> None: + st.subheader("Run your query") + sql_query = 
st.text_area("Enter your SQL query", value=example_query) + if st.button("Run Query"): + if sql_query: + try: + # run the query from the text area + df = query_data(pipeline, sql_query, chunk_size=2048) + if df is None: + st.text("No rows returned") + else: + rows_count = df.shape[0] + st.text(f"{rows_count} row(s) returned") + st.dataframe(df) + try: + # now if the dataset has supported shape try to display the bar or altair chart + if df.dtypes.shape[0] == 1 and show_charts: + # try barchart + st.bar_chart(df) + if df.dtypes.shape[0] == 2 and show_charts: + # try to import altair charts + try: + import altair as alt + except ModuleNotFoundError: + raise MissingDependencyException( + "DLT Streamlit Helpers", + ["altair"], + "DLT Helpers for Streamlit should be run within a streamlit" + " app.", + ) + + # try altair + bar_chart = ( + alt.Chart(df) + .mark_bar() + .encode( + x=f"{df.columns[1]}:Q", y=alt.Y(f"{df.columns[0]}:N", sort="-x") + ) + ) + st.altair_chart(bar_chart, use_container_width=True) + except Exception as ex: + st.error(f"Chart failed due to: {ex}") + except Exception as ex: + st.text("Exception when running query") + st.exception(ex) diff --git a/dlt/helpers/streamlit_app/blocks/resource_state.py b/dlt/helpers/streamlit_app/blocks/resource_state.py new file mode 100644 index 0000000000..8ea1256a1f --- /dev/null +++ b/dlt/helpers/streamlit_app/blocks/resource_state.py @@ -0,0 +1,29 @@ +import dlt +import streamlit as st +import yaml + +from dlt.common import json +from dlt.common.libs.pandas import pandas as pd +from dlt.common.pipeline import resource_state, TSourceState +from dlt.common.schema.utils import group_tables_by_resource +from dlt.helpers.streamlit_app.widgets.tags import tag + + +def resource_state_info( + pipeline: dlt.Pipeline, + schema_name: str, + resource_name: str, +) -> None: + sources_state = pipeline.state.get("sources") or {} + schema = sources_state.get(schema_name) + if not schema: + st.error(f"Schema with name: {schema_name} is not found") + return + + resource = schema["resources"].get(resource_name) + with st.expander("Resource state", expanded=(resource is None)): + if not resource: + st.info(f"{resource_name} is missing resource state") + else: + spec = yaml.safe_dump(resource) + st.code(spec, language="yaml") diff --git a/dlt/helpers/streamlit_app/blocks/show_data.py b/dlt/helpers/streamlit_app/blocks/show_data.py new file mode 100644 index 0000000000..7aaab084f3 --- /dev/null +++ b/dlt/helpers/streamlit_app/blocks/show_data.py @@ -0,0 +1,21 @@ +from typing import List + +import dlt +import streamlit as st + +from dlt.helpers.streamlit_app.utils import query_data + + +def show_data_button(pipeline: dlt.Pipeline, table_name: str) -> None: + if st.button("SHOW DATA", key=table_name): + df = query_data(pipeline, f"SELECT * FROM {table_name}", chunk_size=2048) + if df is None: + st.text("No rows returned") + else: + rows_count = df.shape[0] + if df.shape[0] < 2048: + st.text(f"All {rows_count} row(s)") + else: + st.text(f"Top {rows_count} row(s)") + + st.dataframe(df) diff --git a/dlt/helpers/streamlit_app/blocks/table_hints.py b/dlt/helpers/streamlit_app/blocks/table_hints.py new file mode 100644 index 0000000000..aefab952e5 --- /dev/null +++ b/dlt/helpers/streamlit_app/blocks/table_hints.py @@ -0,0 +1,80 @@ +from typing import Any, Dict, List + +import dlt +import streamlit as st + +from dlt.common.schema.typing import TTableSchema +from dlt.common.utils import flatten_list_or_items +from dlt.helpers.streamlit_app.blocks.resource_state import 
resource_state_info +from dlt.helpers.streamlit_app.blocks.show_data import show_data_button + + +def list_table_hints(pipeline: dlt.Pipeline, tables: List[TTableSchema]) -> None: + current_schema = st.session_state["schema"] or pipeline.default_schema + if st.session_state["schema"]: + current_schema = st.session_state["schema"] + + for table in tables: + table_hints: List[str] = [] + if "parent" in table: + table_hints.append("parent: **%s**" % table["parent"]) + + if "resource" in table: + table_hints.append("resource: **%s**" % table["resource"]) + + if "write_disposition" in table: + table_hints.append("write disposition: **%s**" % table["write_disposition"]) + + columns = table["columns"] + primary_keys: List[str] = list( + flatten_list_or_items( + [ + col_name + for col_name in columns.keys() + if not col_name.startswith("_") + and columns[col_name].get("primary_key") is not None + ] + ) + ) + if primary_keys: + table_hints.append("primary key(s): **%s**" % ", ".join(primary_keys)) + + merge_keys = list( + flatten_list_or_items( + [ + col_name + for col_name in columns.keys() + if not col_name.startswith("_") + and not columns[col_name].get("merge_key") is None # noqa: E714 + ] + ) + ) + + if merge_keys: + table_hints.append("merge key(s): **%s**" % ", ".join(merge_keys)) + + st.subheader(f"Table: {table['name']}", divider=True) + st.markdown(" | ".join(table_hints)) + if "resource" in table: + resource_state_info( + pipeline, + current_schema.name, + table["resource"], + ) + + # table schema contains various hints (like clustering or partition options) + # that we do not want to show in basic view + def essentials_f(c: Any) -> Dict[str, Any]: + essentials: Dict[str, Any] = {} + for k, v in c.items(): + if k in ["name", "data_type", "nullable"]: + essentials[k] = v + + return { + "name": essentials["name"], + "data_type": essentials["data_type"], + "nullable": essentials["nullable"], + } + + st.table(map(essentials_f, table["columns"].values())) + show_data_button(pipeline, table["name"]) diff --git a/dlt/helpers/streamlit_app/index.py b/dlt/helpers/streamlit_app/index.py new file mode 100644 index 0000000000..31fb470640 --- /dev/null +++ b/dlt/helpers/streamlit_app/index.py @@ -0,0 +1,6 @@ +import streamlit as st + +from dlt.helpers.streamlit_app.utils import HERE + +if __name__ == "__main__": + st.switch_page(f"{HERE}/pages/dashboard.py") diff --git a/dlt/helpers/streamlit_app/pages/__init__.py b/dlt/helpers/streamlit_app/pages/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dlt/helpers/streamlit_app/pages/dashboard.py b/dlt/helpers/streamlit_app/pages/dashboard.py new file mode 100644 index 0000000000..656dd6ecdf --- /dev/null +++ b/dlt/helpers/streamlit_app/pages/dashboard.py @@ -0,0 +1,53 @@ +import dlt +import streamlit as st + +from dlt.helpers.streamlit_app.blocks.query import maybe_run_query +from dlt.helpers.streamlit_app.blocks.table_hints import list_table_hints +from dlt.helpers.streamlit_app.blocks.menu import menu +from dlt.helpers.streamlit_app.utils import render_with_pipeline +from dlt.helpers.streamlit_app.widgets import schema_picker +from dlt.pipeline import Pipeline + + +def write_data_explorer_page( + pipeline: Pipeline, + schema_name: str = None, + example_query: str = "", + show_charts: bool = True, +) -> None: + """Writes Streamlit app page with a schema and live data preview. + + #### Args: + pipeline (Pipeline): Pipeline instance to use. + schema_name (str, optional): Name of the schema to display. 
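As an aside on the hint extraction in `list_table_hints` above: only columns whose names do not start with `_` are considered, and a column counts as a key when its `primary_key` or `merge_key` hint is set. A minimal sketch with a hand-written columns dict (the table layout is invented; the real code additionally runs the results through `flatten_list_or_items`):

```py
columns = {
    "id": {"name": "id", "data_type": "bigint", "nullable": False, "primary_key": True},
    "email": {"name": "email", "data_type": "text", "nullable": True, "merge_key": True},
    "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False, "unique": True},
}

primary_keys = [
    name
    for name, col in columns.items()
    if not name.startswith("_") and col.get("primary_key") is not None
]
merge_keys = [
    name
    for name, col in columns.items()
    if not name.startswith("_") and col.get("merge_key") is not None
]
print(primary_keys, merge_keys)  # ['id'] ['email']
```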
If None, default schema is used. + example_query (str, optional): Example query to be displayed in the SQL Query box. + show_charts (bool, optional): Should automatically show charts for the queries from SQL Query box. Defaults to True. + + Raises: + MissingDependencyException: Raised when a particular python dependency is not installed + """ + + st.subheader("Schemas and tables", divider="rainbow") + schema_picker(pipeline) + tables = sorted( + st.session_state["schema"].data_tables(), + key=lambda table: table["name"], + ) + + list_table_hints(pipeline, tables) + maybe_run_query( + pipeline, + show_charts=show_charts, + example_query=example_query, + ) + + +def show(pipeline: dlt.Pipeline) -> None: + with st.sidebar: + menu(pipeline) + + write_data_explorer_page(pipeline) + + +if __name__ == "__main__": + render_with_pipeline(show) diff --git a/dlt/helpers/streamlit_app/pages/load_info.py b/dlt/helpers/streamlit_app/pages/load_info.py new file mode 100644 index 0000000000..ee13cf2531 --- /dev/null +++ b/dlt/helpers/streamlit_app/pages/load_info.py @@ -0,0 +1,130 @@ +import dlt +import streamlit as st + +from dlt.common.configuration.exceptions import ConfigFieldMissingException +from dlt.common.destination.reference import WithStateSync +from dlt.helpers.streamlit_app.blocks.load_info import last_load_info +from dlt.helpers.streamlit_app.blocks.menu import menu +from dlt.helpers.streamlit_app.widgets import stat +from dlt.helpers.streamlit_app.utils import ( + query_data, + query_data_live, + render_with_pipeline, +) +from dlt.pipeline import Pipeline +from dlt.pipeline.exceptions import CannotRestorePipelineException +from dlt.pipeline.state_sync import load_pipeline_state_from_destination + + +def write_load_status_page(pipeline: Pipeline) -> None: + """Display pipeline loading information.""" + + try: + loads_df = query_data_live( + pipeline, + f"SELECT load_id, inserted_at FROM {pipeline.default_schema.loads_table_name} WHERE" + " status = 0 ORDER BY inserted_at DESC LIMIT 101 ", + ) + + if loads_df is not None: + selected_load_id = st.selectbox("Select load id", loads_df) + schema = pipeline.default_schema + + st.markdown("**Number of loaded rows:**") + + # construct a union query + query_parts = [] + for table in schema.data_tables(): + if "parent" in table: + continue + table_name = table["name"] + query_parts.append( + f"SELECT '{table_name}' as table_name, COUNT(1) As rows_count FROM" + f" {table_name} WHERE _dlt_load_id = '{selected_load_id}'" + ) + query_parts.append("UNION ALL") + + query_parts.pop() + rows_counts_df = query_data(pipeline, "\n".join(query_parts)) + + st.markdown(f"Rows loaded in **{selected_load_id}**") + st.dataframe(rows_counts_df) + + st.markdown("**Last 100 loads**") + st.dataframe(loads_df) + + st.subheader("Schema updates", divider=True) + schemas_df = query_data_live( + pipeline, + "SELECT schema_name, inserted_at, version, version_hash FROM" + f" {pipeline.default_schema.version_table_name} ORDER BY inserted_at DESC LIMIT" + " 101 ", + ) + st.markdown("**100 recent schema updates**") + st.dataframe(schemas_df) + except CannotRestorePipelineException as restore_ex: + st.error("Seems like the pipeline does not exist. Did you run it at least once?") + st.exception(restore_ex) + + except ConfigFieldMissingException as cf_ex: + st.error( + "Pipeline credentials/configuration is missing. This most often happen when you run the" + " streamlit app from different folder than the `.dlt` with `toml` files resides." 
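For context, the row-count block in `write_load_status_page` above stitches one `UNION ALL` statement over all root tables of the default schema. A sketch of what gets generated for two hypothetical tables and an invented load id:

```py
# table names and load id are made up; child tables (those with a "parent" hint) are skipped
query_parts = []
for table_name in ["customers", "orders"]:
    query_parts.append(
        f"SELECT '{table_name}' as table_name, COUNT(1) As rows_count FROM"
        f" {table_name} WHERE _dlt_load_id = '1234'"
    )
    query_parts.append("UNION ALL")
query_parts.pop()  # drop the trailing UNION ALL
print("\n".join(query_parts))
```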
+ ) + st.text(str(cf_ex)) + + except Exception as ex: + st.error("Pipeline info could not be prepared. Did you load the data at least once?") + st.exception(ex) + + +def show_state_versions(pipeline: dlt.Pipeline) -> None: + st.subheader("State info", divider=True) + remote_state = None + with pipeline.destination_client() as client: + if isinstance(client, WithStateSync): + remote_state = load_pipeline_state_from_destination(pipeline.pipeline_name, client) + + local_state = pipeline.state + + remote_state_version = "---" + if remote_state: + remote_state_version = str(remote_state["_state_version"]) + + col1, col2 = st.columns(2) + with col1: + stat( + label="Local version", + value=local_state["_state_version"], + display="block", + border_left_width=4, + ) + + with col2: + stat( + label="Remote version", + value=remote_state_version, + display="block", + border_left_width=4, + ) + + if remote_state_version != str(local_state["_state_version"]): + st.text("") + st.warning( + "Looks like that local state is not yet synchronized or synchronization is disabled", + icon="⚠️", + ) + + +def show(pipeline: dlt.Pipeline) -> None: + st.subheader("Load info", divider="rainbow") + last_load_info(pipeline) + write_load_status_page(pipeline) + show_state_versions(pipeline) + + with st.sidebar: + menu(pipeline) + + +if __name__ == "__main__": + render_with_pipeline(show) diff --git a/dlt/helpers/streamlit_app/theme.py b/dlt/helpers/streamlit_app/theme.py new file mode 100644 index 0000000000..3b6b600a73 --- /dev/null +++ b/dlt/helpers/streamlit_app/theme.py @@ -0,0 +1,29 @@ +import streamlit as st + + +def dark_theme() -> None: + st.config.set_option("theme.base", "dark") + st.config.set_option("theme.primaryColor", "#191937") + + # Main background + st.config.set_option("theme.backgroundColor", "#4C4898") + + # Sidebar + st.config.set_option("theme.secondaryBackgroundColor", "#191937") + + # Text + st.config.set_option("theme.textColor", "#FEFEFA") + + +def light_theme() -> None: + st.config.set_option("theme.base", "light") + st.config.set_option("theme.primaryColor", "#333") + + # Main background + st.config.set_option("theme.backgroundColor", "#FEFEFE") + + # Sidebar + st.config.set_option("theme.secondaryBackgroundColor", "#ededed") + + # Text + st.config.set_option("theme.textColor", "#333") diff --git a/dlt/helpers/streamlit_app/utils.py b/dlt/helpers/streamlit_app/utils.py new file mode 100644 index 0000000000..6b2dab495c --- /dev/null +++ b/dlt/helpers/streamlit_app/utils.py @@ -0,0 +1,77 @@ +import argparse +import os + +from pathlib import Path +from typing import Optional, Callable, Tuple + +import dlt +import pandas as pd +import streamlit as st + +from dlt.cli import echo as fmt +from dlt.pipeline.exceptions import SqlClientNotAvailable + +HERE = Path(__file__).absolute().parent + + +def parse_args() -> Tuple[str, str]: + parser = argparse.ArgumentParser() + parser.add_argument("pipeline_name", nargs=1) + parser.add_argument( + "--pipelines-dir", + help="Pipelines working directory", + default=None, + ) + known_options, _ = parser.parse_known_args() + return known_options.pipeline_name[0], known_options.pipelines_dir + + +def render_with_pipeline(render_func: Callable[..., None]) -> None: + pipeline_name, pipelines_dir = parse_args() + if test_pipeline_name := os.getenv("DLT_TEST_PIPELINE_NAME"): + fmt.echo(f"RUNNING TEST PIPELINE: {test_pipeline_name}") + pipeline_name = test_pipeline_name + + st.session_state["pipeline_name"] = pipeline_name + # use pipelines dir from env var or try to 
resolve it using get_dlt_pipelines_dir + pipeline = dlt.attach(pipeline_name, pipelines_dir=pipelines_dir) + render_func(pipeline) + + +def query_using_cache( + pipeline: dlt.Pipeline, ttl: int +) -> Callable[..., Optional[pd.DataFrame]]: + @st.cache_data(ttl=ttl) + def do_query( # type: ignore[return] + query: str, + schema_name: str = None, + chunk_size: Optional[int] = None, + ) -> Optional[pd.DataFrame]: + try: + with pipeline.sql_client(schema_name) as client: + with client.execute_query(query) as curr: + return curr.df(chunk_size=chunk_size) + except SqlClientNotAvailable: + st.error("🚨 Cannot load data - SqlClient not available") + + return do_query # type: ignore + + +def query_data( + pipeline: dlt.Pipeline, + query: str, + schema_name: str = None, + chunk_size: Optional[int] = None, +) -> pd.DataFrame: + query_maker = query_using_cache(pipeline, ttl=600) + return query_maker(query, schema_name, chunk_size=chunk_size) + + +def query_data_live( + pipeline: dlt.Pipeline, + query: str, + schema_name: str = None, + chunk_size: Optional[int] = None, +) -> pd.DataFrame: + query_maker = query_using_cache(pipeline, ttl=5) + return query_maker(query, schema_name, chunk_size=chunk_size) diff --git a/dlt/helpers/streamlit_app/widgets/__init__.py b/dlt/helpers/streamlit_app/widgets/__init__.py new file mode 100644 index 0000000000..349d58166e --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/__init__.py @@ -0,0 +1,6 @@ +from dlt.helpers.streamlit_app.widgets.logo import logo +from dlt.helpers.streamlit_app.widgets.stats import stat +from dlt.helpers.streamlit_app.widgets.summary import pipeline_summary +from dlt.helpers.streamlit_app.widgets.tags import tag +from dlt.helpers.streamlit_app.widgets.schema import schema_picker +from dlt.helpers.streamlit_app.widgets.color_mode_selector import mode_selector diff --git a/dlt/helpers/streamlit_app/widgets/color_mode_selector.py b/dlt/helpers/streamlit_app/widgets/color_mode_selector.py new file mode 100644 index 0000000000..fba3231a34 --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/color_mode_selector.py @@ -0,0 +1,34 @@ +import streamlit as st + +from typing_extensions import Callable, Literal + +from dlt.helpers.streamlit_app.theme import dark_theme, light_theme + +ColorMode = Literal["light", "dark"] + + +def set_color_mode(mode: ColorMode) -> Callable[..., None]: + def set_mode() -> None: + st.session_state["color_mode"] = mode + if mode and mode == "dark": + dark_theme() + else: + light_theme() + + return set_mode + + +def mode_selector() -> None: + columns = st.columns(10) + light = columns[3] + dark = columns[5] + + # Set default theme to light if it wasn't set before + if not st.session_state.get("color_mode"): + st.session_state["color_mode"] = "light" + st.config.set_option("theme.base", "light") + + with light: + st.button("☀️", on_click=set_color_mode("light")) + with dark: + st.button("🌚", on_click=set_color_mode("dark")) diff --git a/dlt/helpers/streamlit_app/widgets/logo.py b/dlt/helpers/streamlit_app/widgets/logo.py new file mode 100644 index 0000000000..41a5afff44 --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/logo.py @@ -0,0 +1,46 @@ +import streamlit as st + + +def logo() -> None: + logo_text = """ + + """ + styles = """ + + """ + + st.markdown(logo_text + styles, unsafe_allow_html=True) diff --git a/dlt/helpers/streamlit_app/widgets/schema.py b/dlt/helpers/streamlit_app/widgets/schema.py new file mode 100644 index 0000000000..f7883bc45e --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/schema.py @@ -0,0 
+1,21 @@ +import dlt +import streamlit as st + + +def schema_picker(pipeline: dlt.Pipeline) -> None: + schema = None + num_schemas = len(pipeline.schema_names) + if num_schemas == 1: + schema_name = pipeline.schema_names[0] + schema = pipeline.schemas.get(schema_name) + elif num_schemas > 1: + text = "Select schema" + selected_schema_name = st.selectbox( + text, + sorted(pipeline.schema_names), + ) + schema = pipeline.schemas.get(selected_schema_name) + + if schema: + st.subheader(f"Schema: {schema.name}") + st.session_state["schema"] = schema diff --git a/dlt/helpers/streamlit_app/widgets/stats.py b/dlt/helpers/streamlit_app/widgets/stats.py new file mode 100644 index 0000000000..d0fded508b --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/stats.py @@ -0,0 +1,58 @@ +from typing import Any, Optional +import streamlit as st + + +def stat( + label: str, + value: Any, + width: Optional[str] = "100%", + display: Optional[str] = "inline-block", + background_color: Optional[str] = "#0e1111", + border_radius: Optional[int] = 4, + border_color: Optional[str] = "#272736", + border_left_color: Optional[str] = "#007b05", + border_left_width: Optional[int] = 0, +) -> None: + stat_html = f""" +
    <div class="stat">
+        <p class="stat-label">
+            {label}
+        </p>
+        <p class="stat-value">
+            {value}
+        </p>
+    </div>
+ """ + mode = st.session_state.get("color_mode", "dark") + if mode == "light": + background_color = "#FEFEFE" + border_left_color = "#333333" + + styles = """ + .stat { + display: %s; + width: %s; + border-radius: %dpx; + border: 1px solid %s; + background-color: %s; + padding: 2%% 2%% 1%% 5%%; + margin-bottom: 2%%; + } + .stat-label { + font-size: 14px; + margin-bottom: 5px; + } + .stat-value { + font-size: 32px; + margin-bottom: 0; + } + %s + """ % (display, width, border_radius, border_color, background_color, "") + + if border_left_width > 1: + styles += """ + .stat { + border-left: %dpx solid %s !important; + } + """ % (border_left_width, border_left_color) + + st.markdown( + stat_html + f"", + unsafe_allow_html=True, + ) diff --git a/dlt/helpers/streamlit_app/widgets/summary.py b/dlt/helpers/streamlit_app/widgets/summary.py new file mode 100644 index 0000000000..afbefbe608 --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/summary.py @@ -0,0 +1,21 @@ +import dlt +import streamlit as st +from dlt.pipeline.exceptions import SqlClientNotAvailable + + +def pipeline_summary(pipeline: dlt.Pipeline) -> None: + try: + credentials = pipeline.sql_client().credentials + except SqlClientNotAvailable: + credentials = "---" + st.error("🚨 Cannot load data - SqlClient not available") + + schema_names = ", ".join(sorted(pipeline.schema_names)) + st.subheader("Pipeline info", divider=True) + st.markdown(f""" + * pipeline name: **{pipeline.pipeline_name}** + * destination: **{str(credentials)}** in **{pipeline.destination.destination_description}** + * dataset name: **{pipeline.dataset_name}** + * default schema name: **{pipeline.default_schema_name}** + * all schema names: **{schema_names}** + """) diff --git a/dlt/helpers/streamlit_app/widgets/tags.py b/dlt/helpers/streamlit_app/widgets/tags.py new file mode 100644 index 0000000000..a591e50efe --- /dev/null +++ b/dlt/helpers/streamlit_app/widgets/tags.py @@ -0,0 +1,41 @@ +from typing import Optional, Literal + +import streamlit as st + +TagType = Literal["info", "success", "warning", "error", "mute"] + + +def tag( + tag_name: str, + label: Optional[str] = None, + border_radius: Optional[int] = 4, + bold: Optional[bool] = False, + tag_type: Optional[TagType] = "mute", +) -> None: + tag_html = f""" + {str(label)+": " if label else ""}{tag_name} + """ + kinds = { + "mute": {"text_color": "#495057", "bg_color": "#e9ecef"}, + "info": {"text_color": "#1864ab", "bg_color": "#4dabf7"}, + "success": {"text_color": "#2b8a3e", "bg_color": "#8ce99a"}, + "warning": {"text_color": "#d9480f", "bg_color": "#ffa94d"}, + "error": {"text_color": "#c92a2a", "bg_color": "#ffe3e3"}, + } + kind = kinds[tag_type] + bg_color = kind["bg_color"] + text_color = kind["text_color"] + + styles = """ + + """ % (border_radius, bg_color, text_color, "600" if bold else "normal") + + st.markdown(tag_html + styles, unsafe_allow_html=True) diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py deleted file mode 100644 index f6b2f3a62f..0000000000 --- a/dlt/helpers/streamlit_helper.py +++ /dev/null @@ -1,386 +0,0 @@ -import sys -from typing import Dict, List, Iterator -import humanize - -from dlt.common import pendulum -from dlt.common.typing import AnyFun -from dlt.common.configuration.exceptions import ConfigFieldMissingException -from dlt.common.exceptions import MissingDependencyException -from dlt.common.destination.reference import WithStateSync -from dlt.common.utils import flatten_list_or_items - -from dlt.common.libs.pandas import pandas -from 
dlt.pipeline import Pipeline -from dlt.pipeline.exceptions import CannotRestorePipelineException, SqlClientNotAvailable -from dlt.pipeline.state_sync import load_state_from_destination - -try: - import streamlit as st - - # from streamlit import SECRETS_FILE_LOC, secrets -except ModuleNotFoundError: - raise MissingDependencyException( - "DLT Streamlit Helpers", - ["streamlit"], - "DLT Helpers for Streamlit should be run within a streamlit app.", - ) - - -# use right caching function to disable deprecation message -if hasattr(st, "cache_data"): - cache_data = st.cache_data -else: - cache_data = st.experimental_memo - -# def restore_pipeline() -> Pipeline: -# """Restores Pipeline instance and associated credentials from Streamlit secrets - -# Current implementation requires that pipeline working dir is available at the location saved in secrets. - -# Raises: -# PipelineBackupNotFound: Raised when pipeline backup is not available -# CannotRestorePipelineException: Raised when pipeline working dir is not found or invalid - -# Returns: -# Pipeline: Instance of pipeline with attached credentials -# """ -# if "dlt" not in secrets: -# raise PipelineException("You must backup pipeline to Streamlit first") -# dlt_cfg = secrets["dlt"] -# credentials = deepcopy(dict(dlt_cfg["destination"])) -# if "default_schema_name" in credentials: -# del credentials["default_schema_name"] -# credentials.update(dlt_cfg["credentials"]) -# pipeline = Pipeline(dlt_cfg["pipeline_name"]) -# pipeline.restore_pipeline(credentials_from_dict(credentials), dlt_cfg["working_dir"]) -# return pipeline - - -# def backup_pipeline(pipeline: Pipeline) -> None: -# """Backups pipeline state to the `secrets.toml` of the Streamlit app. - -# Pipeline credentials and working directory will be added to the Streamlit `secrets` file. This allows to access query the data loaded to the destination and -# access definitions of the inferred schemas. See `restore_pipeline` and `write_data_explorer_page` functions in the same module. - -# Args: -# pipeline (Pipeline): Pipeline instance, typically restored with `restore_pipeline` -# """ -# # save pipeline state to project .config -# # config_file_name = file_util.get_project_streamlit_file_path("config.toml") - -# # save credentials to secrets -# if os.path.isfile(SECRETS_FILE_LOC): -# with open(SECRETS_FILE_LOC, "r", encoding="utf-8") as f: -# # use whitespace preserving parser -# secrets_ = tomlkit.load(f) -# else: -# secrets_ = tomlkit.document() - -# # save general settings -# secrets_["dlt"] = { -# "working_dir": pipeline.working_dir, -# "pipeline_name": pipeline.pipeline_name -# } - -# # get client config -# # TODO: pipeline api v2 should provide a direct method to get configurations -# CONFIG: BaseConfiguration = pipeline._loader_instance.load_client_cls.CONFIG # type: ignore -# CREDENTIALS: CredentialsConfiguration = pipeline._loader_instance.load_client_cls.CREDENTIALS # type: ignore - -# # save client config -# # print(dict_remove_nones_in_place(CONFIG.as_dict(lowercase=False))) -# dlt_c = cast(TomlContainer, secrets_["dlt"]) -# dlt_c["destination"] = dict_remove_nones_in_place(dict(CONFIG)) -# dlt_c["credentials"] = dict_remove_nones_in_place(dict(CREDENTIALS)) - -# with open(SECRETS_FILE_LOC, "w", encoding="utf-8") as f: -# # use whitespace preserving parser -# tomlkit.dump(secrets_, f) - - -def write_load_status_page(pipeline: Pipeline) -> None: - """Display pipeline loading information. 
Will be moved to dlt package once tested""" - - @cache_data(ttl=600) - def _query_data(query: str, schema_name: str = None) -> pandas.DataFrame: - try: - with pipeline.sql_client(schema_name) as client: - with client.execute_query(query) as curr: - return curr.df() - except SqlClientNotAvailable: - st.error("Cannot load data - SqlClient not available") - - @cache_data(ttl=5) - def _query_data_live(query: str, schema_name: str = None) -> pandas.DataFrame: - try: - with pipeline.sql_client(schema_name) as client: - with client.execute_query(query) as curr: - return curr.df() - except SqlClientNotAvailable: - st.error("Cannot load data - SqlClient not available") - - try: - st.header("Pipeline info") - credentials = pipeline.sql_client().credentials - schema_names = ", ".join(sorted(pipeline.schema_names)) - st.markdown(f""" - * pipeline name: **{pipeline.pipeline_name}** - * destination: **{str(credentials)}** in **{pipeline.destination.destination_description}** - * dataset name: **{pipeline.dataset_name}** - * default schema name: **{pipeline.default_schema_name}** - * all schema names: **{schema_names}** - """) - - st.header("Last load info") - col1, col2, col3 = st.columns(3) - loads_df = _query_data_live( - f"SELECT load_id, inserted_at FROM {pipeline.default_schema.loads_table_name} WHERE" - " status = 0 ORDER BY inserted_at DESC LIMIT 101 " - ) - loads_no = loads_df.shape[0] - if loads_df.shape[0] > 0: - rel_time = ( - humanize.naturaldelta( - pendulum.now() - pendulum.from_timestamp(loads_df.iloc[0, 1].timestamp()) - ) - + " ago" - ) - last_load_id = loads_df.iloc[0, 0] - if loads_no > 100: - loads_no = "> " + str(loads_no) - else: - rel_time = "---" - last_load_id = "---" - col1.metric("Last load time", rel_time) - col2.metric("Last load id", last_load_id) - col3.metric("Total number of loads", loads_no) - - st.markdown("**Number of loaded rows:**") - selected_load_id = st.selectbox("Select load id", loads_df) - schema = pipeline.default_schema - - # construct a union query - query_parts = [] - for table in schema.data_tables(): - if "parent" in table: - continue - table_name = table["name"] - query_parts.append( - f"SELECT '{table_name}' as table_name, COUNT(1) As rows_count FROM" - f" {table_name} WHERE _dlt_load_id = '{selected_load_id}'" - ) - query_parts.append("UNION ALL") - query_parts.pop() - rows_counts_df = _query_data("\n".join(query_parts)) - - st.markdown(f"Rows loaded in **{selected_load_id}**") - st.dataframe(rows_counts_df) - - st.markdown("**Last 100 loads**") - st.dataframe(loads_df) - - st.header("Schema updates") - schemas_df = _query_data_live( - "SELECT schema_name, inserted_at, version, version_hash FROM" - f" {pipeline.default_schema.version_table_name} ORDER BY inserted_at DESC LIMIT 101 " - ) - st.markdown("**100 recent schema updates**") - st.dataframe(schemas_df) - - st.header("Pipeline state info") - with pipeline.destination_client() as client: - if isinstance(client, WithStateSync): - remote_state = load_state_from_destination(pipeline.pipeline_name, client) - local_state = pipeline.state - - col1, col2 = st.columns(2) - if remote_state: - remote_state_version = remote_state["_state_version"] - else: - remote_state_version = "---" # type: ignore - - col1.metric("Local state version", local_state["_state_version"]) - col2.metric("Remote state version", remote_state_version) - - if remote_state_version != local_state["_state_version"]: - st.warning( - "Looks like that local state is not yet synchronized or synchronization is disabled" - ) - - except 
CannotRestorePipelineException as restore_ex: - st.error("Seems like the pipeline does not exist. Did you run it at least once?") - st.exception(restore_ex) - - except ConfigFieldMissingException as cf_ex: - st.error( - "Pipeline credentials/configuration is missing. This most often happen when you run the" - " streamlit app from different folder than the `.dlt` with `toml` files resides." - ) - st.text(str(cf_ex)) - - except Exception as ex: - st.error("Pipeline info could not be prepared. Did you load the data at least once?") - st.exception(ex) - - -def write_data_explorer_page( - pipeline: Pipeline, - schema_name: str = None, - show_dlt_tables: bool = False, - example_query: str = "", - show_charts: bool = True, -) -> None: - """Writes Streamlit app page with a schema and live data preview. - - #### Args: - pipeline (Pipeline): Pipeline instance to use. - schema_name (str, optional): Name of the schema to display. If None, default schema is used. - show_dlt_tables (bool, optional): Should show dlt internal tables. Defaults to False. - example_query (str, optional): Example query to be displayed in the SQL Query box. - show_charts (bool, optional): Should automatically show charts for the queries from SQL Query box. Defaults to True. - - Raises: - MissingDependencyException: Raised when a particular python dependency is not installed - """ - - @cache_data(ttl=60) - def _query_data(query: str, chunk_size: int = None) -> pandas.DataFrame: - try: - with pipeline.sql_client(schema_name) as client: - with client.execute_query(query) as curr: - return curr.df(chunk_size=chunk_size) - except SqlClientNotAvailable: - st.error("Cannot load data - SqlClient not available") - - st.header("Schemas and their tables") - - num_schemas = len(pipeline.schema_names) - if num_schemas == 1: - schema_name = pipeline.schema_names[0] - selected_schema = pipeline.schemas.get(schema_name) - st.subheader(f"Schema: {schema_name}") - elif num_schemas > 1: - st.subheader("Schema:") - text = "Pick a schema name to see all its tables below" - selected_schema_name = st.selectbox(text, sorted(pipeline.schema_names)) - selected_schema = pipeline.schemas.get(selected_schema_name) - - for table in sorted(selected_schema.data_tables(), key=lambda table: table["name"]): - table_name = table["name"] - st.subheader(f"Table: {table_name}") - if "description" in table: - st.text(table["description"]) - table_hints: List[str] = [] - if "parent" in table: - table_hints.append("parent: **%s**" % table["parent"]) - if "resource" in table: - table_hints.append("resource: **%s**" % table["resource"]) - if "write_disposition" in table: - table_hints.append("write disposition: **%s**" % table["write_disposition"]) - columns = table["columns"] - primary_keys: Iterator[str] = flatten_list_or_items( - [ - col_name - for col_name in columns.keys() - if not col_name.startswith("_") and not columns[col_name].get("primary_key") is None - ] - ) - table_hints.append("primary key(s): **%s**" % ", ".join(primary_keys)) - merge_keys = flatten_list_or_items( - [ - col_name - for col_name in columns.keys() - if not col_name.startswith("_") and not columns[col_name].get("merge_key") is None - ] - ) - table_hints.append("merge key(s): **%s**" % ", ".join(merge_keys)) - - st.markdown(" | ".join(table_hints)) - - # table schema contains various hints (like clustering or partition options) that we do not want to show in basic view - essentials_f = lambda c: { - k: v for k, v in c.items() if k in ["name", "data_type", "nullable"] - } - - 
st.table(map(essentials_f, table["columns"].values())) - # add a button that when pressed will show the full content of a table - if st.button("SHOW DATA", key=table_name): - df = _query_data(f"SELECT * FROM {table_name}", chunk_size=2048) - if df is None: - st.text("No rows returned") - else: - rows_count = df.shape[0] - if df.shape[0] < 2048: - st.text(f"All {rows_count} row(s)") - else: - st.text(f"Top {rows_count} row(s)") - st.dataframe(df) - - st.header("Run your query") - sql_query = st.text_area("Enter your SQL query", value=example_query) - if st.button("Run Query"): - if sql_query: - try: - # run the query from the text area - df = _query_data(sql_query) - if df is None: - st.text("No rows returned") - else: - rows_count = df.shape[0] - st.text(f"{rows_count} row(s) returned") - st.dataframe(df) - try: - # now if the dataset has supported shape try to display the bar or altair chart - if df.dtypes.shape[0] == 1 and show_charts: - # try barchart - st.bar_chart(df) - if df.dtypes.shape[0] == 2 and show_charts: - # try to import altair charts - try: - import altair as alt - except ModuleNotFoundError: - raise MissingDependencyException( - "DLT Streamlit Helpers", - ["altair"], - "DLT Helpers for Streamlit should be run within a streamlit" - " app.", - ) - - # try altair - bar_chart = ( - alt.Chart(df) - .mark_bar() - .encode( - x=f"{df.columns[1]}:Q", y=alt.Y(f"{df.columns[0]}:N", sort="-x") - ) - ) - st.altair_chart(bar_chart, use_container_width=True) - except Exception as ex: - st.error(f"Chart failed due to: {ex}") - except Exception as ex: - st.text("Exception when running query") - st.exception(ex) - - -def display(pipeline_name: str) -> None: - import dlt - - pipeline = dlt.attach(pipeline_name) - - pages: Dict[str, AnyFun] = { - "Explore data": write_data_explorer_page, - "Load info": write_load_status_page, - } - - st.title(f"Show {pipeline_name} pipeline") - - st.sidebar.title("Navigation") - selection = st.sidebar.radio("Go to", list(pages.keys())) - page = pages[selection] - - with st.spinner("Loading Page ..."): - page(pipeline) - - -if __name__ == "__main__": - display(sys.argv[1]) diff --git a/dlt/load/load.py b/dlt/load/load.py index 050e7bce67..23c3dea820 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -7,10 +7,17 @@ from dlt.common import sleep, logger from dlt.common.configuration import with_config, known_sections +from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.accessors import config -from dlt.common.pipeline import LoadInfo, LoadMetrics, SupportsPipeline, WithStepInfo -from dlt.common.schema.utils import get_top_level_table +from dlt.common.pipeline import ( + LoadInfo, + LoadMetrics, + SupportsPipeline, + WithStepInfo, +) +from dlt.common.schema.utils import get_child_tables, get_top_level_table from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState +from dlt.common.storages.load_package import LoadPackageStateInjectableContext from dlt.common.runners import TRunMetrics, Runnable, workermethod, NullExecutor from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.runtime.logger import pretty_format_exception @@ -19,7 +26,10 @@ DestinationTerminalException, DestinationTransientException, ) +from dlt.common.configuration.container import Container + from dlt.common.schema import Schema, TSchemaTables + from dlt.common.storages import LoadStorage from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, @@ -34,6 +44,7 @@ 
SupportsStagingDestination, TDestination, ) +from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.destinations.job_impl import EmptyLoadJob @@ -414,7 +425,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: failed_job.job_file_info.job_id(), failed_job.failed_message, ) - # possibly raise on too many retires + # possibly raise on too many retries if self.config.raise_on_max_retries: for new_job in package_info.jobs["new_jobs"]: r_c = new_job.job_file_info.retry_count @@ -452,12 +463,19 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: schema = self.load_storage.normalized_packages.load_schema(load_id) logger.info(f"Loaded schema name {schema.name} and version {schema.stored_version}") + container = Container() # get top load id and mark as being processed with self.collector(f"Load {schema.name} in {load_id}"): - # the same load id may be processed across multiple runs - if not self.current_load_id: - self._step_info_start_load_id(load_id) - self.load_single_package(load_id, schema) + with container.injectable_context( + LoadPackageStateInjectableContext( + storage=self.load_storage.normalized_packages, + load_id=load_id, + ) + ): + # the same load id may be processed across multiple runs + if not self.current_load_id: + self._step_info_start_load_id(load_id) + self.load_single_package(load_id, schema) return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 56d38a5a64..8565a5d2b2 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -74,7 +74,7 @@ def _filter_columns( return row def _normalize_chunk( - self, root_table_name: str, items: List[TDataItem], may_have_pua: bool + self, root_table_name: str, items: List[TDataItem], may_have_pua: bool, skip_write: bool ) -> TSchemaUpdate: column_schemas = self._column_schemas schema_update: TSchemaUpdate = {} @@ -172,9 +172,11 @@ def _normalize_chunk( # store row # TODO: store all rows for particular items all together after item is fully completed # will be useful if we implement bad data sending to a table - self.load_storage.write_data_item( - self.load_id, schema_name, table_name, row, columns - ) + # we skip write when discovering schema for empty file + if not skip_write: + self.load_storage.write_data_item( + self.load_id, schema_name, table_name, row, columns + ) except StopIteration: pass signals.raise_if_signalled() @@ -193,22 +195,31 @@ def __call__( line: bytes = None for line_no, line in enumerate(f): items: List[TDataItem] = json.loadb(line) - partial_update = self._normalize_chunk(root_table_name, items, may_have_pua(line)) + partial_update = self._normalize_chunk( + root_table_name, items, may_have_pua(line), skip_write=False + ) schema_updates.append(partial_update) logger.debug(f"Processed {line_no} lines from file {extracted_items_file}") if line is None and root_table_name in self.schema.tables: - # write only if table seen data before + # TODO: we should push the truncate jobs via package state + # not as empty jobs. empty jobs should be reserved for + # materializing schemas and other edge cases ie. 
empty parquet files root_table = self.schema.tables[root_table_name] - if has_table_seen_data(root_table): - self.load_storage.write_empty_items_file( - self.load_id, - self.schema.name, - root_table_name, - self.schema.get_table_columns(root_table_name), - ) - logger.debug( - f"No lines in file {extracted_items_file}, written empty load job file" + if not has_table_seen_data(root_table): + # if this is a new table, add normalizer columns + partial_update = self._normalize_chunk( + root_table_name, [{}], False, skip_write=True ) + schema_updates.append(partial_update) + self.load_storage.write_empty_items_file( + self.load_id, + self.schema.name, + root_table_name, + self.schema.get_table_columns(root_table_name), + ) + logger.debug( + f"No lines in file {extracted_items_file}, written empty load job file" + ) return schema_updates diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index c5762af680..18f8faaa25 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -304,7 +304,7 @@ def spool_files( # drop evolve once for all tables that seen data x_normalizer.pop("evolve-columns-once", None) # mark that table have seen data only if there was data - if table_metrics[table_name].items_count > 0 and "seen-data" not in x_normalizer: + if "seen-data" not in x_normalizer: logger.info( f"Table {table_name} has seen data for a first time with load id {load_id}" ) diff --git a/dlt/pipeline/current.py b/dlt/pipeline/current.py index 7fdc0f095c..25fd398623 100644 --- a/dlt/pipeline/current.py +++ b/dlt/pipeline/current.py @@ -2,6 +2,13 @@ from dlt.common.pipeline import source_state as _state, resource_state, get_current_pipe_name from dlt.pipeline import pipeline as _pipeline +from dlt.extract.decorators import get_source_schema +from dlt.common.storages.load_package import ( + load_package, + commit_load_package_state, + destination_state, + clear_destination_state, +) from dlt.extract.decorators import get_source_schema, get_source pipeline = _pipeline diff --git a/dlt/pipeline/exceptions.py b/dlt/pipeline/exceptions.py index ac203d95a0..d3538a8377 100644 --- a/dlt/pipeline/exceptions.py +++ b/dlt/pipeline/exceptions.py @@ -90,7 +90,9 @@ def __init__( super().__init__( pipeline_name, f"No engine upgrade path for state in pipeline {pipeline_name} from {init_engine} to" - f" {to_engine}, stopped at {from_engine}", + f" {to_engine}, stopped at {from_engine}. 
You possibly tried to run an older dlt" + " version against a destination you have previously loaded data to with a newer dlt" + " version.", ) diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 0aba0e19ae..3956d9bbe2 100644 --- a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -1,2 +1,7 @@ """Module with mark functions that make data to be specially processed""" -from dlt.extract import with_table_name, with_hints, make_hints +from dlt.extract import ( + with_table_name, + with_hints, + make_hints, + materialize_schema_item as materialize_table_schema, +) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 185a11962a..d1d558b3b8 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -47,7 +47,7 @@ ) from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import DictStrStr, TFun, TSecretValue, is_optional_type +from dlt.common.typing import DictStrAny, TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner from dlt.common.storages import ( LiveSchemaStorage, @@ -126,15 +126,17 @@ ) from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.state_sync import ( - STATE_ENGINE_VERSION, - bump_version_if_modified, - load_state_from_destination, - migrate_state, + PIPELINE_STATE_ENGINE_VERSION, + bump_pipeline_state_version_if_modified, + load_pipeline_state_from_destination, + migrate_pipeline_state, state_resource, json_encode_state, json_decode_state, + default_pipeline_state, ) from dlt.pipeline.warnings import credentials_argument_deprecated +from dlt.common.storages.load_package import TLoadPackageState def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]: @@ -143,6 +145,7 @@ def decorator(f: TFun) -> TFun: def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # activate pipeline so right state is always provided self.activate() + # backup and restore state should_extract_state = may_extract_state and self.config.restore_from_destination with self.managed_state(extract_state=should_extract_state) as state: @@ -263,7 +266,14 @@ class Pipeline(SupportsPipeline): STATE_FILE: ClassVar[str] = "state.json" STATE_PROPS: ClassVar[List[str]] = list( set(get_type_hints(TPipelineState).keys()) - - {"sources", "destination_type", "destination_name", "staging_type", "staging_name"} + - { + "sources", + "destination_type", + "destination_name", + "staging_type", + "staging_name", + "destinations", + } ) LOCAL_STATE_PROPS: ClassVar[List[str]] = list(get_type_hints(TPipelineLocalState).keys()) DEFAULT_DATASET_SUFFIX: ClassVar[str] = "_dataset" @@ -438,6 +448,7 @@ def normalize( """Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. 
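A note on the `dlt.pipeline.mark` change above: besides the existing helpers it now re-exports `materialize_schema_item` as `materialize_table_schema`. For orientation, a hedged sketch of how the mark helpers are typically used inside a resource (the resource and table names are invented):

```py
import dlt

@dlt.resource
def events():
    # route individual items to different tables from a single resource
    yield dlt.mark.with_table_name({"id": 1, "kind": "click"}, "click_events")
    yield dlt.mark.with_table_name({"id": 2, "kind": "view"}, "view_events")
```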
Requires `destination` to be known.""" if is_interactive(): workers = 1 + if loader_file_format and loader_file_format in INTERNAL_LOADER_FILE_FORMATS: raise ValueError(f"{loader_file_format} is one of internal dlt file formats.") # check if any schema is present, if not then no data was extracted @@ -446,6 +457,7 @@ def normalize( # make sure destination capabilities are available self._get_destination_capabilities() + # create default normalize config normalize_config = NormalizeConfiguration( workers=workers, @@ -745,7 +757,7 @@ def sync_destination( # write the state back self._props_to_state(state) - bump_version_if_modified(state) + bump_pipeline_state_version_if_modified(state) self._save_state(state) except Exception as ex: raise PipelineStepFailed(self, "sync", None, ex, None) from ex @@ -845,6 +857,10 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: except LoadPackageNotFound: return self._get_normalize_storage().extracted_packages.get_load_package_info(load_id) + def get_load_package_state(self, load_id: str) -> TLoadPackageState: + """Returns information on extracted/normalized/completed package with given load_id, all jobs and their statuses.""" + return self._get_load_storage().get_load_package_state(load_id) + def list_failed_jobs_in_package(self, load_id: str) -> Sequence[LoadJobInfo]: """List all failed jobs and associated error messages for a specified `load_id`""" return self._get_load_storage().get_load_package_info(load_id).jobs.get("failed_jobs", []) @@ -1365,16 +1381,15 @@ def _get_step_info(self, step: WithStepInfo[TStepMetrics, TStepInfo]) -> TStepIn def _get_state(self) -> TPipelineState: try: state = json_decode_state(self._pipeline_storage.load(Pipeline.STATE_FILE)) - return migrate_state( - self.pipeline_name, state, state["_state_engine_version"], STATE_ENGINE_VERSION + return migrate_pipeline_state( + self.pipeline_name, + state, + state["_state_engine_version"], + PIPELINE_STATE_ENGINE_VERSION, ) except FileNotFoundError: # do not set the state hash, this will happen on first merge - return { - "_state_version": 0, - "_state_engine_version": STATE_ENGINE_VERSION, - "_local": {"first_run": True}, - } + return default_pipeline_state() # state["_version_hash"] = generate_version_hash(state) # return state @@ -1404,7 +1419,7 @@ def _restore_state_from_destination(self) -> Optional[TPipelineState]: schema = Schema(schema_name) with self._get_destination_clients(schema)[0] as job_client: if isinstance(job_client, WithStateSync): - state = load_state_from_destination(self.pipeline_name, job_client) + state = load_pipeline_state_from_destination(self.pipeline_name, job_client) if state is None: logger.info( "The state was not found in the destination" @@ -1538,7 +1553,7 @@ def _bump_version_and_extract_state( Storage will be created on demand. In that case the extracted package will be immediately committed. 
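The new `get_load_package_state` accessor above complements `get_load_package_info`. A minimal usage sketch, assuming a local duckdb destination and that the package of the last run is still available in load storage; pipeline, dataset, and table names are invented:

```py
import dlt

pipeline = dlt.pipeline(pipeline_name="demo", destination="duckdb", dataset_name="demo_data")
load_info = pipeline.run([{"id": 1}], table_name="items")

# inspect the state attached to the package that was just loaded
load_id = load_info.loads_ids[0]
print(pipeline.get_load_package_state(load_id))
```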
""" - _, hash_, _ = bump_version_if_modified(self._props_to_state(state)) + _, hash_, _ = bump_pipeline_state_version_if_modified(self._props_to_state(state)) should_extract = hash_ != state["_local"].get("_last_extracted_hash") if should_extract and extract_state: data = state_resource(state) diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index fa3939969b..8c72a218a4 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -1,25 +1,28 @@ -import base64 import binascii from copy import copy -import hashlib -from typing import Tuple, cast +from typing import Tuple, cast, List import pendulum import dlt from dlt.common import json -from dlt.common.pipeline import TPipelineState from dlt.common.typing import DictStrAny from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns from dlt.common.destination.reference import WithStateSync, Destination from dlt.common.utils import compressed_b64decode, compressed_b64encode +from dlt.common.versioned_state import ( + generate_state_version_hash, + bump_state_version_if_modified, + default_versioned_state, +) +from dlt.common.pipeline import TPipelineState from dlt.extract import DltResource -from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException +from dlt.pipeline.exceptions import ( + PipelineStateEngineNoUpgradePathException, +) - -# allows to upgrade state when restored with a new version of state logic/schema -STATE_ENGINE_VERSION = 4 +PIPELINE_STATE_ENGINE_VERSION = 4 # state table columns STATE_TABLE_COLUMNS: TTableSchemaColumns = { @@ -57,59 +60,15 @@ def decompress_state(state_str: str) -> DictStrAny: return json.typed_loadb(state_bytes) # type: ignore[no-any-return] -def generate_version_hash(state: TPipelineState) -> str: - # generates hash out of stored schema content, excluding hash itself, version and local state - state_copy = copy(state) - state_copy.pop("_state_version", None) - state_copy.pop("_state_engine_version", None) - state_copy.pop("_version_hash", None) - state_copy.pop("_local", None) - content = json.typed_dumpb(state_copy, sort_keys=True) - h = hashlib.sha3_256(content) - return base64.b64encode(h.digest()).decode("ascii") - +def generate_pipeline_state_version_hash(state: TPipelineState) -> str: + return generate_state_version_hash(state, exclude_attrs=["_local"]) -def bump_version_if_modified(state: TPipelineState) -> Tuple[int, str, str]: - """Bumps the `state` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" - hash_ = generate_version_hash(state) - previous_hash = state.get("_version_hash") - if not previous_hash: - # if hash was not set, set it without bumping the version, that's initial schema - pass - elif hash_ != previous_hash: - state["_state_version"] += 1 - state["_version_hash"] = hash_ - return state["_state_version"], hash_, previous_hash +def bump_pipeline_state_version_if_modified(state: TPipelineState) -> Tuple[int, str, str]: + return bump_state_version_if_modified(state, exclude_attrs=["_local"]) -def state_resource(state: TPipelineState) -> DltResource: - state = copy(state) - state.pop("_local") - state_str = compress_state(state) - state_doc = { - "version": state["_state_version"], - "engine_version": state["_state_engine_version"], - "pipeline_name": state["pipeline_name"], - "state": state_str, - "created_at": pendulum.now(), - "version_hash": state["_version_hash"], - } - return dlt.resource( - [state_doc], name=STATE_TABLE_NAME, write_disposition="append", 
columns=STATE_TABLE_COLUMNS - ) - - -def load_state_from_destination(pipeline_name: str, client: WithStateSync) -> TPipelineState: - # NOTE: if dataset or table holding state does not exist, the sql_client will rise DestinationUndefinedEntity. caller must handle this - state = client.get_stored_state(pipeline_name) - if not state: - return None - s = decompress_state(state.state) - return migrate_state(pipeline_name, s, s["_state_engine_version"], STATE_ENGINE_VERSION) - - -def migrate_state( +def migrate_pipeline_state( pipeline_name: str, state: DictStrAny, from_engine: int, to_engine: int ) -> TPipelineState: if from_engine == to_engine: @@ -119,7 +78,7 @@ def migrate_state( from_engine = 2 if from_engine == 2 and to_engine > 2: # you may want to recompute hash - state["_version_hash"] = generate_version_hash(state) # type: ignore[arg-type] + state["_version_hash"] = generate_pipeline_state_version_hash(state) # type: ignore[arg-type] from_engine = 3 if from_engine == 3 and to_engine > 3: if state.get("destination"): @@ -139,3 +98,41 @@ def migrate_state( ) state["_state_engine_version"] = from_engine return cast(TPipelineState, state) + + +def state_resource(state: TPipelineState) -> DltResource: + state = copy(state) + state.pop("_local") + state_str = compress_state(state) + state_doc = { + "version": state["_state_version"], + "engine_version": state["_state_engine_version"], + "pipeline_name": state["pipeline_name"], + "state": state_str, + "created_at": pendulum.now(), + "version_hash": state["_version_hash"], + } + return dlt.resource( + [state_doc], name=STATE_TABLE_NAME, write_disposition="append", columns=STATE_TABLE_COLUMNS + ) + + +def load_pipeline_state_from_destination( + pipeline_name: str, client: WithStateSync +) -> TPipelineState: + # NOTE: if dataset or table holding state does not exist, the sql_client will rise DestinationUndefinedEntity. 
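The helpers above delegate hashing and version bumping to `dlt.common.versioned_state`, excluding the volatile `_local` section from the hash. Conceptually, the decision whether pipeline state gets re-extracted boils down to comparing such a content hash against the hash recorded at the last extraction. A rough, self-contained sketch (the real implementation uses `json.typed_dumpb` and the shared versioned-state utilities):

```py
import base64
import hashlib
import json

def version_hash(state: dict) -> str:
    # hash the state content, ignoring version bookkeeping and the local section
    content = {
        k: v
        for k, v in state.items()
        if k not in ("_state_version", "_state_engine_version", "_version_hash", "_local")
    }
    digest = hashlib.sha3_256(json.dumps(content, sort_keys=True).encode()).digest()
    return base64.b64encode(digest).decode("ascii")

state = {
    "pipeline_name": "demo",
    "sources": {"github": {"last_value": 42}},
    "_state_version": 1,
    "_local": {},
}
hash_ = version_hash(state)
# state is only re-extracted when the hash differs from the last extracted one
should_extract = hash_ != state["_local"].get("_last_extracted_hash")
print(should_extract)  # True on the first run
```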
caller must handle this + state = client.get_stored_state(pipeline_name) + if not state: + return None + s = decompress_state(state.state) + return migrate_pipeline_state( + pipeline_name, s, s["_state_engine_version"], PIPELINE_STATE_ENGINE_VERSION + ) + + +def default_pipeline_state() -> TPipelineState: + return { + **default_versioned_state(), + "_state_engine_version": PIPELINE_STATE_ENGINE_VERSION, + "_local": {"first_run": True}, + } diff --git a/docs/examples/custom_destination_bigquery/.dlt/config.toml b/docs/examples/custom_destination_bigquery/.dlt/config.toml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/custom_destination_bigquery/.dlt/example.secrets.toml b/docs/examples/custom_destination_bigquery/.dlt/example.secrets.toml new file mode 100644 index 0000000000..a1ed49b2b8 --- /dev/null +++ b/docs/examples/custom_destination_bigquery/.dlt/example.secrets.toml @@ -0,0 +1,9 @@ +# you can just paste services.json as credentials +[destination.bigquery.credentials] +client_email = "" +private_key = "" +project_id = "" +token_uri = "" +refresh_token = "" +client_id = "" +client_secret = "" \ No newline at end of file diff --git a/docs/examples/custom_destination_bigquery/__init__.py b/docs/examples/custom_destination_bigquery/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py new file mode 100644 index 0000000000..624888f70a --- /dev/null +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -0,0 +1,74 @@ +import dlt +import pandas as pd +import pyarrow as pa +from google.cloud import bigquery + +from dlt.common.configuration.specs import GcpServiceAccountCredentials + +# constants +OWID_DISASTERS_URL = ( + "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" +) +# this table needs to be manually created in your gc account +# format: "your-project.your_dataset.your_table" +BIGQUERY_TABLE_ID = "chat-analytics-rasa-ci.ci_streaming_insert.natural-disasters" + + +# dlt sources +@dlt.resource(name="natural_disasters") +def resource(url: str): + # load pyarrow table with pandas + table = pa.Table.from_pandas(pd.read_csv(url)) + # we add a list type column to demontrate bigquery lists + table = table.append_column( + "tags", + pa.array( + [["disasters", "earthquakes", "floods", "tsunamis"]] * len(table), + pa.list_(pa.string()), + ), + ) + # we add a struct type column to demonstrate bigquery structs + table = table.append_column( + "meta", + pa.array( + [{"loaded_by": "dlt"}] * len(table), + pa.struct([("loaded_by", pa.string())]), + ), + ) + yield table + + +# dlt biquery custom destination +# we can use the dlt provided credentials class +# to retrieve the gcp credentials from the secrets +@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) +def bigquery_insert( + items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value +) -> None: + client = bigquery.Client( + credentials.project_id, credentials.to_native_credentials(), location="US" + ) + job_config = bigquery.LoadJobConfig( + autodetect=True, + source_format=bigquery.SourceFormat.PARQUET, + schema_update_options=bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, + ) + # since we have set the 
batch_size to 0, we get a filepath and can load the file directly + with open(items, "rb") as f: + load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) + load_job.result() # Waits for the job to complete. + + +if __name__ == "__main__": + # run the pipeline and print load results + pipeline = dlt.pipeline( + pipeline_name="csv_to_bigquery_insert", + destination=bigquery_insert, + dataset_name="mydata", + full_refresh=True, + ) + load_info = pipeline.run(resource(url=OWID_DISASTERS_URL)) + + print(load_info) diff --git a/docs/tools/.env.example b/docs/tools/.env.example new file mode 100644 index 0000000000..dd9d742228 --- /dev/null +++ b/docs/tools/.env.example @@ -0,0 +1 @@ +OPENAI_API_KEY="..." diff --git a/docs/tools/README.md b/docs/tools/README.md new file mode 100644 index 0000000000..28dfba2a44 --- /dev/null +++ b/docs/tools/README.md @@ -0,0 +1,62 @@ +# DLT docs tools + +This is a collection of useful tools to manage our docs. Some of these require additional dependencies not added +to our pyproject.toml in the root dir. To install these with pip, run: + +```sh +pip3 install -r requirements.txt +``` + +from this folder. + +## `check_embedded_snippets.py` +This script finds all embedded snippets in our docs, extracts them, and performs the following checks: + +* Snippet must have a valid language set, e.g. ```py +* Snippet must be parseable (works for py, toml, yaml and json snippets) +* Snippet must pass linting (works for py) +* Coming soon: snippet must pass type checking with mypy + +This script is run on CI to ensure code quality in our docs. + +### Usage + +```sh +# Run a full check on all snippets +python check_embedded_snippets.py full + +# Show all available commands and arguments for this script +python check_embedded_snippets.py --help + +# Only run the linting stage +python check_embedded_snippets.py lint + +# Run all stages but only for snippets in files that have the string "walkthrough" in the filepath +# you will probably be using this a lot when working on one doc page +python check_embedded_snippets.py full -f walkthrough + +# Run the parsing stage, but only on snippets 49, 345 and 789 +python check_embedded_snippets.py parse -s 49,345,789 + +# run all checks but with a bit more output to the terminal +python check_embedded_snippets.py full -v +``` + +### Snippet numbers +Each snippet will be assigned an index in the order it is encountered. This is useful during creation of new snippets in the docs to selectively run only a few snippets. These numbers will change as snippets are inserted into the docs. + +## `fix_grammar_gpt.py` +This script will run all (or selected) docs markdown files through the OpenAI API to correct grammar. You will need to place the OpenAI API key in an `.env` file in this or the root folder. See `.env.example`. We pay for each OpenAI API call, so be a bit considerate of your usage :). It is good to check the grammar on new pages.
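
Before running the grammar fixer for the first time, a minimal setup from this folder might look like this (copying the provided example file and pasting your own key into it):

```sh
# install the extra dependencies for the docs tools (if you have not already)
pip3 install -r requirements.txt
# create your local .env from the template, then put your OpenAI key into it
cp .env.example .env
```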
+ +### Usage + +```sh +# Fix all pages +python fix_grammar_gpt.py + +# Fix grammar for all files that have the string "walkthrough" in the filepath +python fix_grammar_gpt.py -f walkthrough + +# Fix grammar for the particular file +python fix_grammar_gpt.py -f ../website/docs/intro.md +``` diff --git a/docs/tools/__init__.py b/docs/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py new file mode 100644 index 0000000000..da27c1aa19 --- /dev/null +++ b/docs/tools/check_embedded_snippets.py @@ -0,0 +1,304 @@ +""" +Walks through all markdown files, finds all code snippets, and checks wether they are parseable. +""" +from typing import List, Dict, Optional + +import os, ast, json, yaml, tomlkit, subprocess, argparse # noqa: I251 +from dataclasses import dataclass +from textwrap import dedent + +import dlt.cli.echo as fmt + +from utils import collect_markdown_files + + +SNIPPET_MARKER = "```" +ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql"] + +LINT_TEMPLATE = "./lint_setup/template.py" +LINT_FILE = "./lint_setup/lint_me.py" + +ENABLE_MYPY = False + + +@dataclass +class Snippet: + index: int + language: str + code: str + file: str + line: int + + def __str__(self) -> str: + return ( + f"Snippet No. {self.index} in {self.file} at line {self.line} with language" + f" {self.language}" + ) + + +def collect_snippets(markdown_files: List[str], verbose: bool) -> List[Snippet]: + """ + Extract all snippets from markdown files + """ + snippets: List[Snippet] = [] + index = 0 + for file in markdown_files: + # go line by line and find all code blocks + with open(file, "r", encoding="utf-8") as f: + current_snippet: Snippet = None + lint_count = 0 + for line in f.readlines(): + lint_count += 1 + if line.strip().startswith(SNIPPET_MARKER): + if current_snippet: + # process snippet + snippets.append(current_snippet) + current_snippet.code = dedent(current_snippet.code) + current_snippet = None + else: + # start new snippet + index += 1 + current_snippet = Snippet( + index=index, + language=line.strip().split(SNIPPET_MARKER)[1] or "unknown", + code="", + file=file, + line=lint_count, + ) + elif current_snippet: + current_snippet.code += line + assert not current_snippet, ( + "It seems that the last snippet in the file was not closed. Please check the file " + + file + ) + + fmt.note(f"Discovered {len(snippets)} snippets") + if verbose: + for lang in ALLOWED_LANGUAGES: + lang_count = len([s for s in snippets if s.language == lang]) + fmt.echo(f"Found {lang_count} snippets marked as {lang}") + if len(snippets) < 100: # sanity check + fmt.error("Found too few snippets. Something went wrong.") + exit(1) + return snippets + + +def filter_snippets(snippets: List[Snippet], files: str, snippet_numbers: str) -> List[Snippet]: + """ + Filter out snippets based on file or snippet number + """ + fmt.secho(fmt.bold("Filtering Snippets")) + filtered_snippets: List[Snippet] = [] + filtered_count = 0 + for snippet in snippets: + if files and (files not in snippet.file): + filtered_count += 1 + continue + elif snippet_numbers and (str(snippet.index) not in snippet_numbers): + filtered_count += 1 + continue + filtered_snippets.append(snippet) + if filtered_count: + fmt.note( + f"{filtered_count} Snippets skipped based on file and snippet number settings." + f" {len(filtered_snippets)} snippets remaining." 
+ ) + else: + fmt.note("0 Snippets skipped based on file and snippet number settings") + + if len(filtered_snippets) == 0: # sanity check + fmt.error("No snippets remaining after filter, nothing to do.") + exit(1) + return filtered_snippets + + +def check_language(snippets: List[Snippet]) -> None: + """ + Check if the language is allowed + """ + fmt.secho(fmt.bold("Checking snippets language settings")) + failed_count = 0 + for snippet in snippets: + if snippet.language not in ALLOWED_LANGUAGES: + fmt.warning(f"{str(snippet)} has an invalid language {snippet.language} setting.") + failed_count += 1 + + if failed_count: + fmt.error(f"""\ +Found {failed_count} snippets with invalid language settings. +* Please choose the correct language for your snippets: {ALLOWED_LANGUAGES}" +* All sh commands, except for windows (bat), should be marked as sh. +* All code blocks that are not a specific (markup-) language should be marked as text.\ +""") + exit(1) + else: + fmt.note("All snippets have valid language settings") + + +def parse_snippets(snippets: List[Snippet], verbose: bool) -> None: + """ + Parse all snippets with the respective parser library + """ + fmt.secho(fmt.bold("Parsing snippets")) + failed_count = 0 + for snippet in snippets: + # parse snippet by type + if verbose: + fmt.echo(f"Parsing {snippet}") + try: + if snippet.language == "py": + ast.parse(snippet.code) + elif snippet.language == "toml": + tomlkit.loads(snippet.code) + elif snippet.language == "json": + json.loads(snippet.code) + elif snippet.language == "yaml": + yaml.safe_load(snippet.code) + # ignore text and sh scripts + elif snippet.language in ["text", "sh", "bat", "sql"]: + pass + else: + raise ValueError(f"Unknown language {snippet.language}") + except Exception as exc: + fmt.warning(f"Failed to parse {str(snippet)}") + fmt.echo(exc) + failed_count += 1 + + if failed_count: + fmt.error(f"Failed to parse {failed_count} snippets") + exit(1) + else: + fmt.note("All snippets could be parsed") + + +def prepare_for_linting(snippet: Snippet) -> None: + """ + Prepare the lintme file with the snippet code and the template header + """ + with open(LINT_TEMPLATE, "r", encoding="utf-8") as f: + lint_template = f.read() + with open(LINT_FILE, "w", encoding="utf-8") as f: + f.write(lint_template) + f.write("# Snippet start\n\n") + f.write(snippet.code) + + +def lint_snippets(snippets: List[Snippet], verbose: bool) -> None: + """ + Lint all python snippets with ruff + """ + fmt.secho(fmt.bold("Linting Python snippets")) + failed_count = 0 + count = 0 + for snippet in snippets: + count += 1 + prepare_for_linting(snippet) + result = subprocess.run(["ruff", "check", LINT_FILE], capture_output=True, text=True) + if verbose: + fmt.echo(f"Linting {snippet} ({count} of {len(snippets)})") + if "error" in result.stdout.lower(): + failed_count += 1 + fmt.warning(f"Failed to lint {str(snippet)}") + fmt.echo(result.stdout.strip()) + + if failed_count: + fmt.error(f"Failed to lint {failed_count} snippets") + exit(1) + else: + fmt.note("All snippets could be linted") + + +def typecheck_snippets(snippets: List[Snippet], verbose: bool) -> None: + """ + TODO: Type check all python snippets with mypy + """ + fmt.secho(fmt.bold("Type checking Python snippets")) + failed_count = 0 + count = 0 + for snippet in snippets: + count += 1 + if verbose: + fmt.echo(f"Type checking {snippet} ({count} of {len(snippets)})") + prepare_for_linting(snippet) + result = subprocess.run(["mypy", LINT_FILE], capture_output=True, text=True) + if "no issues found" not in 
result.stdout.lower(): + failed_count += 1 + fmt.warning(f"Failed to type check {str(snippet)}") + fmt.echo(result.stdout.strip()) + + if failed_count: + fmt.error(f"Failed to type check {failed_count} snippets") + exit(1) + else: + fmt.note("All snippets passed type checking") + + +if __name__ == "__main__": + fmt.note( + "Welcome to Snippet Checker 3000, run 'python check_embedded_snippets.py --help' for help." + ) + + # setup cli + parser = argparse.ArgumentParser( + description=( + "Check embedded snippets. Discover, parse, lint, and type check all code snippets in" + " the docs." + ), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "command", + help=( + 'Which checks to run. "full" will run all checks, parse, lint or typecheck will only' + " run that specific step" + ), + choices=["full", "parse", "lint", "typecheck"], + default="full", + ) + parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_true") + parser.add_argument( + "-f", + "--files", + help="Filter .md files to files containing this string in filename", + type=str, + ) + parser.add_argument( + "-s", + "--snippetnumbers", + help=( + "Filter checked snippets to snippetnumbers contained in this string, example:" + ' "13,412,345"' + ), + type=lambda i: i.split(","), + default=None, + ) + + args = parser.parse_args() + + fmt.secho(fmt.bold("Discovering snippets")) + + # find all markdown files and collect all snippets + markdown_files = collect_markdown_files(args.verbose) + snippets = collect_snippets(markdown_files, args.verbose) + + # check language settings + check_language(snippets) + + # filter snippets + filtered_snippets = filter_snippets(snippets, args.files, args.snippetnumbers) + + if args.command in ["parse", "full"]: + parse_snippets(filtered_snippets, args.verbose) + + # these stages are python only + python_snippets = [s for s in filtered_snippets if s.language == "py"] + if args.command in ["lint", "full"]: + lint_snippets(python_snippets, args.verbose) + if ENABLE_MYPY and args.command in ["typecheck", "full"]: + typecheck_snippets(python_snippets, args.verbose) + + # unlink lint_me file + if os.path.exists(LINT_FILE): + os.unlink(LINT_FILE) + + fmt.note("All selected checks passed. Snippet Checker 3000 signing off.") diff --git a/docs/tools/fix_grammar_gpt.py b/docs/tools/fix_grammar_gpt.py new file mode 100644 index 0000000000..1e4cf748dd --- /dev/null +++ b/docs/tools/fix_grammar_gpt.py @@ -0,0 +1,86 @@ +""" +Fixes the grammar of all the markdown files in the docs/website/docs directory. +Required openai package to be installed, and an .env file with the open ai api key to be present in the root directory: +OPENAI_API_KEY="..." +""" +import os +import argparse + +from openai import OpenAI +from dotenv import load_dotenv + +import dlt.cli.echo as fmt + +from utils import collect_markdown_files + +# constants +BASE_DIR = "../website/docs" +GPT_MODEL = "gpt-3.5-turbo-0125" + +SYSTEM_PROMPT = """\ +You are a grammar checker. Every message you get will be a document that is to be grammarchecked and returned as such. +You will not change the markdown syntax. You will only fix the grammar. You will not change the code snippets except for the comments therein. +You will not modify the header section which is enclosed by two occurences of "---". 
+Do not change the spelling or casing of these words: dlt, sdf, dbt +""" + +if __name__ == "__main__": + load_dotenv() + + fmt.note("Welcome to Grammar Fixer 3000, run 'python fix_grammar_gpt.py --help' for help.") + + # setup cli + parser = argparse.ArgumentParser( + description=( + "Fixes the grammar of our docs with open ai. Requires an .env file with the open ai" + " key." + ), + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("-v", "--verbose", help="Increase output verbosity", action="store_true") + parser.add_argument( + "-f", + "--files", + help="Specify the file name. Grammar Checker will filter all .md files containing this string in the filepath.", + type=str, + ) + + # get args + args = parser.parse_args() + + # find all files + markdown_files = collect_markdown_files(args.verbose) + + # filter files + if args.files: + markdown_files = [f for f in markdown_files if args.files in f] + + # run grammar check + count = 0 + for file_path in markdown_files: + count += 1 + + fmt.note(f"Fixing grammar for file {file_path} ({count} of {len(markdown_files)})") + + with open(file_path, "r", encoding="utf-8") as f: + doc = f.readlines() + + client = OpenAI() + response = client.chat.completions.create( + model=GPT_MODEL, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": "".join(doc)}, + ], + temperature=0, + ) + + fixed_doc = response.choices[0].message.content + + with open(file_path, "w", encoding="utf-8") as f: + f.writelines(fixed_doc) + + if count == 0: + fmt.warning("No files selected for grammar check.") + else: + fmt.note(f"Fixed grammar for {count} files.") diff --git a/docs/tools/lint_setup/.gitignore b/docs/tools/lint_setup/.gitignore new file mode 100644 index 0000000000..27479bdb04 --- /dev/null +++ b/docs/tools/lint_setup/.gitignore @@ -0,0 +1 @@ +lint_me.py \ No newline at end of file diff --git a/docs/tools/lint_setup/__init__.py b/docs/tools/lint_setup/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/tools/lint_setup/template.py b/docs/tools/lint_setup/template.py new file mode 100644 index 0000000000..dcfada63f6 --- /dev/null +++ b/docs/tools/lint_setup/template.py @@ -0,0 +1,35 @@ +# This section is imported before linting + +# mypy: disable-error-code="name-defined,import-not-found,import-untyped,empty-body,no-redef" + +# some universal imports +from typing import Optional, Dict, List, Any, Iterable, Iterator, Tuple, Sequence, Callable + +import os + +import pendulum +from pendulum import DateTime +from datetime import datetime # noqa: I251 + +import dlt +from dlt.common import json +from dlt.common.typing import TimedeltaSeconds, TAnyDateTime, TDataItem, TDataItems +from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns + +from dlt.common.pipeline import LoadInfo +from dlt.sources.helpers import requests +from dlt.extract import DltResource, DltSource +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentials, + ConnectionStringCredentials, + OAuth2Credentials, + BaseConfiguration, +) +from dlt.common.storages.configuration import FileSystemCredentials + +# some universal variables +pipeline: dlt.Pipeline = None # type: ignore[assignment] +p: dlt.Pipeline = None # type: ignore[assignment] +ex: Exception = None # type: ignore[assignment] +load_info: LoadInfo = None # type: ignore[assignment] +url: str = None # type: ignore[assignment] diff --git a/docs/tools/mypy.ini b/docs/tools/mypy.ini new file mode 100644 index 
0000000000..167ad5b30e --- /dev/null +++ b/docs/tools/mypy.ini @@ -0,0 +1,4 @@ +[mypy] +ignore_missing_imports = True +no_implicit_optional = False +strict_optional = False \ No newline at end of file diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt new file mode 100644 index 0000000000..48db2b38da --- /dev/null +++ b/docs/tools/requirements.txt @@ -0,0 +1,2 @@ +python-dotenv==1.0.1 +openai==1.14.2 diff --git a/docs/tools/ruff.toml b/docs/tools/ruff.toml new file mode 100644 index 0000000000..96f9432ecc --- /dev/null +++ b/docs/tools/ruff.toml @@ -0,0 +1,2 @@ +[lint] +ignore = ["F811", "F821", "F401", "F841", "E402"] diff --git a/docs/tools/utils.py b/docs/tools/utils.py new file mode 100644 index 0000000000..074b19b8e1 --- /dev/null +++ b/docs/tools/utils.py @@ -0,0 +1,32 @@ +from typing import List +import os + +import dlt.cli.echo as fmt + + +DOCS_DIR = "../website/docs" + + +def collect_markdown_files(verbose: bool) -> List[str]: + """ + Discovers all docs markdown files + """ + markdown_files: List[str] = [] + for path, _, files in os.walk(DOCS_DIR): + if "api_reference" in path: + continue + if "jaffle_shop" in path: + continue + for file in files: + if file.endswith(".md"): + markdown_files.append(os.path.join(path, file)) + if verbose: + fmt.echo(f"Discovered {os.path.join(path, file)}") + + if len(markdown_files) < 50: # sanity check + fmt.error("Found too few files. Something went wrong.") + exit(1) + + fmt.note(f"Discovered {len(markdown_files)} markdown files") + + return markdown_files diff --git a/docs/website/blog/2023-10-09-dlt-ops-startups.md b/docs/website/blog/2023-10-09-dlt-ops-startups.md index c48fd9ed95..94c1ff662b 100644 --- a/docs/website/blog/2023-10-09-dlt-ops-startups.md +++ b/docs/website/blog/2023-10-09-dlt-ops-startups.md @@ -112,7 +112,7 @@ Customize the INVOICE_QUERIES dictionary in the `unstructured_data/settings.py` And now the magic happens. Use the following command to run the pipeline: -```shell +```sh python unstructured_data_pipeline.py ``` diff --git a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md index 227c466d37..b36748aed9 100644 --- a/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md +++ b/docs/website/blog/2024-01-15-dlt-dbt-runner-on-cloud-functions.md @@ -132,7 +132,7 @@ We recommend setting up and testing dbt-core locally before using it in cloud fu 1. Finally, you can deploy the function using gcloud CLI as: - ```shell + ```sh gcloud functions deploy YOUR_FUNCTION_NAME \ --gen2 \ --region=YOUR_REGION \ @@ -313,7 +313,7 @@ To integrate dlt and dbt in cloud functions, use the dlt-dbt runner; here’s ho 1. Finally, you can deploy the function using gcloud CLI as: - ```shell + ```sh gcloud functions deploy YOUR_FUNCTION_NAME \ --gen2 \ --region=YOUR_REGION \ diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index 90a175777f..1522761609 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -36,7 +36,7 @@ scalable extraction via micro-batching and parallelism. ## The simplest pipeline: 1 liner to load data with schema evolution -```python +```py import dlt dlt.pipeline(destination='duckdb', dataset_name='mydata').run([{'id': 1, 'name': 'John'}], table_name="users") @@ -52,7 +52,7 @@ named "three". With `dlt`, you can create a pipeline and run it with just a few 1. 
[Create a pipeline](walkthroughs/create-a-pipeline.md) to the [destination](dlt-ecosystem/destinations). 1. Give this pipeline data and [run it](walkthroughs/run-a-pipeline.md). -```python +```py import dlt pipeline = dlt.pipeline(destination="duckdb", dataset_name="country_data") @@ -84,7 +84,7 @@ In this example, we also run a dbt package and then load the outcomes of the loa This will enable us to log when schema changes occurred and match them to the loaded data for lineage, granting us both column and row level lineage. We also alert the schema change to a Slack channel where hopefully the producer and consumer are subscribed. -```python +```py import dlt # have data? dlt likes data @@ -105,7 +105,7 @@ load_info = pipeline.run( ) ``` Add dbt runner, optionally with venv: -```python +```py venv = dlt.dbt.get_venv(pipeline) dbt = dlt.dbt.package( pipeline, @@ -122,7 +122,7 @@ pipeline.run([models_info], table_name="transform_status", write_disposition='ap ``` Let's alert any schema changes: -```python +```py from dlt.common.runtime.slack import send_slack_message slack_hook = "https://hooks.slack.com/services/xxx/xxx/xxx" @@ -211,7 +211,7 @@ that only one instance of each event is present. You can use the merge write disposition as follows: -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def github_repo_events(): yield from _get_event_pages() @@ -260,7 +260,7 @@ into DAGs, providing cross-database compatibility and various features such as t backfills, testing, and troubleshooting. You can use the dbt runner in `dlt` to seamlessly integrate dbt into your pipeline. Here's an example of running a dbt package after loading the data: -```python +```py import dlt from pipedrive import pipedrive_source @@ -275,7 +275,7 @@ load_info = pipeline.run(pipedrive_source()) print(load_info) ``` Now transform from loaded data to dbt dataset: -```python +```py pipeline = dlt.pipeline( pipeline_name='pipedrive', destination='bigquery', @@ -306,7 +306,7 @@ transformations using SQL statements. You can execute SQL statements that change or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") with pipeline.sql_client() as client: @@ -324,7 +324,7 @@ You can fetch query results as Pandas data frames and perform transformations us functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 9fc5dc15f9..26be75869b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -6,46 +6,45 @@ keywords: [aws, athena, glue catalog] # AWS Athena / Glue Catalog -The athena destination stores data as parquet files in s3 buckets and creates [external tables in aws athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with athena sql commands which will then scan the whole folder of parquet files and return the results. This destination works very similar to other sql based destinations, with the exception of the merge write disposition not being supported at this time. 
dlt metadata will be stored in the same bucket as the parquet files, but as iceberg tables. Athena additionally supports writing individual data tables as iceberg tables, so the may be manipulated later, a common use-case would be to strip gdpr data from them. +The Athena destination stores data as Parquet files in S3 buckets and creates [external tables in AWS Athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with Athena SQL commands, which will scan the entire folder of Parquet files and return the results. This destination works very similarly to other SQL-based destinations, with the exception that the merge write disposition is not supported at this time. The `dlt` metadata will be stored in the same bucket as the Parquet files, but as iceberg tables. Athena also supports writing individual data tables as Iceberg tables, so they may be manipulated later. A common use case would be to strip GDPR data from them. ## Install dlt with Athena **To install the DLT library with Athena dependencies:** -``` +```sh pip install dlt[athena] ``` ## Setup Guide ### 1. Initialize the dlt project -Let's start by initializing a new dlt project as follows: - ```bash +Let's start by initializing a new `dlt` project as follows: + ```sh dlt init chess athena ``` - > 💡 This command will initialise your pipeline with chess as the source and aws athena as the destination using the filesystem staging destination + > 💡 This command will initialize your pipeline with chess as the source and AWS Athena as the destination using the filesystem staging destination. -### 2. Setup bucket storage and athena credentials +### 2. Setup bucket storage and Athena credentials -First install dependencies by running: -``` +First, install dependencies by running: +```sh pip install -r requirements.txt ``` -or with `pip install dlt[athena]` which will install `s3fs`, `pyarrow`, `pyathena` and `botocore` packages. +or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. :::caution -You may also install the dependencies independently -try +You may also install the dependencies independently. Try ```sh pip install dlt pip install s3fs pip install pyarrow pip install pyathena ``` -so pip does not fail on backtracking +so pip does not fail on backtracking. ::: -To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url` which holds the uploaded parquet files, a `query_result_bucket` which athena uses to write query results too, and credentials that have write and read access to these two buckets as well as the full athena access aws role. +To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url`, which holds the uploaded parquet files, a `query_result_bucket`, which Athena uses to write query results to, and credentials that have write and read access to these two buckets as well as the full Athena access AWS role. The toml file looks like this: @@ -63,10 +62,10 @@ query_result_bucket="s3://[results_bucket_name]" # replace with your query resul [destination.athena.credentials] aws_access_key_id="please set me up!" # same as credentials for filesystem aws_secret_access_key="please set me up!" # same as credentials for filesystem -region_name="please set me up!" # set your aws region, for example "eu-central-1" for frankfurt +region_name="please set me up!" 
# set your AWS region, for example "eu-central-1" for Frankfurt ``` -if you have your credentials stored in `~/.aws/credentials` just remove the **[destination.filesystem.credentials]** and **[destination.athena.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): +If you have your credentials stored in `~/.aws/credentials`, just remove the **[destination.filesystem.credentials]** and **[destination.athena.credentials]** section above and `dlt` will fall back to your **default** profile in local credentials. If you want to switch the profile, pass the profile name as follows (here: `dlt-ci-user`): ```toml [destination.filesystem.credentials] profile_name="dlt-ci-user" @@ -77,7 +76,7 @@ profile_name="dlt-ci-user" ## Additional Destination Configuration -You can provide an athena workgroup like so: +You can provide an Athena workgroup like so: ```toml [destination.athena] athena_work_group="my_workgroup" @@ -85,75 +84,70 @@ athena_work_group="my_workgroup" ## Write disposition -`athena` destination handles the write dispositions as follows: -- `append` - files belonging to such tables are added to dataset folder -- `replace` - all files that belong to such tables are deleted from dataset folder and then current set of files is added. -- `merge` - falls back to `append` +The `athena` destination handles the write dispositions as follows: +- `append` - files belonging to such tables are added to the dataset folder. +- `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. +- `merge` - falls back to `append`. ## Data loading -Data loading happens by storing parquet files in an s3 bucket and defining a schema on athena. If you query data via SQL queries on athena, the returned data is read by -scanning your bucket and reading all relevant parquet files in there. +Data loading happens by storing parquet files in an S3 bucket and defining a schema on Athena. If you query data via SQL queries on Athena, the returned data is read by scanning your bucket and reading all relevant parquet files in there. `dlt` internal tables are saved as Iceberg tables. ### Data types -Athena tables store timestamps with millisecond precision and with that precision we generate parquet files. Mind that Iceberg tables have microsecond precision. +Athena tables store timestamps with millisecond precision, and with that precision, we generate parquet files. Keep in mind that Iceberg tables have microsecond precision. -Athena does not support JSON fields so JSON is stored as string. +Athena does not support JSON fields, so JSON is stored as a string. > ❗**Athena does not support TIME columns in parquet files**. `dlt` will fail such jobs permanently. Convert `datetime.time` objects to `str` or `datetime.datetime` to load them. ### Naming Convention -We follow our snake_case name convention. Mind the following: -* DDL use HIVE escaping with `````` +We follow our snake_case name convention. Keep the following in mind: +* DDL uses HIVE escaping with `````` * Other queries use PRESTO and regular SQL escaping. ## Staging support -Using a staging destination is mandatory when using the athena destination. If you do not set staging to `filesystem`, dlt will automatically do this for you. +Using a staging destination is mandatory when using the Athena destination. 
If you do not set staging to `filesystem`, `dlt` will automatically do this for you. If you decide to change the [filename layout](./filesystem#data-loading) from the default value, keep the following in mind so that Athena can reliably build your tables: - - You need to provide the `{table_name}` placeholder and this placeholder needs to be followed by a forward slash - - You need to provide the `{file_id}` placeholder and it needs to be somewhere after the `{table_name}` placeholder. - - {table_name} must be the first placeholder in the layout. + - You need to provide the `{table_name}` placeholder, and this placeholder needs to be followed by a forward slash. + - You need to provide the `{file_id}` placeholder, and it needs to be somewhere after the `{table_name}` placeholder. + - `{table_name}` must be the first placeholder in the layout. ## Additional destination options -### iceberg data tables -You can save your tables as iceberg tables to athena. This will enable you to for example delete data from them later if you need to. To switch a resouce to the iceberg table-format, -supply the table_format argument like this: +### Iceberg data tables +You can save your tables as Iceberg tables to Athena. This will enable you, for example, to delete data from them later if you need to. To switch a resource to the iceberg table format, supply the table_format argument like this: -```python +```py @dlt.resource(table_format="iceberg") def data() -> Iterable[TDataItem]: ... ``` -Alternatively you can set all tables to use the iceberg format with a config variable: +Alternatively, you can set all tables to use the iceberg format with a config variable: ```toml [destination.athena] force_iceberg = "True" ``` -For every table created as an iceberg table, the athena destination will create a regular athena table in the staging dataset of both the filesystem as well as the athena glue catalog and then -copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both filesystem and the glue catalog. Switching from iceberg to regular table or vice versa -is not supported. +For every table created as an iceberg table, the Athena destination will create a regular Athena table in the staging dataset of both the filesystem and the Athena glue catalog, and then copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both the filesystem and the glue catalog. Switching from iceberg to regular table or vice versa is not supported. ### dbt support -Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of generated dbt profile. Iceberg tables are supported but you need to make sure that you materialize your models as iceberg tables if your source table is iceberg. We encountered problems with materializing -date time columns due to different precision on iceberg (nanosecond) and regular Athena tables (millisecond). -The Athena adapter requires that you setup **region_name** in Athena configuration below. You can also setup table catalog name to change the default: **awsdatacatalog** +Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of the generated dbt profile. Iceberg tables are supported, but you need to make sure that you materialize your models as iceberg tables if your source table is iceberg. 
We encountered problems with materializing date time columns due to different precision on iceberg (nanosecond) and regular Athena tables (millisecond). +The Athena adapter requires that you set up **region_name** in the Athena configuration below. You can also set up the table catalog name to change the default: **awsdatacatalog** ```toml [destination.athena] aws_data_catalog="awsdatacatalog" ``` ### Syncing of `dlt` state -- This destination fully supports [dlt state sync.](../../general-usage/state#syncing-state-with-destination). The state is saved in athena iceberg tables in your s3 bucket. +- This destination fully supports [dlt state sync.](../../general-usage/state#syncing-state-with-destination). The state is saved in Athena iceberg tables in your S3 bucket. ## Supported file formats diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 25b01923b5..4144707b03 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -10,7 +10,7 @@ keywords: [bigquery, destination, data warehouse] **To install the DLT library with BigQuery dependencies:** -``` +```sh pip install dlt[bigquery] ``` @@ -18,17 +18,17 @@ pip install dlt[bigquery] **1. Initialize a project with a pipeline that loads to BigQuery by running:** -``` +```sh dlt init chess bigquery ``` **2. Install the necessary dependencies for BigQuery by running:** -``` +```sh pip install -r requirements.txt ``` -This will install dlt with **bigquery** extra, which contains all the dependencies required by the bigquery client. +This will install dlt with the `bigquery` extra, which contains all the dependencies required by the bigquery client. **3. Log in to or create a Google Cloud account** @@ -58,7 +58,7 @@ You don't need to grant users access to this service account now, so click the ` In the service accounts table page that you're redirected to after clicking `Done` as instructed above, select the three dots under the `Actions` column for the service account you created and select `Manage keys`. -This will take you to page where you can click the `Add key` button, then the `Create new key` button, +This will take you to a page where you can click the `Add key` button, then the `Create new key` button, and finally the `Create` button, keeping the preselected `JSON` option. A `JSON` file that includes your service account private key will then be downloaded. @@ -67,7 +67,7 @@ A `JSON` file that includes your service account private key will then be downlo Open your `dlt` credentials file: -``` +```sh open .dlt/secrets.toml ``` @@ -83,11 +83,11 @@ private_key = "private_key" # please set me up! client_email = "client_email" # please set me up! ``` -You can specify the location of the data i.e. `EU` instead of `US` which is a default. +You can specify the location of the data i.e. `EU` instead of `US` which is the default. ### OAuth 2.0 Authentication -You can use the OAuth 2.0 authentication. You'll need to generate a **refresh token** with right scopes (I suggest to ask our GPT-4 assistant for details). +You can use OAuth 2.0 authentication. You'll need to generate a **refresh token** with the right scopes (we suggest asking our GPT-4 assistant for details). Then you can fill the following information in `secrets.toml` ```toml @@ -103,9 +103,9 @@ refresh_token = "refresh_token" # please set me up! 
### Using Default Credentials -Google provides several ways to get default credentials i.e. from `GOOGLE_APPLICATION_CREDENTIALS` environment variable or metadata services. +Google provides several ways to get default credentials i.e. from the `GOOGLE_APPLICATION_CREDENTIALS` environment variable or metadata services. VMs available on GCP (cloud functions, Composer runners, Colab notebooks) have associated service accounts or authenticated users. -Will try to use default credentials if nothing is explicitly specified in the secrets. +`dlt` will try to use default credentials if nothing is explicitly specified in the secrets. ```toml [destination.bigquery] @@ -114,16 +114,16 @@ location = "US" ## Write Disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with a [clone command](https://cloud.google.com/bigquery/docs/table-clones-create) from the staging tables. ## Data Loading -`dlt` uses `BigQuery` load jobs that send files from local filesystem or gcs buckets. -Loader follows [Google recommendations](https://cloud.google.com/bigquery/docs/error-messages) when retrying and terminating jobs. -Google BigQuery client implements elaborate retry mechanism and timeouts for queries and file uploads, which may be configured in destination options. +`dlt` uses `BigQuery` load jobs that send files from the local filesystem or GCS buckets. +The loader follows [Google recommendations](https://cloud.google.com/bigquery/docs/error-messages) when retrying and terminating jobs. +The Google BigQuery client implements an elaborate retry mechanism and timeouts for queries and file uploads, which may be configured in destination options. ## Supported File Formats @@ -143,36 +143,36 @@ When staging is enabled: BigQuery supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): -* `partition` - creates a partition with a day granularity on decorated column (`PARTITION BY DATE`). - May be used with `datetime`, `date` and `bigint` data types. +* `partition` - creates a partition with a day granularity on the decorated column (`PARTITION BY DATE`). + May be used with `datetime`, `date`, and `bigint` data types. Only one column per table is supported and only when a new table is created. For more information on BigQuery partitioning, read the [official docs](https://cloud.google.com/bigquery/docs/partitioned-tables). > ❗ `bigint` maps to BigQuery's **INT64** data type. > Automatic partitioning requires converting an INT64 column to a UNIX timestamp, which `GENERATE_ARRAY` doesn't natively support. > With a 10,000 partition limit, we can’t cover the full INT64 range. - > Instead, we set 86,400 second boundaries to enable daily partitioning. + > Instead, we set 86,400-second boundaries to enable daily partitioning. > This captures typical values, but extremely large/small outliers go to an `__UNPARTITIONED__` catch-all partition. * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. ## Staging Support -BigQuery supports gcs as a file staging destination. dlt will upload files in the parquet format to gcs and ask BigQuery to copy their data directly into the db. 
-Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your gcs bucket with the bucket_url and credentials. -If you use the same service account for gcs and your redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. +BigQuery supports GCS as a file staging destination. `dlt` will upload files in the parquet format to GCS and ask BigQuery to copy their data directly into the database. +Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your GCS bucket with the bucket_url and credentials. +If you use the same service account for GCS and your Redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. -Alternatively to parquet files, you can specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Alternatively to parquet files, you can specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. ### BigQuery/GCS Staging Example -```python +```py # Create a dlt pipeline that will load # chess player data to the BigQuery destination -# via a gcs bucket. +# via a GCS bucket. pipeline = dlt.pipeline( pipeline_name='chess_pipeline', - destination='biquery', + destination='bigquery', staging='filesystem', # Add this to activate the staging location. dataset_name='player_data' ) @@ -180,7 +180,7 @@ pipeline = dlt.pipeline( ## Additional Destination Options -You can configure the data location and various timeouts as shown below. This information is not a secret so can be placed in `config.toml` as well: +You can configure the data location and various timeouts as shown below. This information is not a secret so it can be placed in `config.toml` as well: ```toml [destination.bigquery] @@ -191,15 +191,15 @@ retry_deadline=60.0 ``` * `location` sets the [BigQuery data location](https://cloud.google.com/bigquery/docs/locations) (default: **US**) -* `http_timeout` sets the timeout when connecting and getting a response from BigQuery API (default: **15 seconds**) -* `file_upload_timeout` a timeout for file upload when loading local files: the total time of the upload may not exceed this value (default: **30 minutes**, set in seconds) -* `retry_deadline` a deadline for a [DEFAULT_RETRY used by Google](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout) +* `http_timeout` sets the timeout when connecting and getting a response from the BigQuery API (default: **15 seconds**) +* `file_upload_timeout` is a timeout for file upload when loading local files: the total time of the upload may not exceed this value (default: **30 minutes**, set in seconds) +* `retry_deadline` is a deadline for a [DEFAULT_RETRY used by Google](https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout) ### dbt Support This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery). Credentials, if explicitly defined, are shared with `dbt` along with other settings like **location** and retries and timeouts. -In case of implicit credentials (i.e. available in cloud function), `dlt` shares the `project_id` and delegates obtaining credentials to `dbt` adapter. +In the case of implicit credentials (i.e. 
available in a cloud function), `dlt` shares the `project_id` and delegates obtaining credentials to the `dbt` adapter. ### Syncing of `dlt` State @@ -215,9 +215,9 @@ The adapter updates the DltResource with metadata about the destination column a ### Use an Adapter to Apply Hints to a Resource -Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both column level and table level: +Here is an example of how to use the `bigquery_adapter` method to apply hints to a resource on both the column level and table level: -```python +```py from datetime import date, timedelta import dlt @@ -246,9 +246,9 @@ bigquery_adapter( bigquery_adapter(event_data, table_description="Dummy event data.") ``` -Above, the adapter specifies that `event_date` should be used for partitioning and both `event_date` and `user_id` should be used for clustering (in the given order) when the table is created. +In the example above, the adapter specifies that `event_date` should be used for partitioning and both `event_date` and `user_id` should be used for clustering (in the given order) when the table is created. -Some things to note with the adapter's behaviour: +Some things to note with the adapter's behavior: - You can only partition on one column (refer to [supported hints](#supported-column-hints)). - You can cluster on as many columns as you would like. @@ -258,7 +258,7 @@ Some things to note with the adapter's behaviour: Note that `bigquery_adapter` updates the resource *inplace*, but returns the resource for convenience, i.e. both the following are valid: -```python +```py bigquery_adapter(my_resource, partition="partition_column_name") my_resource = bigquery_adapter(my_resource, partition="partition_column_name") ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index fc100e41e2..8078d2c64d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -7,11 +7,11 @@ keywords: [Databricks, destination, data warehouse] --- # Databricks -*Big thanks to Evan Phillips and [swishbi.com](https://swishbi.com/) for contributing code, time and test environment* +*Big thanks to Evan Phillips and [swishbi.com](https://swishbi.com/) for contributing code, time, and a test environment.* ## Install dlt with Databricks **To install the DLT library with Databricks dependencies:** -``` +```sh pip install dlt[databricks] ``` @@ -28,7 +28,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 1. Create a Databricks workspace in Azure - In your Azure Portal search for Databricks and create a new workspace. In the "Pricing Tier" section, select "Premium" to be able to use the Unity Catalog. + In your Azure Portal, search for Databricks and create a new workspace. In the "Pricing Tier" section, select "Premium" to be able to use the Unity Catalog. 2. Create an ADLS Gen 2 storage account @@ -42,7 +42,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 4. Create an Access Connector for Azure Databricks This will allow Databricks to access your storage account. - In the Azure Portal search for "Access Connector for Azure Databricks" and create a new connector. + In the Azure Portal, search for "Access Connector for Azure Databricks" and create a new connector. 5. 
Grant access to your storage container @@ -54,16 +54,16 @@ If you already have your Databricks workspace set up, you can skip to the [Loade 1. Now go to your Databricks workspace - To get there from the Azure Portal, search for "Databricks" and select your Databricks and click "Launch Workspace". + To get there from the Azure Portal, search for "Databricks", select your Databricks, and click "Launch Workspace". 2. In the top right corner, click on your email address and go to "Manage Account" 3. Go to "Data" and click on "Create Metastore" Name your metastore and select a region. - If you'd like to set up a storage container for the whole metastore you can add your ADLS URL and Access Connector Id here. You can also do this on a granular level when creating the catalog. + If you'd like to set up a storage container for the whole metastore, you can add your ADLS URL and Access Connector Id here. You can also do this on a granular level when creating the catalog. - In the next step assign your metastore to your workspace. + In the next step, assign your metastore to your workspace. 4. Go back to your workspace and click on "Catalog" in the left-hand menu @@ -77,7 +77,7 @@ If you already have your Databricks workspace set up, you can skip to the [Loade Set the URL of our storage container. This should be in the form: `abfss://@.dfs.core.windows.net/` - Once created you can test the connection to make sure the container is accessible from databricks. + Once created, you can test the connection to make sure the container is accessible from Databricks. 7. Now you can create a catalog @@ -91,12 +91,12 @@ If you already have your Databricks workspace set up, you can skip to the [Loade ## Loader setup Guide **1. Initialize a project with a pipeline that loads to Databricks by running** -``` +```sh dlt init chess databricks ``` **2. Install the necessary dependencies for Databricks by running** -``` +```sh pip install -r requirements.txt ``` This will install dlt with **databricks** extra which contains Databricks Python dbapi client. @@ -113,7 +113,7 @@ Example: [destination.databricks.credentials] server_hostname = "MY_DATABRICKS.azuredatabricks.net" http_path = "/sql/1.0/warehouses/12345" -access_token "MY_ACCESS_TOKEN" +access_token = "MY_ACCESS_TOKEN" catalog = "my_catalog" ``` @@ -123,7 +123,7 @@ All write dispositions are supported ## Data loading Data is loaded using `INSERT VALUES` statements by default. -Efficient loading from a staging filesystem is also supported by configuring an Amazon S3 or Azure Blob Storage bucket as a staging destination. When staging is enabled `dlt` will upload data in `parquet` files to the bucket and then use `COPY INTO` statements to ingest the data into Databricks. +Efficient loading from a staging filesystem is also supported by configuring an Amazon S3 or Azure Blob Storage bucket as a staging destination. When staging is enabled, `dlt` will upload data in `parquet` files to the bucket and then use `COPY INTO` statements to ingest the data into Databricks. For more information on staging, see the [staging support](#staging-support) section below. ## Supported file formats @@ -133,7 +133,7 @@ For more information on staging, see the [staging support](#staging-support) sec The `jsonl` format has some limitations when used with Databricks: -1. Compression must be disabled to load jsonl files in databricks. Set `data_writer.disable_compression` to `true` in dlt config when using this format. +1. Compression must be disabled to load jsonl files in Databricks. 
Set `data_writer.disable_compression` to `true` in dlt config when using this format. 2. The following data types are not supported when using `jsonl` format with `databricks`: `decimal`, `complex`, `date`, `binary`. Use `parquet` if your data contains these types. 3. `bigint` data type with precision is not supported with `jsonl` format @@ -144,16 +144,16 @@ Databricks supports both Amazon S3 and Azure Blob Storage as staging locations. ### Databricks and Amazon S3 -Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on connecting your s3 bucket with the bucket_url and credentials. +Please refer to the [S3 documentation](./filesystem.md#aws-s3) for details on connecting your S3 bucket with the bucket_url and credentials. -Example to set up Databricks with s3 as a staging destination: +Example to set up Databricks with S3 as a staging destination: -```python +```py import dlt # Create a dlt pipeline that will load # chess player data to the Databricks destination -# via staging on s3 +# via staging on S3 pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='databricks', @@ -168,7 +168,7 @@ Refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure Example to set up Databricks with Azure as a staging destination: -```python +```py # Create a dlt pipeline that will load # chess player data to the Databricks destination # via staging on Azure Blob Storage @@ -195,4 +195,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from Google Analytics to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-databricks) - [Load data from Google Sheets to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-databricks) - [Load data from Chess.com to Databricks in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-databricks) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md new file mode 100644 index 0000000000..60753d90b5 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -0,0 +1,200 @@ +--- +title: Custom destination +description: Custom `dlt` destination function for reverse ETL +keywords: [reverse etl, sink, function, decorator, destination, custom destination] +--- + +# Custom destination: Reverse ETL + +The `dlt` destination decorator allows you to receive all data passing through your pipeline in a simple function. This can be extremely useful for reverse ETL, where you are pushing data back to an API. + +You can also use this for sending data to a queue or a simple database destination that is not yet supported by `dlt`, although be aware that you will have to manually handle your own migrations in this case. + +It will also allow you to simply get a path to the files of your normalized data. So, if you need direct access to parquet or jsonl files to copy them somewhere or push them to a database, you can do this here too. 
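
As a minimal sketch of that last use case (all names, the export folder, and the sample data below are illustrative; setting `batch_size=0` switches the function from receiving deserialized items to receiving the path of each normalized file, and the `parquet` format assumes the `pyarrow` package is available):

```py
import os
import shutil

import dlt
from dlt.common.typing import TDataItems
from dlt.common.schema import TTableSchema

EXPORT_DIR = "exported_files"  # illustrative target folder


# batch_size=0 means the function receives the path of each normalized file
# instead of the deserialized items, so the file can be copied or pushed as-is
@dlt.destination(batch_size=0, loader_file_format="parquet")
def copy_files(items: TDataItems, table: TTableSchema) -> None:
    os.makedirs(EXPORT_DIR, exist_ok=True)
    # keep the original file name, prefixed with the table name
    target = os.path.join(EXPORT_DIR, f"{table['name']}_{os.path.basename(items)}")
    shutil.copy(items, target)


pipeline = dlt.pipeline("copy_files_pipeline", destination=copy_files)
pipeline.run([{"id": 1}, {"id": 2}], table_name="items")
```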
+ +## Install `dlt` for reverse ETL + +To install `dlt` without additional dependencies: +```sh +pip install dlt +``` + +## Set up a destination function for your pipeline + +The custom destination decorator differs from other destinations in that you do not need to provide connection credentials, but rather you provide a function which gets called for all items loaded during a pipeline run or load operation. With the `@dlt.destination`, you can convert any function that takes two arguments into a `dlt` destination. + +A very simple dlt pipeline that pushes a list of items into a destination function might look like this: + +```py +import dlt +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema + +@dlt.destination(batch_size=10) +def my_destination(items: TDataItems, table: TTableSchema) -> None: + print(table["name"]) + print(items) + +pipeline = dlt.pipeline("custom_destination_pipeline", destination=my_destination) +pipeline.run([1, 2, 3], table_name="items") +``` + +:::tip +1. You can also remove the typing information (`TDataItems` and `TTableSchema`) from this example. Typing is generally useful to know the shape of the incoming objects, though. +2. There are a few other ways for declaring custom destination functions for your pipeline described below. +::: + +### `@dlt.destination`, custom destination function, and signature + +The full signature of the destination decorator plus its function is the following: + +```py +@dlt.destination( + batch_size=10, + loader_file_format="jsonl", + name="my_custom_destination", + naming_convention="direct", + max_nesting_level=0, + skip_dlt_columns_and_tables=True +) +def my_destination(items: TDataItems, table: TTableSchema) -> None: + ... +``` + +### Decorator arguments +* The `batch_size` parameter on the destination decorator defines how many items per function call are batched together and sent as an array. If you set a batch-size of `0`, instead of passing in actual data items, you will receive one call per load job with the path of the file as the items argument. You can then open and process that file in any way you like. +* The `loader_file_format` parameter on the destination decorator defines in which format files are stored in the load package before being sent to the destination function. This can be `jsonl` or `parquet`. +* The `name` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator. +* The `naming_convention` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator. This controls how table and column names are normalized. The default is `direct`, which will keep all names the same. +* The `max_nesting_level` parameter on the destination decorator defines how deep the normalizer will go to normalize complex fields on your data to create subtables. This overwrites any settings on your `source` and is set to zero to not create any nested tables by default. +* The `skip_dlt_columns_and_tables` parameter on the destination decorator defines whether internal tables and columns will be fed into the custom destination function. This is set to `True` by default. + +:::note +Settings above make sure that shape of the data you receive in the destination function is as close as possible to what you see in the data source. + +* The custom destination sets the `max_nesting_level` to 0 by default, which means no sub-tables will be generated during the normalization phase. 
+* The custom destination also skips all internal tables and columns by default. If you need these, set `skip_dlt_columns_and_tables` to False. +::: + +### Custom destination function +* The `items` parameter on the custom destination function contains the items being sent into the destination function. +* The `table` parameter contains the schema table the current call belongs to, including all table hints and columns. For example, the table name can be accessed with `table["name"]`. +* You can also add config values and secrets to the function arguments, see below! + +## Add configuration, credentials and other secret to the destination function +The destination decorator supports settings and secrets variables. If you, for example, plan to connect to a service that requires an API secret or a login, you can do the following: + +```py +@dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_destination") +def my_destination(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None: + ... +``` + +You can then set a config variable in your `.dlt/secrets.toml`: like so: + +```toml +[destination.my_destination] +api_key="" +``` + +Custom destinations follow the same configuration rules as [regular named destinations](../../general-usage/destination.md#configure-a-destination) + +## Use the custom destination in `dlt` pipeline + +There are multiple ways to pass the custom destination function to `dlt` pipeline: +- Directly reference the destination function + + ```py + @dlt.destination(batch_size=10) + def local_destination_func(items: TDataItems, table: TTableSchema) -> None: + ... + + # reference function directly + p = dlt.pipeline("my_pipe", destination=local_destination_func) + ``` + + Like for [regular destinations](../../general-usage/destination.md#pass-explicit-credentials), you are allowed to pass configuration and credentials + explicitly to destination function. + ```py + @dlt.destination(batch_size=10, loader_file_format="jsonl", name="my_destination") + def my_destination(items: TDataItems, table: TTableSchema, api_key: dlt.secrets.value) -> None: + ... + + p = dlt.pipeline("my_pipe", destination=my_destination(api_key=os.getenv("MY_API_KEY"))) + ``` + +- Directly via destination reference. In this case, don't use the decorator for the destination function. + ```py + # file my_destination.py + + from dlt.common.destination import Destination + + # don't use the decorator + def local_destination_func(items: TDataItems, table: TTableSchema) -> None: + ... + + # via destination reference + p = dlt.pipeline( + "my_pipe", + destination=Destination.from_reference( + "destination", destination_callable=local_destination_func + ) + ) + ``` +- Via a fully qualified string to function location (can be used from `config.toml` or ENV vars). The destination function should be located in another file. 
  ```py
+  # file my_pipeline.py
+
+  from dlt.common.destination import Destination
+
+  # fully qualified string to function location
+  p = dlt.pipeline(
+      "my_pipe",
+      destination=Destination.from_reference(
+          "destination", destination_callable="my_destination.local_destination_func"
+      )
+  )
+  ```
+
+## Adjust batch size and retry policy for atomic loads
+The destination keeps a local record of how many `DataItems` were processed. If you, for example, use the custom destination to push `DataItems` to a remote API, and this API becomes unavailable during the load, resulting in a failed `dlt` pipeline run, you can repeat the run of your pipeline at a later moment and the custom destination will **restart from the whole batch that failed**. No data is lost, but you can still get duplicated data if you committed half of the batch, e.g., to a database, and then failed.
+**Keeping the batch atomic is up to you.** For this reason, it makes sense to choose a batch size that you can process in one transaction (say, one API request or one database transaction), so that if this request or transaction fails repeatedly, you can repeat it on the next run without pushing duplicate data to your remote location. For systems that are not transactional and do not tolerate duplicated data, you can use a batch size of 1.
+
+Destination functions that raise exceptions are retried 5 times before giving up (`load.raise_on_max_retries` config option). If you run the pipeline again, it will resume loading before extracting new data.
+
+If your exception derives from `DestinationTerminalException`, the whole load job will be marked as failed and not retried again.
+
+:::caution
+If you wipe out the pipeline folder (where job files and destination state are saved), you will not be able to restart from the last failed batch.
+However, it is fairly easy to back up and restore the pipeline directory, [see details below](#manage-pipeline-state-for-incremental-loading).
+:::
+
+## Increase or decrease loading parallelism
+By default, calls to the destination function are executed on multiple threads, so you need to make sure you are not using any non-thread-safe nonlocal or global variables from outside your destination function. If you need all calls to be executed from the same thread, you can set the `workers` [config variable of the load step](../../reference/performance.md#load) to 1.
+
+:::tip
+For performance reasons, we recommend keeping the multithreaded approach and making sure that you, for example, are using thread-safe connection pools to a remote database or queue.
+:::
+
+## Write disposition
+
+`@dlt.destination` will forward all normalized `DataItems` encountered during a pipeline run to the custom destination function, so there is no notion of "write dispositions".
+
+## Staging support
+
+`@dlt.destination` does not currently support staging files in remote locations before the destination function is called. If you need this feature, please let us know.
+
+## Manage pipeline state for incremental loading
+Custom destinations do not have a general mechanism to restore pipeline state. This impacts data sources that rely on the state being kept, i.e., all incremental resources.
+If you wipe the pipeline directory (e.g., by deleting the folder or by running on AWS Lambda / GitHub Actions, where you get a clean runner), the progress of the incremental loading is lost. On the next run, you will re-acquire the data from the beginning.
+ +While we are working on a pluggable state storage you can fix the problem above by: +1. Not wiping the pipeline directory. For example if you run your pipeline on an EC instance periodically, the state will be preserved. +2. By doing a restore/backup of the pipeline directory before/after it runs. This is way easier than it sounds and [here's a script you can reuse](https://gist.github.com/rudolfix/ee6e16d8671f26ac4b9ffc915ad24b6e). + +## What's next + +* Check out our [Custom BigQuery Destination](../../examples/custom_destination_bigquery/) example. +* Need help with building a custom destination? Ask your questions in our [Slack Community](https://dlthub.com/community) technical help channel. diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index db7428dcc9..63b4aecd80 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -7,38 +7,38 @@ keywords: [duckdb, destination, data warehouse] # DuckDB ## Install dlt with DuckDB -**To install the DLT library with DuckDB dependencies:** -``` +**To install the DLT library with DuckDB dependencies, run:** +```sh pip install dlt[duckdb] ``` ## Setup Guide -**1. Initialize a project with a pipeline that loads to DuckDB by running** -``` +**1. Initialize a project with a pipeline that loads to DuckDB by running:** +```sh dlt init chess duckdb ``` -**2. Install the necessary dependencies for DuckDB by running** -``` +**2. Install the necessary dependencies for DuckDB by running:** +```sh pip install -r requirements.txt ``` -**3. Run the pipeline** -``` +**3. Run the pipeline:** +```sh python3 chess_pipeline.py ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. ## Data loading -`dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). If you are ok with installing `pyarrow` we suggest to switch to `parquet` as file format. Loading is faster (and also multithreaded). +`dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). If you are okay with installing `pyarrow`, we suggest switching to `parquet` as the file format. Loading is faster (and also multithreaded). ### Names normalization -`dlt` uses standard **snake_case** naming convention to keep identical table and column identifiers across all destinations. If you want to use **duckdb** wide range of characters (ie. emojis) for table and column names, you can switch to **duck_case** naming convention which accepts almost any string as an identifier: +`dlt` uses the standard **snake_case** naming convention to keep identical table and column identifiers across all destinations. 
If you want to use the **duckdb** wide range of characters (i.e., emojis) for table and column names, you can switch to the **duck_case** naming convention, which accepts almost any string as an identifier: * `\n` `\r` and `" are translated to `_` -* multiple `_` are translated to single `_` +* multiple `_` are translated to a single `_` Switch the naming convention using `config.toml`: ```toml @@ -46,34 +46,34 @@ Switch the naming convention using `config.toml`: naming="duck_case" ``` -or via env variable `SCHEMA__NAMING` or directly in code: -```python +or via the env variable `SCHEMA__NAMING` or directly in the code: +```py dlt.config["schema.naming"] = "duck_case" ``` :::caution -**duckdb** identifiers are **case insensitive** but display names preserve case. This may create name clashes if for example you load json with -`{"Column": 1, "column": 2}` will map data to a single column. +**duckdb** identifiers are **case insensitive** but display names preserve case. This may create name clashes if, for example, you load JSON with +`{"Column": 1, "column": 2}` as it will map data to a single column. ::: ## Supported file formats -You can configure the following file formats to load data to duckdb +You can configure the following file formats to load data to duckdb: * [insert-values](../file-formats/insert-format.md) is used by default * [parquet](../file-formats/parquet.md) is supported :::note -`duckdb` cannot COPY many parquet files to a single table from multiple threads. In this situation `dlt` serializes the loads. Still - that may be faster than INSERT +`duckdb` cannot COPY many parquet files to a single table from multiple threads. In this situation, `dlt` serializes the loads. Still, that may be faster than INSERT. ::: -* [jsonl](../file-formats/jsonl.md) **is supported but does not work if JSON fields are optional. the missing keys fail the COPY instead of being interpreted as NULL** +* [jsonl](../file-formats/jsonl.md) **is supported but does not work if JSON fields are optional. The missing keys fail the COPY instead of being interpreted as NULL.** ## Supported column hints -`duckdb` may create unique indexes for all columns with `unique` hints but this behavior **is disabled by default** because it slows the loading down significantly. +`duckdb` may create unique indexes for all columns with `unique` hints, but this behavior **is disabled by default** because it slows the loading down significantly. ## Destination Configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:` which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. The `duckdb` credentials do not require any secret values. You are free to pass the configuration explicitly via the `credentials` parameter to `dlt.pipeline` or `pipeline.run` methods. 
For example: -```python +```py # will load data to files/data.db database file p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials="files/data.db") @@ -82,23 +82,23 @@ p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='ches ``` The destination accepts a `duckdb` connection instance via `credentials`, so you can also open a database connection yourself and pass it to `dlt` to use. `:memory:` databases are supported. -```python +```py import duckdb db = duckdb.connect() p = dlt.pipeline(pipeline_name='chess', destination='duckdb', dataset_name='chess_data', full_refresh=False, credentials=db) ``` -This destination accepts database connection strings in format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration). +This destination accepts database connection strings in the format used by [duckdb-engine](https://github.com/Mause/duckdb_engine#configuration). -You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g. using a `secrets.toml` file) +You can configure a DuckDB destination with [secret / config values](../../general-usage/credentials) (e.g., using a `secrets.toml` file) ```toml -destination.duckdb.credentials=duckdb:///_storage/test_quack.duckdb +destination.duckdb.credentials="duckdb:///_storage/test_quack.duckdb" ``` -**duckdb://** url above creates a **relative** path to `_storage/test_quack.duckdb`. To define **absolute** path you need to specify four slashes ie. `duckdb:////_storage/test_quack.duckdb`. +The **duckdb://** URL above creates a **relative** path to `_storage/test_quack.duckdb`. To define an **absolute** path, you need to specify four slashes, i.e., `duckdb:////_storage/test_quack.duckdb`. A few special connection strings are supported: -* **:pipeline:** creates the database in the working directory of the pipeline with name `quack.duckdb`. -* **:memory:** creates in memory database. This may be useful for testing. +* **:pipeline:** creates the database in the working directory of the pipeline with the name `quack.duckdb`. +* **:memory:** creates an in-memory database. This may be useful for testing. ### Additional configuration @@ -109,10 +109,10 @@ create_indexes=true ``` ### dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb) which is a community supported package. The `duckdb` database is shared with `dbt`. In rare cases you may see information that binary database format does not match the database format expected by `dbt-duckdb`. You may avoid that by updating the `duckdb` package in your `dlt` project with `pip install -U`. +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. The `duckdb` database is shared with `dbt`. In rare cases, you may see information that the binary database format does not match the database format expected by `dbt-duckdb`. You can avoid that by updating the `duckdb` package in your `dlt` project with `pip install -U`. ### Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). 
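Putting the configuration options above together, here is a hedged, minimal sketch of loading to an explicit database file and reading the data back through the `sql_client()` wrapper mentioned earlier (the sample row, table name, and query are made up for illustration):

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="chess",
    destination="duckdb",
    dataset_name="chess_data",
    credentials="files/data.db",  # relative path to the database file
)
pipeline.run([{"player": "magnus"}], table_name="players")

# query the loaded data via the DuckDBPyConnection wrapper
with pipeline.sql_client() as client:
    with client.execute_query("SELECT player FROM players") as cursor:
        print(cursor.fetchall())
```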
## Additional Setup guides @@ -124,4 +124,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from Chess.com to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-duckdb) - [Load data from HubSpot to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-duckdb) - [Load data from GitHub to DuckDB in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-duckdb) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index ba323b3d7f..dbd54253b3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -7,7 +7,7 @@ Its primary role is to be used as a staging for other destinations, but you can ## Install dlt with filesystem **To install the DLT library with filesystem dependencies:** -``` +```sh pip install dlt[filesystem] ``` @@ -29,7 +29,7 @@ so pip does not fail on backtracking. ### 1. Initialise the dlt project Let's start by initialising a new dlt project as follows: - ```bash + ```sh dlt init chess filesystem ``` > 💡 This command will initialise your pipeline with chess as the source and the AWS S3 filesystem as the destination. @@ -38,7 +38,7 @@ Let's start by initialising a new dlt project as follows: #### AWS S3 The command above creates sample `secrets.toml` and requirements file for AWS S3 bucket. You can install those dependencies by running: -``` +```sh pip install -r requirements.txt ``` @@ -71,7 +71,7 @@ You need to create a S3 bucket and a user who can access that bucket. `dlt` is n 1. You can create the S3 bucket in the AWS console by clicking on "Create Bucket" in S3 and assigning the appropriate name and permissions to the bucket. 2. Once the bucket is created, you'll have the bucket URL. For example, If the bucket name is `dlt-ci-test-bucket`, then the bucket URL will be: - ``` + ```text s3://dlt-ci-test-bucket ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/index.md b/docs/website/docs/dlt-ecosystem/destinations/index.md index 5d26c0f138..2c24d14312 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/index.md +++ b/docs/website/docs/dlt-ecosystem/destinations/index.md @@ -5,11 +5,11 @@ keywords: ['destinations'] --- import DocCardList from '@theme/DocCardList'; -Pick one of our high quality destinations and load your data to a local database, warehouse or a data lake. Append, replace or merge your data. Apply performance hints like partitions, clusters or indexes. Load directly or via staging. Each of our destinations goes through few hundred automated tests every day. +Pick one of our high-quality destinations and load your data into a local database, warehouse, or data lake. Append, replace, or merge your data. Apply performance hints like partitions, clusters, or indexes. Load directly or via staging. Each of our destinations undergoes several hundred automated tests every day. -* Destination or feature missing? [Join our Slack community](https://dlthub.com/community) and ask for it -* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or [Talk to an engineer](https://calendar.app.google/kiLhuMsWKpZUpfho6) +* Is a destination or feature missing? [Join our Slack community](https://dlthub.com/community) and ask for it. 
+* Need more info? [Join our Slack community](https://dlthub.com/community) and ask in the tech help channel or [Talk to an engineer](https://calendar.app.google/kiLhuMsWKpZUpfho6). -Otherwise pick a destination below: +Otherwise, pick a destination below: diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index b002286bcf..de11ed5772 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -5,36 +5,36 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] --- # MotherDuck -> 🧪 MotherDuck is still invitation only and intensively tested. Please see the limitations / problems at the end. +> 🧪 MotherDuck is still invitation-only and is being intensively tested. Please see the limitations/problems at the end. ## Install dlt with MotherDuck **To install the DLT library with MotherDuck dependencies:** -``` +```sh pip install dlt[motherduck] ``` :::tip -Decrease the number of load workers to 3-5 depending on the quality of your internet connection if you see a lot of retries in your logs with various timeout, add the following to your `config.toml`: +If you see a lot of retries in your logs with various timeouts, decrease the number of load workers to 3-5 depending on the quality of your internet connection. Add the following to your `config.toml`: ```toml [load] workers=3 ``` -or export **LOAD__WORKERS=3** env variable. See more in [performance](../../reference/performance.md) +or export the **LOAD__WORKERS=3** env variable. See more in [performance](../../reference/performance.md) ::: ## Setup Guide **1. Initialize a project with a pipeline that loads to MotherDuck by running** -``` +```sh dlt init chess motherduck ``` **2. Install the necessary dependencies for MotherDuck by running** -``` +```sh pip install -r requirements.txt ``` -This will install dlt with **motherduck** extra which contains **duckdb** and **pyarrow** dependencies +This will install dlt with the **motherduck** extra which contains **duckdb** and **pyarrow** dependencies. **3. Add your MotherDuck token to `.dlt/secrets.toml`** ```toml @@ -42,63 +42,61 @@ This will install dlt with **motherduck** extra which contains **duckdb** and ** database = "dlt_data_3" password = "" ``` -Paste your **service token** into password. The `database` field is optional but we recommend to set it. MotherDuck will create this database (in this case `dlt_data_3`) for you. +Paste your **service token** into the password field. The `database` field is optional, but we recommend setting it. MotherDuck will create this database (in this case `dlt_data_3`) for you. -Alternatively you can use the connection string syntax +Alternatively, you can use the connection string syntax. ```toml [destination] motherduck.credentials="md:///dlt_data_3?token=" ``` -**3. Run the pipeline** -``` +**4. Run the pipeline** +```sh python3 chess_pipeline.py ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. ## Data loading -By default **parquet** files and `COPY` command is used to move files to remote duckdb database. All write dispositions are supported. +By default, Parquet files and the `COPY` command are used to move files to the remote duckdb database. All write dispositions are supported. -**INSERT** format is also supported and will execute a large INSERT queries directly into the remote database. 
This is way slower and may exceed maximum query size - so not advised. +The **INSERT** format is also supported and will execute large INSERT queries directly into the remote database. This method is significantly slower and may exceed the maximum query size, so it is not advised. ## dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb) which is a community supported package. `dbt` version >= 1.5 is required (which is current `dlt` default.) +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-duckdb](https://github.com/jwills/dbt-duckdb), which is a community-supported package. `dbt` version >= 1.5 is required (which is the current `dlt` default.) ## Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). ## Automated tests -Each destination must pass few hundred automatic tests. MotherDuck is passing those tests (except the transactions OFC). However we encountered issues with ATTACH timeouts when connecting which makes running such number of tests unstable. Tests on CI are disabled. +Each destination must pass a few hundred automatic tests. MotherDuck is passing these tests (except for the transactions, of course). However, we have encountered issues with ATTACH timeouts when connecting, which makes running such a number of tests unstable. Tests on CI are disabled. ## Troubleshooting / limitations ### I see a lot of errors in the log like DEADLINE_EXCEEDED or Connection timed out -Motherduck is very sensitive to quality of the internet connection and **number of workers used to load data**. Decrease the number of workers and make sure your internet connection really works. We could not find any way to increase those timeouts yet. - +MotherDuck is very sensitive to the quality of the internet connection and the **number of workers used to load data**. Decrease the number of workers and ensure your internet connection is stable. We have not found any way to increase these timeouts yet. ### MotherDuck does not support transactions. -Do not use `begin`, `commit` and `rollback` on `dlt` **sql_client** or on duckdb dbapi connection. It has no effect for DML statements (they are autocommit). It is confusing the query engine for DDL (tables not found etc.). -If your connection if of poor quality and you get a time out when executing DML query it may happen that your transaction got executed, - +Do not use `begin`, `commit`, and `rollback` on `dlt` **sql_client** or on the duckdb dbapi connection. It has no effect on DML statements (they are autocommit). It confuses the query engine for DDL (tables not found, etc.). +If your connection is of poor quality and you get a timeout when executing a DML query, it may happen that your transaction got executed. ### I see some exception with home_dir missing when opening `md:` connection. -Some internal component (HTTPS) requires **HOME** env variable to be present. Export such variable to the command line. Here is what we do in our tests: -```python +Some internal component (HTTPS) requires the **HOME** env variable to be present. Export such a variable to the command line. Here is what we do in our tests: +```py os.environ["HOME"] = "/tmp" ``` -before opening connection +before opening the connection. ### I see some watchdog timeouts. We also see them. 
-``` +```text 'ATTACH_DATABASE': keepalive watchdog timeout ``` -My observation is that if you write a lot of data into the database then close the connection and then open it again to write, there's a chance of such timeout. Possible **WAL** file is being written to the remote duckdb database. +Our observation is that if you write a lot of data into the database, then close the connection and then open it again to write, there's a chance of such a timeout. A possible **WAL** file is being written to the remote duckdb database. ### Invalid Input Error: Initialization function "motherduck_init" from file Use `duckdb 0.8.1` or above. - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 9d216a52a3..fc3eede075 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -7,8 +7,8 @@ keywords: [mssql, sqlserver, destination, data warehouse] # Microsoft SQL Server ## Install dlt with MS SQL -**To install the DLT library with MS SQL dependencies:** -``` +**To install the DLT library with MS SQL dependencies, use:** +```sh pip install dlt[mssql] ``` @@ -16,35 +16,35 @@ pip install dlt[mssql] ### Prerequisites -_Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. -This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). +The _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. +This cannot be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). Supported driver versions: * `ODBC Driver 18 for SQL Server` * `ODBC Driver 17 for SQL Server` -You can [configure driver name](#additional-destination-options) explicitly as well. +You can also [configure the driver name](#additional-destination-options) explicitly. ### Create a pipeline -**1. Initalize a project with a pipeline that loads to MS SQL by running** -``` +**1. Initialize a project with a pipeline that loads to MS SQL by running:** +```sh dlt init chess mssql ``` -**2. Install the necessary dependencies for MS SQL by running** -``` +**2. Install the necessary dependencies for MS SQL by running:** +```sh pip install -r requirements.txt ``` or run: -``` +```sh pip install dlt[mssql] ``` -This will install dlt with **mssql** extra which contains all the dependencies required by the SQL server client. +This will install `dlt` with the `mssql` extra, which contains all the dependencies required by the SQL server client. **3. Enter your credentials into `.dlt/secrets.toml`.** -Example, replace with your database connection info: +For example, replace with your database connection info: ```toml [destination.mssql.credentials] database = "dlt_data" @@ -61,34 +61,34 @@ You can also pass a SQLAlchemy-like database connection: destination.mssql.credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15" ``` -To pass credentials directly you can use `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. 
-```python +To pass credentials directly, you can use the `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. +```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="mssql://loader:@loader.database.windows.net/dlt_data?connect_timeout=15") ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with an `ALTER SCHEMA ... TRANSFER`. The operation is atomic: mssql supports DDL transactions. ## Data loading -Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows per INSERT and this is what we use. +Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows per INSERT, and this is what we use. ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default ## Supported column hints -**mssql** will create unique indexes for all columns with `unique` hints. This behavior **may be disabled** +**mssql** will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. ## Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). ## Data types -MS SQL does not support JSON columns, so JSON objects are stored as strings in `nvarchar` column. +MS SQL does not support JSON columns, so JSON objects are stored as strings in `nvarchar` columns. ## Additional destination options -**mssql** destination **does not** creates UNIQUE indexes by default on columns with `unique` hint (ie. `_dlt_id`). To enable this behavior +The **mssql** destination **does not** create UNIQUE indexes by default on columns with the `unique` hint (i.e., `_dlt_id`). To enable this behavior: ```toml [destination.mssql] create_indexes=true @@ -108,7 +108,7 @@ destination.mssql.credentials="mssql://loader:@loader.database.windows ``` ### dbt support -No dbt support yet +No dbt support yet. ## Additional Setup guides @@ -120,4 +120,4 @@ No dbt support yet - [Load data from GitHub to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-mssql) - [Load data from Notion to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-mssql) - [Load data from HubSpot to Microsoft SQL Server in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-mssql) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index cd0ea08929..ddf4aae9f8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -7,47 +7,47 @@ keywords: [postgres, destination, data warehouse] # Postgres ## Install dlt with PostgreSQL -**To install the DLT library with PostgreSQL dependencies:** -``` +**To install the DLT library with PostgreSQL dependencies, run:** +```sh pip install dlt[postgres] ``` ## Setup Guide -**1. 
Initialize a project with a pipeline that loads to Postgres by running** -``` +**1. Initialize a project with a pipeline that loads to Postgres by running:** +```sh dlt init chess postgres ``` -**2. Install the necessary dependencies for Postgres by running** -``` +**2. Install the necessary dependencies for Postgres by running:** +```sh pip install -r requirements.txt ``` -This will install dlt with **postgres** extra which contains `psycopg2` client. +This will install dlt with the `postgres` extra, which contains the `psycopg2` client. -**3. Create a new database after setting up a Postgres instance and `psql` / query editor by running** -``` +**3. After setting up a Postgres instance and `psql` / query editor, create a new database by running:** +```sql CREATE DATABASE dlt_data; ``` -Add `dlt_data` database to `.dlt/secrets.toml`. +Add the `dlt_data` database to `.dlt/secrets.toml`. -**4. Create a new user by running** -``` +**4. Create a new user by running:** +```sql CREATE USER loader WITH PASSWORD ''; ``` -Add `loader` user and `` password to `.dlt/secrets.toml`. +Add the `loader` user and `` password to `.dlt/secrets.toml`. -**5. Give the `loader` user owner permissions by running** -``` +**5. Give the `loader` user owner permissions by running:** +```sql ALTER DATABASE dlt_data OWNER TO loader; ``` -It is possible to set more restrictive permissions (e.g. give user access to a specific schema). +You can set more restrictive permissions (e.g., give user access to a specific schema). **6. Enter your credentials into `.dlt/secrets.toml`.** -It should now look like +It should now look like this: ```toml [destination.postgres.credentials] @@ -59,33 +59,33 @@ port = 5432 connect_timeout = 15 ``` -You can also pass a database connection string similar to the one used by `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). Credentials above will look like this: +You can also pass a database connection string similar to the one used by the `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). The credentials above will look like this: ```toml # keep it at the top of your toml file! before any section starts destination.postgres.credentials="postgresql://loader:@localhost/dlt_data?connect_timeout=15" ``` -To pass credentials directly you can use `credentials` argument passed to `dlt.pipeline` or `pipeline.run` methods. -```python +To pass credentials directly, you can use the `credentials` argument passed to the `dlt.pipeline` or `pipeline.run` methods. +```py pipeline = dlt.pipeline(pipeline_name='chess', destination='postgres', dataset_name='chess_data', credentials="postgresql://loader:@localhost/dlt_data") ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and replaced by the staging tables. +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and replaced by the staging tables. ## Data loading `dlt` will load data using large INSERT VALUES statements by default. Loading is multithreaded (20 threads by default). ## Supported file formats -* [insert-values](../file-formats/insert-format.md) is used by default +* [insert-values](../file-formats/insert-format.md) is used by default. 
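Before moving on to column hints, here is a short, hedged sketch that ties the sections above together: passing credentials directly and choosing a write disposition on `run()` (the table name and rows are hypothetical):

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="chess",
    destination="postgres",
    dataset_name="chess_data",
    credentials="postgresql://loader:<password>@localhost/dlt_data",
)
# "append" is the default; "replace" and "merge" are also supported
pipeline.run(
    [{"id": 1, "username": "magnus"}],
    table_name="players",
    write_disposition="replace",
)
```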
## Supported column hints -`postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled** +`postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. ## Additional destination options -Postgres destination creates UNIQUE indexes by default on columns with `unique` hint (ie. `_dlt_id`). To disable this behavior +The Postgres destination creates UNIQUE indexes by default on columns with the `unique` hint (i.e., `_dlt_id`). To disable this behavior: ```toml [destination.postgres] create_indexes=false @@ -95,16 +95,16 @@ create_indexes=false This destination [integrates with dbt](../transformations/dbt/dbt.md) via dbt-postgres. ### Syncing of `dlt` state -This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). -## Additional Setup guides - -- [Load data from HubSpot to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-postgres) -- [Load data from GitHub to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-postgres) -- [Load data from Chess.com to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-postgres) -- [Load data from Notion to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-postgres) -- [Load data from Google Analytics to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres) -- [Load data from Google Sheets to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-postgres) -- [Load data from Stripe to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-postgres) - \ No newline at end of file +## Additional Setup Guides + +- [Load data from HubSpot to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-postgres) +- [Load data from GitHub to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-postgres) +- [Load data from Chess.com to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-postgres) +- [Load data from Notion to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/notion/load-data-with-python-from-notion-to-postgres) +- [Load data from Google Analytics to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres) +- [Load data from Google Sheets to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-postgres) +- [Load data from Stripe to PostgreSQL in Python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-postgres) + diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 04b5cac19b..40d85a43a5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ 
b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -13,7 +13,7 @@ This destination helps you load data into Qdrant from [dlt resources](../../gene 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: -```bash +```sh pip install dlt[qdrant] ``` @@ -31,7 +31,7 @@ If no configuration options are provided, the default fallback will be `http://l 3. Define the source of the data. For starters, let's load some data from a simple data structure: -```python +```py import dlt from dlt.destinations.adapters import qdrant_adapter @@ -53,7 +53,7 @@ movies = [ 4. Define the pipeline: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="qdrant", @@ -63,7 +63,7 @@ pipeline = dlt.pipeline( 5. Run the pipeline: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -74,7 +74,7 @@ info = pipeline.run( 6. Check the results: -```python +```py print(info) ``` @@ -86,20 +86,20 @@ To use vector search after the data has been loaded, you must specify which fiel The `qdrant_adapter` is a helper function that configures the resource for the Qdrant destination: -```python +```py qdrant_adapter(data, embed) ``` It accepts the following arguments: -- `data`: a dlt resource object or a Python data structure (e.g. a list of dictionaries). +- `data`: a dlt resource object or a Python data structure (e.g., a list of dictionaries). - `embed`: a name of the field or a list of names to generate embeddings for. Returns: [DLT resource](../../general-usage/resource.md) object that you can pass to the `pipeline.run()`. Example: -```python +```py qdrant_adapter( resource, embed=["title", "description"], @@ -122,7 +122,7 @@ The [replace](../../general-usage/full-loading.md) disposition replaces the data In the movie example from the [setup guide](#setup-guide), we can use the `replace` disposition to reload the data every time we run the pipeline: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -135,9 +135,9 @@ info = pipeline.run( ### Merge The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination. -For `merge` disposition, you would need to specify a `primary_key` for the resource: +For the `merge` disposition, you need to specify a `primary_key` for the resource: -```python +```py info = pipeline.run( qdrant_adapter( movies, @@ -166,11 +166,11 @@ Qdrant uses collections to categorize and identify data. To avoid potential nami For example, if you have a dataset named `movies_dataset` and a table named `actors`, the Qdrant collection name would be `movies_dataset_actors` (the default separator is an underscore). -However, if you prefer to have class names without the dataset prefix, skip `dataset_name` argument. +However, if you prefer to have class names without the dataset prefix, skip the `dataset_name` argument. For example: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="qdrant", @@ -185,7 +185,7 @@ pipeline = dlt.pipeline( - `upload_batch_size`: (int) The batch size for data uploads. The default value is 64. -- `upload_parallelism`: (int) The maximal number of concurrent threads to run data uploads. The default value is 1. +- `upload_parallelism`: (int) The maximum number of concurrent threads to run data uploads. The default value is 1. - `upload_max_retries`: (int) The number of retries to upload data in case of failure. The default value is 3. 
@@ -222,4 +222,4 @@ You can find the setup instructions to run Qdrant [here](https://qdrant.tech/doc Qdrant destination supports syncing of the `dlt` state. - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index cb220a31fc..7b56377f3b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -8,7 +8,7 @@ keywords: [redshift, destination, data warehouse] ## Install dlt with Redshift **To install the DLT library with Redshift dependencies:** -``` +```sh pip install dlt[redshift] ``` @@ -17,19 +17,19 @@ pip install dlt[redshift] Let's start by initializing a new dlt project as follows: -```bash +```sh dlt init chess redshift ``` > 💡 This command will initialize your pipeline with chess as the source and Redshift as the destination. The above command generates several files and directories, including `.dlt/secrets.toml` and a requirements file for Redshift. You can install the necessary dependencies specified in the requirements file by executing it as follows: -```bash +```sh pip install -r requirements.txt ``` or with `pip install dlt[redshift]`, which installs the `dlt` library and the necessary dependencies for working with Amazon Redshift as a destination. ### 2. Setup Redshift cluster -To load data into Redshift, it is necessary to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: +To load data into Redshift, you need to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: 1. You can use an existing cluster or create a new one. 2. To create a new cluster, navigate to the 'Provisioned Cluster Dashboard' and click 'Create Cluster'. @@ -52,16 +52,16 @@ To load data into Redshift, it is necessary to create a Redshift cluster and ena 2. The "host" is derived from the cluster endpoint specified in the “General Configuration.” For example: - ```bash + ```sh # If the endpoint is: redshift-cluster-1.cv3cmsy7t4il.us-east-1.redshift.amazonaws.com:5439/your_database_name # Then the host is: redshift-cluster-1.cv3cmsy7t4il.us-east-1.redshift.amazonaws.com ``` -3. The `connect_timeout` is the number of minutes the pipeline will wait before the timeout. +3. The `connect_timeout` is the number of minutes the pipeline will wait before timing out. -You can also pass a database connection string similar to the one used by `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). Credentials above will look like this: +You can also pass a database connection string similar to the one used by the `psycopg2` library or [SQLAlchemy](https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql). The credentials above will look like this: ```toml # keep it at the top of your toml file! before any section starts destination.redshift.credentials="redshift://loader:@localhost/dlt_data?connect_timeout=15" @@ -82,25 +82,24 @@ When staging is enabled: > ❗ **Redshift cannot load `TIME` columns from `json` or `parquet` files**. `dlt` will fail such jobs permanently. Switch to direct `insert_values` to load time columns. 
-> ❗ **Redshift cannot detect compression type from `json` files**. `dlt` assumes that `jsonl` files are gzip compressed which is the default. - -> ❗ **Redshift loads `complex` types as strings into SUPER with `parquet`**. Use `jsonl` format to store JSON in SUPER natively or transform your SUPER columns with `PARSE_JSON``. +> ❗ **Redshift cannot detect compression type from `json` files**. `dlt` assumes that `jsonl` files are gzip compressed, which is the default. +> ❗ **Redshift loads `complex` types as strings into SUPER with `parquet`**. Use `jsonl` format to store JSON in SUPER natively or transform your SUPER columns with `PARSE_JSON`. ## Supported column hints Amazon Redshift supports the following column hints: -- `cluster` - hint is a Redshift term for table distribution. Applying it to a column makes it the "DISTKEY," affecting query and join performance. Check the following [documentation](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-best-dist-key.html) for more info. -- `sort` - creates SORTKEY to order rows on disk physically. It is used to improve a query and join speed in Redshift, please read the [sort key docs](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-sort-key.html) to learn more. +- `cluster` - This hint is a Redshift term for table distribution. Applying it to a column makes it the "DISTKEY," affecting query and join performance. Check the following [documentation](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-best-dist-key.html) for more info. +- `sort` - This hint creates a SORTKEY to order rows on disk physically. It is used to improve query and join speed in Redshift. Please read the [sort key docs](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-sort-key.html) to learn more. ## Staging support -Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask redshift to copy their data directly into the db. Please refere to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the aws credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask Redshift to copy their data directly into the db. Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the AWS credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. 
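For instance, a hedged sketch of a staging-enabled pipeline that switches the staged files to jsonl (the S3 bucket and Redshift credentials are assumed to be configured in `.dlt/secrets.toml`, and the sample data is made up):

```py
import dlt

pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",
    destination="redshift",
    staging="filesystem",  # the s3 bucket_url comes from the filesystem configuration
    dataset_name="player_data",
)
# stage jsonl files instead of the default parquet
info = pipeline.run([{"id": 1}], table_name="players", loader_file_format="jsonl")
print(info)
```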
-### Authentication iam Role +### Authentication IAM Role -If you would like to load from s3 without forwarding the aws staging credentials but authorize with an iam role connected to Redshift, follow the [Redshift documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/authorizing-redshift-service.html) to create a role with access to s3 linked to your redshift cluster and change your destination settings to use the iam role: +If you would like to load from s3 without forwarding the AWS staging credentials but authorize with an IAM role connected to Redshift, follow the [Redshift documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/authorizing-redshift-service.html) to create a role with access to s3 linked to your Redshift cluster and change your destination settings to use the IAM role: ```toml [destination] @@ -109,7 +108,7 @@ staging_iam_role="arn:aws:iam::..." ### Redshift/S3 staging example code -```python +```py # Create a dlt pipeline that will load # chess player data to the redshift destination # via staging on s3 @@ -143,4 +142,4 @@ Supported loader file formats for Redshift are `sql` and `insert_values` (defaul - [Load data from GitHub to Redshift in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-redshift) - [Load data from Stripe to Redshift in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-redshift) - [Load data from Google Sheets to Redshift in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-redshift) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 34efb0df39..a65eaec267 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -7,30 +7,30 @@ keywords: [Snowflake, destination, data warehouse] # Snowflake ## Install dlt with Snowflake -**To install the DLT library with Snowflake dependencies:** -``` +**To install the DLT library with Snowflake dependencies, run:** +```sh pip install dlt[snowflake] ``` ## Setup Guide -**1. Initialize a project with a pipeline that loads to snowflake by running** -``` +**1. Initialize a project with a pipeline that loads to Snowflake by running:** +```sh dlt init chess snowflake ``` -**2. Install the necessary dependencies for snowflake by running** -``` +**2. Install the necessary dependencies for Snowflake by running:** +```sh pip install -r requirements.txt ``` -This will install dlt with **snowflake** extra which contains Snowflake Python dbapi client. +This will install `dlt` with the `snowflake` extra, which contains the Snowflake Python dbapi client. -**3. Create a new database, user and give dlt access** +**3. Create a new database, user, and give dlt access.** Read the next chapter below. **4. Enter your credentials into `.dlt/secrets.toml`.** -It should now look like +It should now look like this: ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -40,14 +40,13 @@ host = "kgiotue-wn98412" warehouse = "COMPUTE_WH" role = "DLT_LOADER_ROLE" ``` -In case of snowflake **host** is your [Account Identifier](https://docs.snowflake.com/en/user-guide/admin-account-identifier). 
You can get in **Admin**/**Accounts** by copying account url: -https://kgiotue-wn98412.snowflakecomputing.com and extracting the host name (**kgiotue-wn98412**) +In the case of Snowflake, the **host** is your [Account Identifier](https://docs.snowflake.com/en/user-guide/admin-account-identifier). You can get it in **Admin**/**Accounts** by copying the account URL: https://kgiotue-wn98412.snowflakecomputing.com and extracting the host name (**kgiotue-wn98412**). -The **warehouse** and **role** are optional if you assign defaults to your user. In the example below we do not do that, so we set them explicitly. +The **warehouse** and **role** are optional if you assign defaults to your user. In the example below, we do not do that, so we set them explicitly. ### Setup the database user and permissions -Instructions below assume that you use the default account setup that you get after creating Snowflake account. You should have default warehouse named **COMPUTE_WH** and snowflake account. Below we create a new database, user and assign permissions. The permissions are very generous. A more experienced user can easily reduce `dlt` permissions to just one schema in the database. +The instructions below assume that you use the default account setup that you get after creating a Snowflake account. You should have a default warehouse named **COMPUTE_WH** and a Snowflake account. Below, we create a new database, user, and assign permissions. The permissions are very generous. A more experienced user can easily reduce `dlt` permissions to just one schema in the database. ```sql --create database with standard settings CREATE DATABASE dlt_data; @@ -67,17 +66,17 @@ GRANT ALL PRIVILEGES ON FUTURE SCHEMAS IN DATABASE dlt_data TO DLT_LOADER_ROLE; GRANT ALL PRIVILEGES ON FUTURE TABLES IN DATABASE dlt_data TO DLT_LOADER_ROLE; ``` -Now you can use the user named `LOADER` to access database `DLT_DATA` and log in with specified password. +Now you can use the user named `LOADER` to access the database `DLT_DATA` and log in with the specified password. You can also decrease the suspend time for your warehouse to 1 minute (**Admin**/**Warehouses** in Snowflake UI) ### Authentication types -Snowflake destination accepts three authentication types +Snowflake destination accepts three authentication types: - password authentication - [key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) - external authentication -The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). +The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as the [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). You can also pass credentials as a database connection string. For example: ```toml @@ -85,7 +84,7 @@ You can also pass credentials as a database connection string. 
For example: destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?warehouse=COMPUTE_WH&role=DLT_LOADER_ROLE" ``` -In **key pair authentication** you replace password with a private key string that should be in Base64-encoded DER format ([DBT also recommends](https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication) base64-encoded private keys for Snowflake connections). The private key may also be encrypted. In that case you must provide a passphrase alongside with the private key. +In **key pair authentication**, you replace the password with a private key string that should be in Base64-encoded DER format ([DBT also recommends](https://docs.getdbt.com/docs/core/connect-data-platform/snowflake-setup#key-pair-authentication) base64-encoded private keys for Snowflake connections). The private key may also be encrypted. In that case, you must provide a passphrase alongside the private key. ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -96,13 +95,13 @@ private_key_passphrase="passphrase" ``` > You can easily get the base64-encoded value of your private key by running `base64 -i .pem` in your terminal -If you pass a passphrase in the connection string, please url encode it. +If you pass a passphrase in the connection string, please URL encode it. ```toml # keep it at the top of your toml file! before any section starts destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?private_key=&private_key_passphrase=" ``` -In **external authentication** you can use oauth provider like Okta or external browser to authenticate. You pass your authenticator and refresh token as below: +In **external authentication**, you can use an OAuth provider like Okta or an external browser to authenticate. You pass your authenticator and refresh token as below: ```toml [destination.snowflake.credentials] database = "dlt_data" @@ -110,17 +109,17 @@ username = "loader" authenticator="..." token="..." ``` -or in connection string as query parameters. +or in the connection string as query parameters. Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. ## Write disposition -All write dispositions are supported +All write dispositions are supported. -If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized` the destination tables will be dropped and +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. ## Data loading -The data is loaded using internal Snowflake stage. We use `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). +The data is loaded using an internal Snowflake stage. We use the `PUT` command and per-table built-in stages by default. Stage files are immediately removed (if not specified otherwise). ## Supported file formats * [insert-values](../file-formats/insert-format.md) is used by default @@ -131,47 +130,47 @@ When staging is enabled: * [jsonl](../file-formats/jsonl.md) is used by default * [parquet](../file-formats/parquet.md) is supported -> ❗ When loading from `parquet`, Snowflake will store `complex` types (JSON) in `VARIANT` as string. Use `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT`` field after loading. 
+> ❗ When loading from `parquet`, Snowflake will store `complex` types (JSON) in `VARIANT` as a string. Use the `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT` field after loading. ## Supported column hints Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): -* `cluster` - creates a cluster column(s). Many column per table are supported and only when a new table is created. +* `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. ### Table and column identifiers -Snowflake makes all unquoted identifiers uppercase and then resolves them case-insensitive in SQL statements. `dlt` (effectively) does not quote identifies in DDL preserving default behavior. +Snowflake makes all unquoted identifiers uppercase and then resolves them case-insensitively in SQL statements. `dlt` (effectively) does not quote identifiers in DDL, preserving default behavior. -Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lower case like for all other destinations. This is the pattern we observed in other tools ie. `dbt`. In case of `dlt` it is however trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention) +Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lower case like for all other destinations. This is the pattern we observed in other tools, i.e., `dbt`. In the case of `dlt`, it is, however, trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention) ## Staging support -Snowflake supports s3 and gcs as a file staging destinations. dlt will upload files in the parquet format to the bucket provider and will ask snowflake to copy their data directly into the db. +Snowflake supports S3 and GCS as file staging destinations. dlt will upload files in the parquet format to the bucket provider and will ask Snowflake to copy their data directly into the db. -Alternavitely to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. ### Snowflake and Amazon S3 -Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your bucket with the bucket_url and credentials. For s3 The dlt Redshift loader will use the aws credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively you can create a stage for your S3 Bucket by following the instructions provided in the [Snowflake S3 documentation](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration). +Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your bucket with the bucket_url and credentials. For S3, the dlt Redshift loader will use the AWS credentials provided for S3 to access the S3 bucket if not specified otherwise (see config options below). Alternatively, you can create a stage for your S3 Bucket by following the instructions provided in the [Snowflake S3 documentation](https://docs.snowflake.com/en/user-guide/data-load-s3-config-storage-integration). 
The basic steps are as follows: -* Create a storage integration linked to GCS and the right bucket +* Create a storage integration linked to S3 and the right bucket -* Grant access to this storage integration to the snowflake role you are using to load the data into snowflake. +* Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. -* Also grant access to this stage for the role you are using to load data into snowflake. +* Also grant access to this stage for the role you are using to load data into Snowflake.
* Provide the name of your stage (including the namespace) to dlt like so: ```toml [destination] -stage_name=PUBLIC.my_gcs_stage +stage_name="PUBLIC.my_gcs_stage" ``` -To run Snowflake with gcs as staging destination: +To run Snowflake with GCS as the staging destination: -```python +```py # Create a dlt pipeline that will load -# chess player data to the snowflake destination -# via staging on gcs +# chess player data to the Snowflake destination +# via staging on GCS pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='snowflake', @@ -211,27 +210,27 @@ pipeline = dlt.pipeline( ### Snowflake and Azure Blob Storage -Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to set up your bucket with the bucket_url and credentials. For azure the Snowflake loader will use -the filesystem credentials for your azure blob storage container if not specified otherwise (see config options below). Alternatively you can define an external stage in Snowflake and provide the stage identifier. -Please consult the snowflake Documentation on [how to create a stage for your Azure Blob Storage Container](https://docs.snowflake.com/en/user-guide/data-load-azure). The basic steps are as follows: +Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to set up your bucket with the bucket_url and credentials. For Azure, the Snowflake loader will use +the filesystem credentials for your Azure Blob Storage container if not specified otherwise (see config options below). Alternatively, you can define an external stage in Snowflake and provide the stage identifier. +Please consult the Snowflake Documentation on [how to create a stage for your Azure Blob Storage Container](https://docs.snowflake.com/en/user-guide/data-load-azure). The basic steps are as follows: * Create a storage integration linked to Azure Blob Storage and the right container -* Grant access to this storage integration to the snowflake role you are using to load the data into snowflake. +* Grant access to this storage integration to the Snowflake role you are using to load the data into Snowflake. * Create a stage from this storage integration in the PUBLIC namespace, or the namespace of the schema of your data. -* Also grant access to this stage for the role you are using to load data into snowflake. +* Also grant access to this stage for the role you are using to load data into Snowflake. * Provide the name of your stage (including the namespace) to dlt like so: ```toml [destination] -stage_name=PUBLIC.my_azure_stage +stage_name="PUBLIC.my_azure_stage" ``` -To run Snowflake with azure as staging destination: +To run Snowflake with Azure as the staging destination: -```python +```py # Create a dlt pipeline that will load -# chess player data to the snowflake destination -# via staging on azure +# chess player data to the Snowflake destination +# via staging on Azure pipeline = dlt.pipeline( pipeline_name='chess_pipeline', destination='snowflake', @@ -241,7 +240,7 @@ pipeline = dlt.pipeline( ``` ## Additional destination options -You can define your own stage to PUT files and disable removing of the staged files after loading. +You can define your own stage to PUT files and disable the removal of the staged files after loading. ```toml [destination.snowflake] # Use an existing named stage instead of the default. 
Default uses the implicit table stage per table @@ -251,7 +250,7 @@ keep_staged_files=true ``` ### dbt support -This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake). Both password and key pair authentication is supported and shared with dbt runners. +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake). Both password and key pair authentication are supported and shared with dbt runners. ### Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination) @@ -266,4 +265,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from HubSpot to Snowflake in python with dlt](https://dlthub.com/docs/pipelines/hubspot/load-data-with-python-from-hubspot-to-snowflake) - [Load data from Chess.com to Snowflake in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-snowflake) - [Load data from Google Sheets to Snowflake in python with dlt](https://dlthub.com/docs/pipelines/google_sheets/load-data-with-python-from-google_sheets-to-snowflake) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index 6ace1ac5a8..d803b88a2c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -8,7 +8,7 @@ keywords: [synapse, destination, data warehouse] ## Install dlt with Synapse **To install the DLT library with Synapse dependencies:** -``` +```sh pip install dlt[synapse] ``` @@ -18,13 +18,13 @@ pip install dlt[synapse] * **Microsoft ODBC Driver for SQL Server** - _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. + The _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). Supported driver versions: * `ODBC Driver 18 for SQL Server` - > 💡 Older driver versions don't properly work, because they don't support the `LongAsMax` keyword that got [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. + > 💡 Older driver versions don't work properly because they don't support the `LongAsMax` keyword that was [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. 
`dlt` uses the `LongAsMax` keyword to automatically do the conversion. * **Azure Synapse Workspace and dedicated SQL pool** You need an Azure Synapse workspace with a dedicated SQL pool to load data into. If you don't have one yet, you can use this [quickstart](https://learn.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-sql-pool-studio). @@ -32,12 +32,12 @@ pip install dlt[synapse] ### Steps **1. Initialize a project with a pipeline that loads to Synapse by running** -``` +```sh dlt init chess synapse ``` **2. Install the necessary dependencies for Synapse by running** -``` +```sh pip install -r requirements.txt ``` This will install `dlt` with the **synapse** extra that contains all dependencies required for the Synapse destination. @@ -67,7 +67,7 @@ GRANT ADMINISTER DATABASE BULK OPERATIONS TO loader; -- only required when loadi Optionally, you can create a `WORKLOAD GROUP` and add the `loader` user as a member to manage [workload isolation](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-workload-isolation). See the [instructions](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) on setting up a loader user for an example of how to do this. -**3. Enter your credentials into `.dlt/secrets.toml`.** +**4. Enter your credentials into `.dlt/secrets.toml`.** Example, replace with your database connection info: ```toml @@ -86,7 +86,7 @@ destination.synapse.credentials = "synapse://loader:your_loader_password@your_sy ``` To pass credentials directly you can use the `credentials` argument of `dlt.destinations.synapse(...)`: -```python +```py pipeline = dlt.pipeline( pipeline_name='chess', destination=dlt.destinations.synapse( @@ -97,7 +97,7 @@ pipeline = dlt.pipeline( ``` ## Write disposition -All write dispositions are supported +All write dispositions are supported. If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and replaced by the staging tables with an `ALTER SCHEMA ... TRANSFER` command. Please note that this operation is **not** atomic—it involves multiple DDL commands and Synapse does not support DDL transactions. @@ -117,7 +117,7 @@ Data is loaded via `INSERT` statements by default. ## Table index type The [table index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) of the created tables can be configured at the resource level with the `synapse_adapter`: -```python +```py info = pipeline.run( synapse_adapter( data=your_resource, @@ -134,12 +134,11 @@ Possible values: > ❗ Important: >* **Set `default_table_index_type` to `"clustered_columnstore_index"` if you want to change the default** (see [additional destination options](#additional-destination-options)). >* **CLUSTERED COLUMNSTORE INDEX tables do not support the `varchar(max)`, `nvarchar(max)`, and `varbinary(max)` data types.** If you don't specify the `precision` for columns that map to any of these types, `dlt` will use the maximum lengths `varchar(4000)`, `nvarchar(4000)`, and `varbinary(8000)`. ->* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice, because it supports all data types and doesn't require conversions. 
->* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense - for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables). ->* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table. ->* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance." ->* Child tables, if any, inherent the table index type of their parent table. +>* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice because it supports all data types and doesn't require conversions. +>* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables). +>* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table. +>* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance." +>* Child tables, if any, inherit the table index type of their parent table. ## Supported column hints @@ -148,7 +147,7 @@ Synapse supports the following [column hints](https://dlthub.com/docs/general-us * `primary_key` - creates a `PRIMARY KEY NONCLUSTERED NOT ENFORCED` constraint on the column * `unique` - creates a `UNIQUE NOT ENFORCED` constraint on the column -> ❗ These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to innacurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options). +> ❗ These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to inaccurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options). 
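To see how these settings interact, below is a minimal, illustrative sketch (an editorial addition, not part of the patch above). It assumes `synapse_adapter` can be imported from `dlt.destinations.adapters`, mirroring the `weaviate_adapter` import shown later in this diff, and that Synapse credentials are already configured in `.dlt/secrets.toml`. The resource, pipeline, and dataset names are made up, and `create_indexes` is switched on through the environment-variable form of the `[destination.synapse]` option so that the `primary_key` hint actually takes effect:

```py
import os
import dlt
from dlt.destinations.adapters import synapse_adapter  # assumed import path

# Equivalent to create_indexes=true under [destination.synapse] in config.toml;
# without it the primary_key hint below is ignored (hints are disabled by default).
os.environ["DESTINATION__SYNAPSE__CREATE_INDEXES"] = "true"

@dlt.resource(primary_key="customer_id")  # emitted as PRIMARY KEY NONCLUSTERED NOT ENFORCED
def customers():
    yield [{"customer_id": 1, "name": "Alice"}, {"customer_id": 2, "name": "Bob"}]

pipeline = dlt.pipeline(
    pipeline_name="synapse_hints_demo",
    destination="synapse",
    dataset_name="demo_data",
)

# Request a clustered columnstore index for this table instead of the default HEAP.
info = pipeline.run(synapse_adapter(customers, table_index_type="clustered_columnstore_index"))
print(info)
```

Keep the caveat above in mind: clustered columnstore tables cannot hold the `varchar(max)`-style types, so this combination only makes sense when the column precisions stay within the capped lengths.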
## Staging support Synapse supports Azure Blob Storage (both standard and [ADLS Gen2](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)) as a file staging destination. `dlt` first uploads Parquet files to the blob container, and then instructs Synapse to read the Parquet file and load its data into a Synapse table using the [COPY INTO](https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql) statement. @@ -157,7 +156,7 @@ Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.m To run Synapse with staging on Azure Blob Storage: -```python +```py # Create a dlt pipeline that will load # chess player data to the snowflake destination # via staging on Azure Blob Storage @@ -190,9 +189,9 @@ destination.synapse.credentials = "synapse://loader:your_loader_password@your_sy ``` Descriptions: -- `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. +- `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. - `create_indexes` determines if `primary_key` and `unique` [column hints](#supported-column-hints) are applied. -- `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-priviliged role) assigned on the blob container if you set this option to `"true"`. +- `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-privileged role) assigned on the blob container if you set this option to `"true"`. - `port` used for the ODBC connection. - `connect_timeout` sets the timeout for the `pyodbc` connection attempt, in seconds. @@ -212,4 +211,4 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci - [Load data from GitHub to Azure Synapse in python with dlt](https://dlthub.com/docs/pipelines/github/load-data-with-python-from-github-to-synapse) - [Load data from Stripe to Azure Synapse in python with dlt](https://dlthub.com/docs/pipelines/stripe_analytics/load-data-with-python-from-stripe_analytics-to-synapse) - [Load data from Chess.com to Azure Synapse in python with dlt](https://dlthub.com/docs/pipelines/chess/load-data-with-python-from-chess-to-synapse) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 2ec09e9c24..fb87ccfa6f 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -6,14 +6,14 @@ keywords: [weaviate, vector database, destination, dlt] # Weaviate -[Weaviate](https://weaviate.io/) is an open source vector database. It allows you to store data objects and perform similarity searches over them. -This destination helps you to load data into Weaviate from [dlt resources](../../general-usage/resource.md). 
+[Weaviate](https://weaviate.io/) is an open-source vector database. It allows you to store data objects and perform similarity searches over them. +This destination helps you load data into Weaviate from [dlt resources](../../general-usage/resource.md). ## Setup Guide 1. To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: -```bash +```sh pip install dlt[weaviate] ``` @@ -30,18 +30,18 @@ X-OpenAI-Api-Key = "your-openai-api-key" In this setup guide, we are using the [Weaviate Cloud Services](https://console.weaviate.cloud/) to get a Weaviate instance and [OpenAI API](https://platform.openai.com/) for generating embeddings through the [text2vec-openai](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-openai) module. -You can host your own weaviate instance using docker compose, kubernetes or embedded. Refer to Weaviate's [How-to: Install](https://weaviate.io/developers/weaviate/installation) or [dlt recipe we use for our tests](#run-weaviate-fully-standalone). In that case you can skip the credentials part altogether: +You can host your own Weaviate instance using Docker Compose, Kubernetes, or embedded. Refer to Weaviate's [How-to: Install](https://weaviate.io/developers/weaviate/installation) or [dlt recipe we use for our tests](#run-weaviate-fully-standalone). In that case, you can skip the credentials part altogether: ```toml [destination.weaviate.credentials.additional_headers] X-OpenAI-Api-Key = "your-openai-api-key" ``` -The `url` will default to **http://localhost:8080** and `api_key` is not defined - which are the defaults for Weaviate container. +The `url` will default to **http://localhost:8080** and `api_key` is not defined - which are the defaults for the Weaviate container. 3. Define the source of the data. For starters, let's load some data from a simple data structure: -```python +```py import dlt from dlt.destinations.adapters import weaviate_adapter @@ -63,7 +63,7 @@ movies = [ 4. Define the pipeline: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="weaviate", @@ -73,7 +73,7 @@ pipeline = dlt.pipeline( 5. Run the pipeline: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -84,7 +84,7 @@ info = pipeline.run( 6. Check the results: -```python +```py print(info) ``` @@ -96,12 +96,12 @@ Weaviate destination is different from other [dlt destinations](../destinations/ The `weaviate_adapter` is a helper function that configures the resource for the Weaviate destination: -```python +```py weaviate_adapter(data, vectorize, tokenization) ``` It accepts the following arguments: -- `data`: a dlt resource object or a Python data structure (e.g. a list of dictionaries). +- `data`: a dlt resource object or a Python data structure (e.g., a list of dictionaries). - `vectorize`: a name of the field or a list of names that should be vectorized by Weaviate. - `tokenization`: the dictionary containing the tokenization configuration for a field. The dictionary should have the following structure `{'field_name': 'method'}`. Valid methods are "word", "lowercase", "whitespace", "field". The default is "word". See [Property tokenization](https://weaviate.io/developers/weaviate/config-refs/schema#property-tokenization) in Weaviate documentation for more details. 
@@ -109,7 +109,7 @@ Returns: a [dlt resource](../../general-usage/resource.md) object that you can p Example: -```python +```py weaviate_adapter( resource, vectorize=["title", "description"], @@ -133,7 +133,7 @@ The [replace](../../general-usage/full-loading.md) disposition replaces the data In the movie example from the [setup guide](#setup-guide), we can use the `replace` disposition to reload the data every time we run the pipeline: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -146,9 +146,9 @@ info = pipeline.run( ### Merge The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data in the destination. -For `merge` disposition you would need to specify a `primary_key` for the resource: +For the `merge` disposition, you would need to specify a `primary_key` for the resource: -```python +```py info = pipeline.run( weaviate_adapter( movies, @@ -159,18 +159,18 @@ info = pipeline.run( ) ``` -Internally dlt will use `primary_key` (`document_id` in the example above) to generate a unique identifier ([UUID](https://weaviate.io/developers/weaviate/manage-data/create#id)) for each object in Weaviate. If the object with the same UUID already exists in Weaviate, it will be updated with the new data. Otherwise, a new object will be created. +Internally, dlt will use `primary_key` (`document_id` in the example above) to generate a unique identifier ([UUID](https://weaviate.io/developers/weaviate/manage-data/create#id)) for each object in Weaviate. If the object with the same UUID already exists in Weaviate, it will be updated with the new data. Otherwise, a new object will be created. :::caution -If you are using the merge write disposition, you must set it from the first run of your pipeline, otherwise the data will be duplicated in the database on subsequent loads. +If you are using the `merge` write disposition, you must set it from the first run of your pipeline; otherwise, the data will be duplicated in the database on subsequent loads. ::: ### Append -This is the default disposition. It will append the data to the existing data in the destination ignoring the `primary_key` field. +This is the default disposition. It will append the data to the existing data in the destination, ignoring the `primary_key` field. ## Data loading @@ -199,11 +199,11 @@ Weaviate uses classes to categorize and identify data. To avoid potential naming For example, if you have a dataset named `movies_dataset` and a table named `actors`, the Weaviate class name would be `MoviesDataset_Actors` (the default separator is an underscore). -However, if you prefer to have class names without the dataset prefix, skip `dataset_name` argument. +However, if you prefer to have class names without the dataset prefix, skip the `dataset_name` argument. For example: -```python +```py pipeline = dlt.pipeline( pipeline_name="movies", destination="weaviate", @@ -241,15 +241,15 @@ The default naming convention described above will preserve the casing of the pr in Weaviate but also requires that your input data does not have clashing property names when comparing case insensitive ie. (`caseName` == `casename`). In such case Weaviate destination will fail to create classes and report a conflict. -You can configure alternative naming convention which will lowercase all properties. The clashing properties will be merged and the classes created. 
Still if you have a document where clashing properties like: +You can configure an alternative naming convention which will lowercase all properties. The clashing properties will be merged and the classes created. Still, if you have a document where clashing properties like: ```json {"camelCase": 1, "CamelCase": 2} ``` it will be normalized to: -``` +```json {"camelcase": 2} ``` -so your best course of action is to clean up the data yourself before loading and use default naming convention. Nevertheless you can configure the alternative in `config.toml`: +so your best course of action is to clean up the data yourself before loading and use the default naming convention. Nevertheless, you can configure the alternative in `config.toml`: ```toml [schema] naming="dlt.destinations.weaviate.impl.ci_naming" @@ -291,12 +291,12 @@ Below is an example that configures the **contextionary** vectorizer. You can pu vectorizer="text2vec-contextionary" module_config={text2vec-contextionary = { vectorizeClassName = false, vectorizePropertyName = true}} ``` -You can find docker composer with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md) +You can find Docker Compose with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md) ### dbt support -Currently Weaviate destination does not support dbt. +Currently, Weaviate destination does not support dbt. ### Syncing of `dlt` state @@ -304,4 +304,4 @@ Weaviate destination supports syncing of the `dlt` state. - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md index a6d9fe78b6..641be9a106 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/insert-format.md @@ -4,30 +4,27 @@ description: The INSERT file format keywords: [insert values, file formats] --- -# SQL INSERT file format +# SQL INSERT File Format -This file format contains an INSERT...VALUES statement to be executed on the destination during the -`load` stage. +This file format contains an INSERT...VALUES statement to be executed on the destination during the `load` stage. Additional data types are stored as follows: -- `datetime` and `date` as ISO strings; -- `decimal` as text representation of decimal number; -- `binary` depends on the format accepted by the destination; -- `complex` depends on the format accepted by the destination. +- `datetime` and `date` are stored as ISO strings; +- `decimal` is stored as a text representation of a decimal number; +- `binary` storage depends on the format accepted by the destination; +- `complex` storage also depends on the format accepted by the destination. -This file format is -[compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. +This file format is [compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. -## Supported destinations +## Supported Destinations -Used by default by: **DuckDB**, **Postgres**, **Redshift**. +This format is used by default by: **DuckDB**, **Postgres**, **Redshift**. -Supported by: **filesystem**. +It is also supported by: **filesystem**. 
-By setting the `loader_file_format` argument to `insert_values` in the run command, the pipeline -will store your data in the INSERT format to the destination: +By setting the `loader_file_format` argument to `insert_values` in the run command, the pipeline will store your data in the INSERT format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="insert_values") ``` diff --git a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md index 34f636f88d..7467c6f639 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/jsonl.md @@ -4,29 +4,29 @@ description: The jsonl file format keywords: [jsonl, file formats] --- -# jsonl - JSON delimited +# jsonl - JSON Delimited -`JSON delimited` is a file format that stores several `JSON` documents in one file. The `JSON` +JSON Delimited is a file format that stores several JSON documents in one file. The JSON documents are separated by a new line. Additional data types are stored as follows: -- `datetime` and `date` as ISO strings; -- `decimal` as text representation of decimal number; -- `binary` is base64 encoded string; -- `HexBytes` is hex encoded string; +- `datetime` and `date` are stored as ISO strings; +- `decimal` is stored as a text representation of a decimal number; +- `binary` is stored as a base64 encoded string; +- `HexBytes` is stored as a hex encoded string; - `complex` is serialized as a string. This file format is [compressed](../../reference/performance.md#disabling-and-enabling-file-compression) by default. -## Supported destinations +## Supported Destinations -Used by default by: **BigQuery**, **Snowflake**, **filesystem**. +This format is used by default by: **BigQuery**, **Snowflake**, **filesystem**. By setting the `loader_file_format` argument to `jsonl` in the run command, the pipeline will store -your data in the jsonl format to the destination: +your data in the jsonl format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="jsonl") ``` diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index 4b0f63d22b..94aaaf4884 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -4,52 +4,41 @@ description: The parquet file format keywords: [parquet, file formats] --- -# Parquet file format +# Parquet File Format -[Apache Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a free and open-source -column-oriented data storage format in the Apache Hadoop ecosystem. `dlt` is able to store data in -this format when configured to do so. +[Apache Parquet](https://en.wikipedia.org/wiki/Apache_Parquet) is a free and open-source column-oriented data storage format in the Apache Hadoop ecosystem. `dlt` is capable of storing data in this format when configured to do so. -To use this format you need a `pyarrow` package. You can get this package as a `dlt` extra as well: +To use this format, you need a `pyarrow` package. 
You can get this package as a `dlt` extra as well: ```sh pip install dlt[parquet] ``` -## Supported destinations +## Supported Destinations Supported by: **BigQuery**, **DuckDB**, **Snowflake**, **filesystem**, **Athena** -By setting the `loader_file_format` argument to `parquet` in the run command, the pipeline will -store your data in the parquet format to the destination: +By setting the `loader_file_format` argument to `parquet` in the run command, the pipeline will store your data in the parquet format at the destination: -```python +```py info = pipeline.run(some_source(), loader_file_format="parquet") ``` ## Destination AutoConfig -`dlt` uses [destination capabilities](../../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) to configure parquet writer: -* uses decimal and wei precision to pick the right **decimal type** and sets precision and scale -* uses timestamp precision to pick right **timestamp type** resolution (seconds, micro or nano) +`dlt` uses [destination capabilities](../../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) to configure the parquet writer: +* It uses decimal and wei precision to pick the right **decimal type** and sets precision and scale. +* It uses timestamp precision to pick the right **timestamp type** resolution (seconds, micro, or nano). ## Options -Under the hood `dlt` uses the -[pyarrow parquet writer](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) -to create the files. The following options can be used to change the behavior of the writer: +Under the hood, `dlt` uses the [pyarrow parquet writer](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) to create the files. The following options can be used to change the behavior of the writer: -- `flavor`: Sanitize schema or set other compatibility options to work with various target systems. - Defaults to "spark". -- `version`: Determine which Parquet logical types are available for use, whether the reduced set - from the Parquet 1.x.x format or the expanded logical types added in later format versions. - Defaults to "2.4". -- `data_page_size`: Set a target threshold for the approximate encoded size of data pages within a - column chunk (in bytes). Defaults to "1048576". -- `timestamp_timezone`: A string specifying timezone, default is UTC +- `flavor`: Sanitize schema or set other compatibility options to work with various target systems. Defaults to "spark". +- `version`: Determine which Parquet logical types are available for use, whether the reduced set from the Parquet 1.x.x format or the expanded logical types added in later format versions. Defaults to "2.4". +- `data_page_size`: Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). Defaults to "1048576". +- `timestamp_timezone`: A string specifying timezone, default is UTC. -Read the -[pyarrow parquet docs](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) -to learn more about these settings. +Read the [pyarrow parquet docs](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html) to learn more about these settings. 
Example: @@ -62,9 +51,9 @@ data_page_size=1048576 timestamp_timezone="Europe/Berlin" ``` -or using environment variables: +Or using environment variables: -``` +```sh NORMALIZE__DATA_WRITER__FLAVOR NORMALIZE__DATA_WRITER__VERSION NORMALIZE__DATA_WRITER__DATA_PAGE_SIZE diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index d2ed03a2a2..e3a60dfa51 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -48,7 +48,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel 4. **Chain staging to destination and request `parquet` file format.** Pass the `staging` argument to `dlt.pipeline`. It works like the destination `argument`: - ```python + ```py # Create a dlt pipeline that will load # chess player data to the redshift destination # via staging on s3 @@ -60,7 +60,7 @@ In essence, you need to set up two destinations and then pass them to `dlt.pipel ) ``` `dlt` will automatically select an appropriate loader file format for the staging files. Below we explicitly specify `parquet` file format (just to demonstrate how to do it): - ```python + ```py info = pipeline.run(chess(), loader_file_format="parquet") ``` diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index b2b6b27fc3..42f31d4875 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -6,34 +6,34 @@ keywords: [transform, dbt, runner] # Transform the data with dbt -[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows simple structuring of your transformations into DAGs. The benefits of +[dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. The benefits of using dbt include: - End-to-end cross-db compatibility for dlt→dbt pipelines. -- Easy to use by SQL analysts, low learning curve. -- Highly flexible and configurable in usage, supports templating, can run backfills etc. -- Supports testing and accelerates troubleshooting. +- Ease of use by SQL analysts, with a low learning curve. +- High flexibility and configurability in usage, supports templating, can run backfills, etc. +- Support for testing and accelerated troubleshooting. ## dbt runner in dlt You can run dbt with `dlt` by using the dbt runner. -The dbt runner +The dbt runner: -- can create a virtual env for dbt on the fly; -- can run a dbt package from online (e.g. GitHub) or from local files; -- passes configuration and credentials to dbt, so you do not need to handle them separately from +- Can create a virtual env for dbt on the fly; +- Can run a dbt package from online sources (e.g., GitHub) or from local files; +- Passes configuration and credentials to dbt, so you do not need to handle them separately from `dlt`, enabling dbt to configure on the fly. ## How to use the dbt runner For an example of how to use the dbt runner, see the [jaffle shop example](https://github.com/dlt-hub/dlt/blob/devel/docs/examples/archive/dbt_run_jaffle.py). -Included below in another example where we run a `dlt` pipeline and then a dbt package via `dlt`: +Included below is another example where we run a `dlt` pipeline and then a dbt package via `dlt`: > 💡 Docstrings are available to read in your IDE. 
-```python +```py # load all pipedrive endpoints to pipedrive_raw dataset pipeline = dlt.pipeline( pipeline_name='pipedrive', @@ -81,7 +81,7 @@ for m in models: ``` ## How to run dbt runner without pipeline -You can use dbt runner without dlt pipeline. Example below will clone and run **jaffle shop** using a dbt profile that you supply. +You can use the dbt runner without a dlt pipeline. The example below will clone and run **jaffle shop** using a dbt profile that you supply. It assumes that dbt is installed in the current Python environment and the `profile.yml` is in the same folder as the Python script. ```py @@ -102,7 +102,7 @@ models = runner.run_all() ``` -Here's example **duckdb** profile +Here's an example **duckdb** profile ```yaml config: # do not track usage, do not create .user.yml @@ -128,4 +128,4 @@ If you want to transform the data before loading, you can use Python. If you wan data after loading, you can use dbt or one of the following: 1. [`dlt` SQL client.](../sql.md) -1. [Pandas.](../pandas.md) +2. [Pandas.](../pandas.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md index 1f658e4f95..d15c4eb84c 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt_cloud.md @@ -1,5 +1,5 @@ --- -title: Transforming the data with dbt Cloud +title: Transforming the Data with dbt Cloud description: Transforming the data loaded by a dlt pipeline with dbt Cloud keywords: [transform, sql] --- @@ -9,9 +9,9 @@ keywords: [transform, sql] ## API Client The DBT Cloud Client is a Python class designed to interact with the dbt Cloud API (version 2). -It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run status. +It provides methods to perform various operations on dbt Cloud, such as triggering job runs and retrieving job run statuses. -```python +```py from dlt.helpers.dbt_cloud import DBTCloudClientV2 # Initialize the client @@ -26,7 +26,7 @@ run_status = client.get_run_status(run_id=job_run_id) print(f"Job run status: {run_status['status_humanized']}") ``` -## Helper functions +## Helper Functions These Python functions provide an interface to interact with the dbt Cloud API. They simplify the process of triggering and monitoring job runs in dbt Cloud. @@ -36,7 +36,7 @@ They simplify the process of triggering and monitoring job runs in dbt Cloud. This function triggers a job run in dbt Cloud using the specified configuration. It supports various customization options and allows for monitoring the job's status. -```python +```py from dlt.helpers.dbt_cloud import run_dbt_cloud_job # Trigger a job run with default configuration @@ -53,19 +53,19 @@ status = run_dbt_cloud_job(job_id=1234, data=additional_data, wait_for_outcome=T ### `get_dbt_cloud_run_status()` -If you have already started job run and have a run ID, then you can use the `get_dbt_cloud_run_status` function. +If you have already started a job run and have a run ID, then you can use the `get_dbt_cloud_run_status` function. This function retrieves the full information about a specific dbt Cloud job run. It also supports options for waiting until the run is complete. 
-```python +```py from dlt.helpers.dbt_cloud import get_dbt_cloud_run_status # Retrieve status for a specific run status = get_dbt_cloud_run_status(run_id=1234, wait_for_outcome=True) ``` -## Set credentials +## Set Credentials ### secrets.toml @@ -74,29 +74,29 @@ When using a dlt locally, we recommend using the `.dlt/secrets.toml` method to s If you used the `dlt init` command, then the `.dlt` folder has already been created. Otherwise, create a `.dlt` folder in your working directory and a `secrets.toml` file inside it. -It's where you store sensitive information securely, like access tokens. Keep this file safe. +This is where you store sensitive information securely, like access tokens. Keep this file safe. Use the following format for dbt Cloud API authentication: ```toml [dbt_cloud] api_token = "set me up!" # required for authentication -account_id = "set me up!" # required for both helpers function -job_id = "set me up!" # optional only for run_dbt_cloud_job function (you can pass this explicitly as an argument to the function) -run_id = "set me up!" # optional for get_dbt_cloud_run_status (you can pass this explicitly as an argument to the function) +account_id = "set me up!" # required for both helper functions +job_id = "set me up!" # optional only for the run_dbt_cloud_job function (you can pass this explicitly as an argument to the function) +run_id = "set me up!" # optional for the get_dbt_cloud_run_status function (you can pass this explicitly as an argument to the function) ``` -### Environment variables +### Environment Variables -`dlt` supports reading credentials from environment. +`dlt` supports reading credentials from the environment. If dlt tries to read this from environment variables, it will use a different naming convention. -For environment variables all names are capitalized and sections are separated with double underscore "__". +For environment variables, all names are capitalized and sections are separated with a double underscore "__". -For example, for the above secrets, we would need to put into environment: +For example, for the above secrets, we would need to put into the environment: -``` +```sh DBT_CLOUD__API_TOKEN DBT_CLOUD__ACCOUNT_ID DBT_CLOUD__JOB_ID diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index 6ab98090ba..5a82d8be66 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -4,14 +4,14 @@ description: Transform the data loaded by a dlt pipeline with Pandas keywords: [transform, pandas] --- -# Transform the data with Pandas +# Transform the Data with Pandas -You can fetch results of any SQL query as a dataframe. If the destination is supporting that -natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to that, reading -dataframes may be really fast! The example below reads GitHub reactions data from the `issues` table and -counts reaction types. +You can fetch the results of any SQL query as a dataframe. If the destination supports that +natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading +dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and +counts the reaction types. 
-```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", @@ -27,15 +27,15 @@ with pipeline.sql_client() as client: counts = reactions.sum(0).sort_values(0, ascending=False) ``` -The `df` method above returns all the data in the cursor as data frame. You can also fetch data in -chunks by passing `chunk_size` argument to the `df` method. +The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in +chunks by passing the `chunk_size` argument to the `df` method. Once your data is in a Pandas dataframe, you can transform it as needed. -## Other transforming tools +## Other Transforming Tools If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use Pandas or one of the following: 1. [dbt.](dbt/dbt.md) (recommended) -1. [`dlt` SQL client.](sql.md) +2. [`dlt` SQL client.](sql.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index cc1576229b..ad37c61bd8 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -6,36 +6,40 @@ keywords: [transform, sql] # Transform the data using the `dlt` SQL client -A simple alternative to dbt is to query the data using the `dlt` SQL client and then performing the +A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the transformations using Python. The `execute_sql` method allows you to execute any SQL statement, -including statements that change the database schema or data in the tables. In the example below we -insert a row into `customers` table. Note that the syntax is the same as for any standard `dbapi` +including statements that change the database schema or data in the tables. In the example below, we +insert a row into the `customers` table. Note that the syntax is the same as for any standard `dbapi` connection. -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") try: with pipeline.sql_client() as client: client.sql_client.execute_sql( - f"INSERT INTO customers VALUES (%s, %s, %s)", + "INSERT INTO customers VALUES (%s, %s, %s)", 10, "Fred", "fred@fred.com" ) +except Exception: + ... ``` -In the case of SELECT queries, the data is returned as a list of row, with the elements of a row +In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row corresponding to selected columns. -```python +```py try: with pipeline.sql_client() as client: res = client.execute_sql( "SELECT id, name, email FROM customers WHERE id = %s", 10 ) - # prints columns values of first row + # prints column values of the first row print(res[0]) +except Exception: + ... ``` ## Other transforming tools @@ -44,4 +48,4 @@ If you want to transform the data before loading, you can use Python. If you wan data after loading, you can use SQL or one of the following: 1. [dbt](dbt/dbt.md) (recommended). -1. [Pandas.](pandas.md) +2. 
[Pandas.](pandas.md) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index 0baf1917d1..a920b21a03 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -45,7 +45,7 @@ Sources and resources that can be loaded using this verified source are: Upon logging into Airtable and accessing your base or table, you'll notice a URL in your browser's address bar resembling: -```bash +```sh https://airtable.com/appve10kl227BIT4GV/tblOUnZVLFWbemTP1/viw3qtF76bRQC3wKx/rec9khXgeTotgCQ62?blocks=hide ``` @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init airtable duckdb ``` @@ -116,20 +116,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python airtable_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -147,13 +147,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrieves tables from given Airtable base. -```python +```py @dlt.source def airtable_source( base_id: str = dlt.config.value, table_names: Optional[List[str]] = None, access_token: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` `base_id`: The base's unique identifier. @@ -167,12 +168,13 @@ tables in the schema are loaded. This function retrieves data from a single Airtable table. -```python +```py def airtable_resource( api: pyairtable.Api, base_id: str, table: Dict[str, Any], ) -> DltResource: + ... ``` `table`: Airtable metadata, excluding actual records. @@ -186,7 +188,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="airtable", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -196,16 +198,16 @@ verified source. 1. To load the entire base: - ```python + ```py base_id = "Please set me up!" # The id of the base. - airtables = airtable_source(base_id=base_id)) + airtables = airtable_source(base_id=base_id) load_info = pipeline.run(load_data, write_disposition="replace") ``` 1. To load selected tables from a base table: - ```python + ```py base_id = "Please set me up!" # The id of the base. table_names = ["Table1","Table2"] # A list of table IDs or table names to load. @@ -221,7 +223,7 @@ verified source. 1. To load data and apply hints to a specific column: - ```python + ```py base_id = "Please set me up!" # The id of the base. table_names = ["Table1","Table2"] # A list of table IDs or table names to load. resource_name = "Please set me up!" # The table name we want to apply hints. 
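The hunk above is cut off before the actual hint is applied. As a rough sketch of how such a snippet typically continues — the `resource_name`, `field_name` value and the timestamp hint below are illustrative assumptions, and the import path mirrors `airtable_pipeline.py` rather than this diff:

```py
import dlt
from airtable import airtable_source  # import as used in airtable_pipeline.py

pipeline = dlt.pipeline(
    pipeline_name="airtable", destination="duckdb", dataset_name="airtable_data"
)

base_id = "Please set me up!"       # the id of the base
table_names = ["Table1", "Table2"]  # table IDs or table names to load
resource_name = "Table1"            # the table we want to apply hints to
field_name = "created_at"           # hypothetical column to type as a timestamp

airtables = airtable_source(base_id=base_id, table_names=table_names)
# apply a data type hint to one column of the selected resource
airtables.resources[resource_name].apply_hints(
    columns={field_name: {"data_type": "timestamp"}},
)
load_info = pipeline.run(airtables, write_disposition="replace")
print(load_info)
```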
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md index 2fb97ff320..2894c15b5e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/amazon_kinesis.md @@ -9,7 +9,7 @@ keywords: [amazon kinesis, verified source] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Amazon Kinesis](https://docs.aws.amazon.com/streams/latest/dev/key-concepts.html) is a cloud-based @@ -36,7 +36,7 @@ You can check out our pipeline example ### Grab credentials -To use this verified source you need AWS `Access key` and `Secret access key`, that can be obtained +To use this verified source, you need an AWS `Access key` and `Secret access key`, which can be obtained as follows: 1. Sign in to your AWS Management Console. @@ -57,7 +57,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init kinesis duckdb ``` @@ -110,19 +110,19 @@ For more information, read [Credentials](../../general-usage/credentials). 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python kinesis_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `kinesis_pipeline`, you may + For example, the `pipeline_name` for the above pipeline example is `kinesis_pipeline`. You may also use any custom name instead. For more information, read [Run a pipeline.](../../walkthroughs/run-a-pipeline) @@ -138,7 +138,7 @@ This resource reads a Kinesis stream and yields messages. It supports [incremental loading](../../general-usage/incremental-loading) and parses messages as json by default. -```python +```py @dlt.resource( name=lambda args: args["stream_name"], primary_key="_kinesis_msg_id", @@ -156,6 +156,7 @@ def kinesis_stream( parse_json: bool = True, chunk_size: int = 1000, ) -> Iterable[TDataItem]: + ... ``` `stream_name`: Name of the Kinesis stream. Defaults to config/secrets if unspecified. @@ -178,7 +179,7 @@ def kinesis_stream( You create a resource `kinesis_stream` by passing the stream name and a few other options. The resource will have the same name as the stream. When you iterate this resource (or pass it to -`pipeline.run` records) it will query Kinesis for all the shards in the requested stream. For each +`pipeline.run` records), it will query Kinesis for all the shards in the requested stream. For each shard, it will create an iterator to read messages: 1. If `initial_at_timestamp` is present, the resource will read all messages after this timestamp. @@ -192,7 +193,7 @@ will load messages incrementally: 1. For shards that didn't have messages (or new shards), the last run time is used to get messages. 
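A minimal sketch of the behavior described in the list above — the first run reads from the supplied timestamp, later runs resume from the per-shard state. The stream name is a placeholder and the import path follows `kinesis_pipeline.py`; treat both as assumptions:

```py
import dlt
from dlt.common import pendulum
from kinesis import kinesis_stream  # import as used in kinesis_pipeline.py

pipeline = dlt.pipeline(
    pipeline_name="kinesis_pipeline", destination="duckdb", dataset_name="kinesis"
)

# first run: read all messages newer than three hours ago;
# the resource takes its name from the stream
my_stream = kinesis_stream(
    "my-kinesis-stream",  # hypothetical stream name
    initial_at_timestamp=pendulum.now().subtract(hours=3),
)
print(pipeline.run(my_stream))

# subsequent runs of the same script resume from the sequence numbers stored
# in the pipeline state, so only new messages are loaded
```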
Please check the `kinesis_stream` [docstring](https://github.com/dlt-hub/verified-sources/blob/master/sources/kinesis/__init__.py#L31-L46) -for additional options, i.e. to limit the number of messages +for additional options, i.e., to limit the number of messages returned or to automatically parse JSON messages. ### Kinesis message format @@ -212,7 +213,7 @@ verified source. 1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="kinesis_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -220,9 +221,9 @@ verified source. ) ``` -1. To load messages from a stream from last one hour: +1. To load messages from a stream from the last one hour: - ```python + ```py # the resource below will take its name from the stream name, # it can be used multiple times by default it assumes that Data is json and parses it, # here we disable that to just get bytes in data elements of the message @@ -237,7 +238,7 @@ verified source. 1. For incremental Kinesis streams, to fetch only new messages: - ```python + ```py #running pipeline will get only new messages info = pipeline.run(kinesis_stream_data) message_counts = pipeline.last_trace.last_normalize_info.row_counts @@ -249,7 +250,7 @@ verified source. 1. To parse json with a simple decoder: - ```python + ```py def _maybe_parse_json(item: TDataItem) -> TDataItem: try: item.update(json.loadb(item["data"])) @@ -263,7 +264,7 @@ verified source. 1. To read Kinesis messages and send them somewhere without using a pipeline: - ```python + ```py from dlt.common.configuration.container import Container from dlt.common.pipeline import StateInjectableContext diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index df968422d7..915a9d297a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -25,7 +25,7 @@ To write an Arrow source, pass any `pyarrow.Table`, `pyarrow.RecordBatch` or `pa This example loads a Pandas dataframe to a Snowflake table: -```python +```py import dlt from dlt.common import pendulum import pandas as pd @@ -45,7 +45,7 @@ pipeline.run(df, table_name="orders") A `pyarrow` table can be loaded in the same way: -```python +```py import pyarrow as pa # Create dataframe and pipeline same as above @@ -96,7 +96,7 @@ Usage is the same as without other dlt resources. Refer to the [incremental load Example: -```python +```py import dlt from dlt.common import pendulum import pandas as pd @@ -144,7 +144,7 @@ All struct types are represented as `complex` and will be loaded as JSON (if des even if they are present in the destination. If you want to represent nested data as separated tables, you must yield panda frames and arrow tables as records. In the examples above: -```python +```py # yield panda frame as records pipeline.run(df.to_dict(orient='records'), table_name="orders") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md index 8554cdd376..9e3ee9c8fe 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md @@ -56,7 +56,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```sh dlt init asana_dlt duckdb ``` @@ -94,16 +94,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python asana_dlt_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `asana`, you may also use any @@ -127,7 +127,7 @@ it is important to note the complete list of the default endpoints given in This is a `dlt.source` function, which returns a list of DltResource objects: "workspaces", "projects", "sections","tags","tasks","stories", "teams", and "users". -```python +```py @dlt.source def asana_source(access_token: str = dlt.secrets.value) -> Any: return [ @@ -142,7 +142,7 @@ def asana_source(access_token: str = dlt.secrets.value) -> Any: This is a `dlt.resource` function, which returns collections of tasks and related information. -```python +```py @dlt.resource(write_disposition="replace") def workspaces( access_token: str = dlt.secrets.value, @@ -171,7 +171,7 @@ transformer functions transform or process data from one or more resources. The transformer function `projects` process data from the `workspaces` resource. It fetches and returns a list of projects for a given workspace from Asana. -```python +```py @dlt.transformer( data_from=workspaces, write_disposition="replace", @@ -200,7 +200,7 @@ It uses `@dlt.defer` decorator to enable parallel run in thread pool. This [incremental](../../general-usage/incremental-loading.md) resource-transformer fetches all tasks for a given project from Asana. -```python +```py @dlt.transformer(data_from=projects, write_disposition="merge", primary_key="gid") def tasks( project_array: t.List[TDataItem], @@ -235,7 +235,7 @@ these steps: 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="asana_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -248,13 +248,13 @@ these steps: 1. To load the data from all the fields, you can utilise the `asana_source` method as follows: - ```python + ```py load_data = asana_source() ``` 1. Use the method `pipeline.run()` to execute the pipeline. - ```python + ```py load_info = pipeline.run(load_data) # print the information on data that was loaded print(load_info) @@ -263,7 +263,7 @@ these steps: 1. To use the method `pipeline.run()` to load custom endpoints “workspaces” and “projects”, the above script may be modified as: - ```python + ```py load_info = pipeline.run(load_data.with_resources("workspaces", "projects")) # print the information on data that was loaded print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md index 7f01b83f08..2341680d97 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md @@ -36,7 +36,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```sh dlt init chess duckdb ``` @@ -66,20 +66,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python chess_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -98,7 +98,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This is a `dlt.source` function for the Chess.com API named "chess", which returns a sequence of DltResource objects. That we'll discuss in subsequent sections as resources. -```python +```py dlt.source(name="chess") def source( players: List[str], start_month: str = None, end_month: str = None @@ -120,7 +120,7 @@ to fetch game data (in "YYYY/MM" format). This is a `dlt.resource` function, which returns player profiles for a list of player usernames. -```python +```py @dlt.resource(write_disposition="replace") def players_profiles(players: List[str]) -> Iterator[TDataItem]: @@ -138,7 +138,7 @@ It uses `@dlt.defer` decorator to enable parallel run in thread pool. This is a `dlt.resource` function, which returns url to game archives for specified players. -```python +```py @dlt.resource(write_disposition="replace", selected=False) def players_archives(players: List[str]) -> Iterator[List[TDataItem]]: ... @@ -154,7 +154,7 @@ runs. This incremental resource takes data from players and returns games for the last month if not specified otherwise. -```python +```py @dlt.resource(write_disposition="append") def players_games( players: List[str], start_month: str = None, end_month: str = None @@ -186,7 +186,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="chess_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -199,7 +199,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. To load the data from all the resources for specific players (e.g. for November), you can utilise the `source` method as follows: - ```python + ```py # Loads games for Nov 2022 data = source( ["magnuscarlsen", "vincentkeymer", "dommarajugukesh", "rpragchess"], @@ -210,7 +210,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. Use the method `pipeline.run()` to execute the pipeline. - ```python + ```py info = pipeline.run(data) # print the information on data that was loaded print(info) @@ -219,7 +219,7 @@ To create your data loading pipeline for players and load data, follow these ste 1. 
To load data from specific resources like "players_games" and "player_profiles", modify the above code as: - ```python + ```py info = pipeline.run(data.with_resources("players_games", "players_profiles")) # print the information on data that was loaded print(info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md index dea97921b4..0a0c64fb30 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md @@ -66,9 +66,9 @@ By default, Facebook access tokens have a short lifespan of one hour. To exchang Facebook access token for a long-lived token, update the `.dlt/secrets.toml` with client_id, and client_secret and execute the provided Python code. -```python +```py from facebook_ads import get_long_lived_token -print(get_long_lived_token("your short-lived token") +print(get_long_lived_token("your short-lived token")) ``` Replace the `access_token` in the `.dlt/secrets.toml` file with the long-lived token obtained from @@ -77,7 +77,7 @@ the above code snippet. To retrieve the expiry date and the associated scopes of the token, you can use the following command: -```python +```py from facebook_ads import debug_access_token debug_access_token() ``` @@ -88,7 +88,7 @@ level. In `config.toml` / `secrets.toml`: ```toml [sources.facebook_ads] -access_token_expires_at=1688821881... +access_token_expires_at=1688821881 ``` > Note: The Facebook UI, which is described here, might change. @@ -101,7 +101,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init facebook_ads duckdb ``` @@ -158,16 +158,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python facebook_ads_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `facebook_ads`, you may also @@ -191,7 +191,7 @@ it is important to note the complete list of the default endpoints given in This function returns a list of resources to load campaigns, ad sets, ads, creatives, and ad leads data from Facebook Marketing API. -```python +```py @dlt.source(name="facebook_ads") def facebook_ads_source( account_id: str = dlt.config.value, @@ -200,6 +200,7 @@ def facebook_ads_source( request_timeout: float = 300.0, app_api_version: str = None, ) -> Sequence[DltResource]: + ... ``` `account_id`: Account id associated with add manager, configured in "config.toml". @@ -220,7 +221,7 @@ were issued i.e. 'v17.0'. Defaults to the _facebook_business_ library default ve The ads function fetches ad data. It retrieves ads from a specified account with specific fields and states. -```python +```py @dlt.resource(primary_key="id", write_disposition="replace") def ads( fields: Sequence[str] = DEFAULT_AD_FIELDS, @@ -254,7 +255,7 @@ The default fields are defined in This function returns a list of resources to load facebook_insights. 
-```python +```py @dlt.source(name="facebook_ads") def facebook_insights_source( account_id: str = dlt.config.value, @@ -271,6 +272,7 @@ def facebook_insights_source( request_timeout: int = 300, app_api_version: str = None, ) -> DltResource: + ... ``` `account_id`: Account id associated with ads manager, configured in _config.toml_. @@ -315,13 +317,14 @@ were issued i.e. 'v17.0'. Defaults to the facebook_business library default vers This function fetches Facebook insights data incrementally from a specified start date until the current date, in day steps. -```python +```py @dlt.resource(primary_key=INSIGHTS_PRIMARY_KEY, write_disposition="merge") def facebook_insights( date_start: dlt.sources.incremental[str] = dlt.sources.incremental( "date_start", initial_value=initial_load_start_date_str ) ) -> Iterator[TDataItems]: + ... ``` `date_start`: Parameter sets the initial value for the "date_start" parameter in @@ -337,7 +340,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="facebook_ads", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -350,7 +353,7 @@ verified source. 1. To load all the data from, campaigns, ad sets, ads, ad creatives and leads. - ```python + ```py load_data = facebook_ads_source() load_info = pipeline.run(load_data) print(load_info) @@ -359,7 +362,7 @@ verified source. 1. To merge the Facebook Ads with the state “DISAPPROVED” and with ads state “PAUSED” you can do the following: - ```python + ```py load_data = facebook_ads_source() # It is recommended to enable root key propagation on a source that is not a merge one by default. this is not required if you always use merge but below we start with replace load_data.root_key = True @@ -382,7 +385,7 @@ verified source. 1. To load data with a custom field, for example, to load only “id” from Facebook ads, you can do the following: - ```python + ```py load_data = facebook_ads_source() # Only loads add ids, works the same for campaigns, leads etc. load_data.ads.bind(fields=("id",)) @@ -395,7 +398,7 @@ verified source. demonstrates how to enrich objects by adding an enrichment transformation that includes additional fields. - ```python + ```py # You can reduce the chunk size for smaller requests load_data = facebook_ads_source(chunk_size=2) @@ -429,7 +432,7 @@ verified source. breakdowns, etc. As defined in the `facebook_insights_source`. This function generates daily reports for a specified number of past days. - ```python + ```py load_data = facebook_insights_source( initial_load_past_days=30, attribution_window_days_lag= 7, diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md index aed19838ef..bf3d23d0a3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md @@ -81,7 +81,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init filesystem duckdb ``` @@ -150,32 +150,32 @@ For more information, read the 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. 
Install optional modules: - For AWS S3: - ```bash + ```sh pip install s3fs ``` - For Azure blob: - ```bash + ```sh pip install adlfs>=2023.9.0 ``` - GCS storage: No separate module needed. 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python filesystem_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -197,13 +197,14 @@ This source offers chunked file readers as resources, which can be optionally cu - `read_jsonl()` - `read_parquet()` -```python +```py @dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource) def readers( bucket_url: str = dlt.secrets.value, credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value, file_glob: Optional[str] = "*", ) -> Tuple[DltResource, ...]: + ... ``` - `bucket_url`: The url to the bucket. @@ -225,7 +226,7 @@ This resource lists files in `bucket_url` based on the `file_glob` pattern, retu [FileItem](https://github.com/dlt-hub/dlt/blob/devel/dlt/common/storages/fsspec_filesystem.py#L22) with data access methods. These can be paired with transformers for enhanced processing. -```python +```py @dlt.resource( primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True ) @@ -236,6 +237,7 @@ def filesystem( files_per_page: int = DEFAULT_CHUNK_SIZE, extract_content: bool = False, ) -> Iterator[List[FileItem]]: + ... ``` - `bucket_url`: URL of the bucket. @@ -256,9 +258,9 @@ in bucket URL. To load data into a specific table (instead of the default filesystem table), see the snippet below: -```python +```py @dlt.transformer(standalone=True) -def read_csv(items, chunksize: int = 15) ->: +def read_csv(items, chunksize: int = 15): """Reads csv file with Pandas chunk by chunk.""" ... @@ -275,7 +277,7 @@ Use the [standalone filesystem](../../general-usage/resource#declare-a-standalone-resource) resource to list files in s3, GCS, and Azure buckets. This allows you to customize file readers or manage files using [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html). -```python +```py files = filesystem(bucket_url="s3://my_bucket/data", file_glob="csv_folder/*.csv") pipeline.run(files) ``` @@ -327,7 +329,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="standard_filesystem", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -337,17 +339,17 @@ verified source. 1. To read and load CSV files: - ```python + ```py BUCKET_URL = "YOUR_BUCKET_PATH_HERE" # path of the bucket url or local destination met_files = readers( bucket_url=BUCKET_URL, file_glob="directory/*.csv" - ).read_csv() - # tell dlt to merge on date - met_files.apply_hints(write_disposition="merge", merge_key="date") - # We load the data into the met_csv table - load_info = pipeline.run(met_files.with_name("table_name")) - print(load_info) - print(pipeline.last_trace.last_normalize_info) + ).read_csv() + # tell dlt to merge on date + met_files.apply_hints(write_disposition="merge", merge_key="date") + # We load the data into the met_csv table + load_info = pipeline.run(met_files.with_name("table_name")) + print(load_info) + print(pipeline.last_trace.last_normalize_info) ``` - The `file_glob` parameter targets all CSVs in the "met_csv/A801" directory. 
@@ -358,7 +360,7 @@ verified source. ::: 1. To load only new CSV files with [incremental loading](../../general-usage/incremental-loading): - ```python + ```py # This configuration will only consider new csv files new_files = filesystem(bucket_url=BUCKET_URL, file_glob="directory/*.csv") # add incremental on modification time @@ -369,7 +371,7 @@ verified source. ``` 1. To read and load Parquet and JSONL from a bucket: - ```python + ```py jsonl_reader = readers(BUCKET_URL, file_glob="**/*.jsonl").read_jsonl( chunksize=10000 ) @@ -391,7 +393,7 @@ verified source. 1. To set up a pipeline that reads from an Excel file using a standalone transformer: - ```python + ```py # Define a standalone transformer to read data from an Excel file. @dlt.transformer(standalone=True) def read_excel( @@ -427,7 +429,7 @@ verified source. 1. To copy files locally, add a step in the filesystem resource and then load the listing to the database: - ```python + ```py def _copy(item: FileItemDict) -> FileItemDict: # instantiate fsspec and copy file dest_file = os.path.join(local_folder, item["file_name"]) @@ -459,7 +461,7 @@ verified source. You can get a fsspec client from filesystem resource after it was extracted i.e. in order to delete processed files etc. The filesystem module contains a convenient method `fsspec_from_resource` that can be used as follows: - ```python + ```py from filesystem import filesystem, fsspec_from_resource # get filesystem source gs_resource = filesystem("gs://ci-test-bucket/") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md index 2fd0277500..4c9a322760 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init github duckdb ``` @@ -110,16 +110,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python github_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `github_reactions`, you may @@ -137,7 +137,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This `dlt.source` function uses GraphQL to fetch DltResource objects: issues and pull requests along with associated reactions, comments, and reactions to comments. -```python +```py @dlt.source def github_reactions( owner: str, @@ -147,6 +147,7 @@ def github_reactions( max_items: int = None, max_item_age_seconds: float = None, ) -> Sequence[DltResource]: + ... ``` `owner`: Refers to the owner of the repository. @@ -169,7 +170,7 @@ yet to be implemented. Defaults to None. The `dlt.resource` function employs the `_get_reactions_data` method to retrieve data about issues, their associated comments, and subsequent reactions. -```python +```py dlt.resource( _get_reactions_data( "issues", @@ -193,11 +194,12 @@ on event type. It loads new events only and appends them to tables. 
> Note: Github allows retrieving up to 300 events for public repositories, so frequent updates are > recommended for active repos. -```python +```py @dlt.source(max_table_nesting=2) def github_repo_events( owner: str, name: str, access_token: str = None ) -> DltResource: + ... ``` `owner`: Refers to the owner of the repository. @@ -216,13 +218,14 @@ Read more about [nesting levels](../../general-usage/source#reduce-the-nesting-l This `dlt.resource` function serves as the resource for the `github_repo_events` source. It yields repository events as data items. -```python +```py dlt.resource(primary_key="id", table_name=lambda i: i["type"]) # type: ignore def repo_events( last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental( "created_at", initial_value="1970-01-01T00:00:00Z", last_value_func=max ) ) -> Iterator[TDataItems]: + ... ``` `primary_key`: Serves as the primary key, instrumental in preventing data duplication. @@ -244,7 +247,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -258,7 +261,7 @@ verified source. 1. To load all the data from repo on issues, pull requests, their comments and reactions, you can do the following: - ```python + ```py load_data = github_reactions("duckdb", "duckdb") load_info = pipeline.run(load_data) print(load_info) @@ -267,7 +270,7 @@ verified source. 1. To load only the first 100 issues, you can do the following: - ```python + ```py load_data = github_reactions("duckdb", "duckdb", max_items=100) load_info = pipeline.run(load_data.with_resources("issues")) print(load_info) @@ -276,7 +279,7 @@ verified source. 1. You can use fetch and process repo events data incrementally. It loads all data during the first run and incrementally in subsequent runs. - ```python + ```py load_data = github_repo_events( "duckdb", "duckdb", access_token=os.getenv(ACCESS_TOKEN) ) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md index 02d7803a9b..2d8be0b15d 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_analytics.md @@ -12,7 +12,7 @@ or application. This Google Analytics `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_analytics_pipeline.py) -loads data using “Google Analytics API” to the destination of your choice. +loads data using the "Google Analytics API" to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -29,7 +29,7 @@ Sources and resources that can be loaded using this verified source are: There are two methods to get authenticated for using this verified source: - OAuth credentials -- Service account credential +- Service account credentials Let's go over how to set up both OAuth tokens and service account credentials. In general, OAuth tokens are preferred when user consent is required, while service account credentials are better @@ -39,14 +39,14 @@ requirement. ### Grab Google service account credentials You need to create a GCP service account to get API credentials if you don't have one. To create - one, follow these steps: +one, follow these steps: 1. 
Sign in to [console.cloud.google.com](http://console.cloud.google.com/). 1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#creating) if needed. -1. Enable "Google Analytics API", refer +1. Enable the "Google Analytics API". Refer to the [Google documentation](https://support.google.com/googleapi/answer/6158841?hl=en) for comprehensive instructions on this process. @@ -58,7 +58,7 @@ You need to create a GCP service account to get API credentials if you don't hav 1. Create a new JSON key by selecting "Manage Keys" > "ADD KEY" > "CREATE". 1. You can download the ".json" file containing the necessary credentials for future use. -### Grab google OAuth credentials +### Grab Google OAuth credentials You need to create a GCP account to get OAuth credentials if you don't have one. To create one, follow these steps: @@ -69,31 +69,31 @@ follow these steps: 1. Enable the Analytics API in the project. -1. Search credentials in the search bar and go to Credentials. +1. Search for credentials in the search bar and go to Credentials. 1. Go to Credentials -> OAuth client ID -> Select Desktop App from the Application type and give an appropriate name. -1. Download the credentials and fill "client_id", "client_secret" and "project_id" in +1. Download the credentials and fill in "client_id", "client_secret", and "project_id" in "secrets.toml". 1. Go back to credentials and select the OAuth consent screen on the left. -1. Fill in the App name, user support email(your email), authorized domain (localhost.com), and dev +1. Fill in the App name, user support email (your email), authorized domain (localhost.com), and dev contact info (your email again). 1. Add the following scope: - ``` + ```text "https://www.googleapis.com/auth/analytics.readonly" ``` 1. Add your email as a test user. -After configuring "client_id", "client_secret" and "project_id" in "secrets.toml". To generate the +After configuring "client_id", "client_secret", and "project_id" in "secrets.toml", to generate the refresh token, run the following script from the root folder: -```bash +```sh python google_analytics/setup_script_gcp_oauth.py ``` @@ -128,7 +128,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init google_analytics duckdb ``` @@ -214,16 +214,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python google_analytics_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is @@ -239,9 +239,9 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `simple_load` This function returns a list of resources including metadata, metrics, and dimensions data from -Google Analytics API. +the Google Analytics API. -```python +```py @dlt.source(max_table_nesting=2) def google_analytics( credentials: Union[ GcpOAuthCredentials, GcpServiceAccountCredential ] = dlt.secrets.value, @@ -250,6 +250,7 @@ def google_analytics( start_date: Optional[str] = START_DATE, rows_per_page: int = 1000, ) -> List[DltResource]: + ... 
``` `credentials`: GCP OAuth or service account credentials. @@ -269,9 +270,10 @@ set to 1000. This function retrieves all the metrics and dimensions for a report from a Google Analytics project. -```python +```py @dlt.resource(selected=False) def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: + ... ``` `client`: This is the Google Analytics client used to make requests. @@ -284,7 +286,7 @@ def get_metadata(client: Resource, property_id: int) -> Iterator[Metadata]: This transformer function extracts data using metadata and populates a table called "metrics" with the data from each metric. -```python +```py @dlt.transformer(data_from=get_metadata, write_disposition="replace", name="metrics") def metrics_table(metadata: Metadata) -> Iterator[TDataItem]: for metric in metadata.metrics: @@ -293,7 +295,7 @@ def metrics_table(metadata: Metadata) -> Iterator[TDataItem]: `metadata`: GA4 metadata is stored in this "Metadata" class object. -Similarly, there is a transformer function called `dimensions_table` that populates table called +Similarly, there is a transformer function called `dimensions_table` that populates a table called "dimensions" with the data from each dimension. ## Customization @@ -304,7 +306,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="google_analytics", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -317,7 +319,7 @@ verified source. 1. To load all the data from metrics and dimensions: - ```python + ```py load_data = google_analytics() load_info = pipeline.run(load_data) print(load_info) @@ -328,9 +330,9 @@ verified source. 1. To load data from a specific start date: - ```python + ```py load_data = google_analytics(start_date='2023-01-01') - load_info = pipeline.run(load_data). + load_info = pipeline.run(load_data) print(load_info) ``` @@ -349,4 +351,4 @@ verified source. - [Load data from Google Analytics to Databricks in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-databricks) - [Load data from Google Analytics to PostgreSQL in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-postgres) - [Load data from Google Analytics to AWS Athena in python with dlt](https://dlthub.com/docs/pipelines/google_analytics/load-data-with-python-from-google_analytics-to-athena) - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 2a5d4b03ab..be12f5aea4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -87,7 +87,7 @@ follow these steps: 1. Add the following scope: - ``` + ```text "https://www.googleapis.com/auth/spreadsheets.readonly" ``` @@ -98,7 +98,7 @@ follow these steps: After configuring "client_id", "client_secret" and "project_id" in "secrets.toml". 
To generate the refresh token, run the following script from the root folder: - ```bash + ```sh python google_sheets/setup_script_gcp_oauth.py ``` @@ -128,13 +128,13 @@ following: When setting up the pipeline, you can use either the browser-copied URL of your spreadsheet: -```bash +```sh https://docs.google.com/spreadsheets/d/1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4/edit?usp=sharing ``` or spreadsheet id (which is a part of the url) -```bash +```sh 1VTtCiYgxjAwcIw7UM1_BSaxC3rzIpr0HwXZwd2OlPD4 ``` @@ -183,7 +183,7 @@ converted into tables, named after them and stored in the destination. 1. In range_names, you can enter as follows: - ``` + ```text range_names = ["Range_1","Range_2","Sheet1!A1:D10"] ``` @@ -214,7 +214,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init google_sheets duckdb ``` @@ -296,20 +296,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python google_sheets_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -328,7 +328,7 @@ Also, since recently `dlt`'s no longer recognizing date and time types, so you h Use the `apply_hints` method on the resource to achieve this. Here's how you can do it: -```python +```py for resource in resources: resource.apply_hints(columns={ "total_amount": {"data_type": "double"}, @@ -340,7 +340,7 @@ This will ensure that all values in the `total_amount` column are treated as `do And `date` column will be represented as dates, not integers. For a single resource (e.g. `Sheet1`), you can simply use: -```python +```py source.Sheet1.apply_hints(columns={ "total_amount": {"data_type": "double"}, "date": {"data_type": "timestamp"}, @@ -348,7 +348,7 @@ source.Sheet1.apply_hints(columns={ ``` To get the name of resources, you can use: -```python +```py print(source.resources.keys()) ``` @@ -371,7 +371,7 @@ or set `full_refresh=True`. This function loads data from a Google Spreadsheet. It retrieves data from all specified ranges, whether explicitly defined or named, and obtains metadata for the first two rows within each range. -```python +```py def google_spreadsheet( spreadsheet_url_or_id: str = dlt.config.value, range_names: Sequence[str] = dlt.config.value, @@ -381,6 +381,7 @@ def google_spreadsheet( get_sheets: bool = False, get_named_ranges: bool = True, ) -> Iterable[DltResource]: + ... ``` `spreadsheet_url_or_id`: ID or URL of the Google Spreadsheet. @@ -399,7 +400,7 @@ def google_spreadsheet( This function processes each range name provided by the source function, loading its data into separate tables in the destination. -```python +```py dlt.resource( process_range(rows_data, headers=headers, data_types=data_types), name=name, @@ -429,7 +430,7 @@ This table refreshes after each load, storing information on loaded ranges: - Range name as given to the source. - String and parsed representation of the loaded range. -```python +```py dlt.resource( metadata_table, write_disposition="merge", @@ -457,7 +458,7 @@ verified source. 1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="google_sheets", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -467,7 +468,7 @@ verified source. 1. To load data from explicit range names: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["range_name1", "range_name2"], # Range names @@ -483,7 +484,7 @@ verified source. 1. To load all the range_names from spreadsheet: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=False, @@ -497,7 +498,7 @@ verified source. 1. To load all the sheets from spreadsheet: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=True, @@ -511,7 +512,7 @@ verified source. 1. To load all the sheets and range_names: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL get_sheets=True, @@ -525,7 +526,7 @@ verified source. 1. To load data from multiple spreadsheets: - ```python + ```py load_data1 = google_spreadsheet( "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["Sheet 1!A1:B10"], @@ -543,7 +544,7 @@ verified source. 1. To load with table rename: - ```python + ```py load_data = google_spreadsheet( "https://docs.google.com/spreadsheets/d/43lkHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580/edit#gid=0", #Spreadsheet URL range_names=["Sheet 1!A1:B10"], @@ -554,7 +555,6 @@ verified source. load_info = pipeline.run(load_data) print(load_info) - } ``` ### Using Airflow with Google Spreadsheets: @@ -583,7 +583,7 @@ Below is the correct way to set up an Airflow DAG for this purpose: - When adding the Google Spreadsheet task to the pipeline, avoid decomposing it; run it as a single task for efficiency. -```python +```py @dag( schedule_interval='@daily', start_date=pendulum.datetime(2023, 2, 1), diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md index 3a623c7b49..8a6e1d1bb3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md @@ -55,7 +55,7 @@ Follow these steps: - Read scopes for CMS, CRM, and Settings. - Permissions for: - ``` + ```text business-intelligence, actions, crm.export, e-commerce, oauth, tickets ``` @@ -74,7 +74,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init hubspot duckdb ``` @@ -115,16 +115,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python hubspot_pipeline.py ``` 1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may @@ -148,12 +148,13 @@ it is important to note the complete list of the default endpoints given in This function returns a list of resources to load companies, contacts, deals, tickets, products, and web analytics events data into the destination. -```python +```py @dlt.source(name="hubspot") def hubspot( api_key: str = dlt.secrets.value, include_history: bool = False, ) -> Sequence[DltResource]: + ... ``` `api_key`: The key used to authenticate with the HubSpot API. Configured in "secrets.toml". @@ -166,7 +167,7 @@ specified entities. This resource function fetches data from the "companies" endpoint and loads it to the destination, replacing any existing data. -```python +```py @dlt.resource(name="companies", write_disposition="replace") def companies( api_key: str = api_key, @@ -195,7 +196,7 @@ in addition to the custom properties. Similar to this, resource functions "conta This function loads web analytics events for specific objects from Hubspot API into the destination. -```python +```py @dlt.resource def hubspot_events_for_objects( object_type: THubspotObjectType, @@ -203,6 +204,7 @@ def hubspot_events_for_objects( api_key: str = dlt.secrets.value, start_date: pendulum.DateTime = STARTDATE, ) -> DltResource: + ... ``` `object_type`: One of the Hubspot object types as defined in @@ -225,7 +227,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="hubspot", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -238,7 +240,7 @@ verified source. 1. To load all the data from contacts, companies, deals, products, tickets, and quotes into the destination. - ```python + ```py load_data = hubspot() load_info = pipeline.run(load_data) print(load_info) @@ -246,7 +248,7 @@ verified source. 1. To load data from contacts and companies, with time history using "with_resources" method. - ```python + ```py load_data = hubspot(include_history=True).with_resources("companies","contacts") load_info = pipeline.run(load_data) print(load_info) @@ -256,7 +258,7 @@ verified source. 1. By default, all the custom properties of a CRM object are extracted. If you want only particular fields, set the flag `include_custom_props=False` and add a list of properties with the `props` arg. - ```python + ```py load_data = hubspot() load_data.contacts.bind(props=["date_of_birth", "degree"], include_custom_props=False) load_info = pipeline.run(load_data.with_resources("contacts")) @@ -264,7 +266,7 @@ verified source. 1. If you want to read all the custom properties of CRM objects and some additional (e.g. Hubspot driven) properties. - ```python + ```py load_data = hubspot() load_data.contacts.bind(props=["hs_content_membership_email", "hs_content_membership_email_confirmed"]) load_info = pipeline.run(load_data.with_resources("contacts")) @@ -273,7 +275,7 @@ verified source. 1. To load the web analytics events of a given object type. 
- ```python + ```py resource = hubspot_events_for_objects("company", ["7086461639", "7086464459"]) # Here, object type : company, and object ids : 7086461639 and 7086464459 load_info = pipeline.run([resource]) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md index 2aa1d1130f..668d1ec470 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/inbox.md @@ -9,14 +9,14 @@ keywords: [inbox, inbox verified source, inbox mail, email] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: This source collects inbox emails, retrieves attachments, and stores relevant email data. It uses the imaplib library for IMAP interactions and the dlt library for data processing. This Inbox `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/inbox_pipeline.py) -loads data using “Inbox” verified source to the destination of your choice. +load data using the “Inbox” verified source to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -36,14 +36,14 @@ Sources and resources that can be loaded using this verified source are: - "email_account": Associated email account name (e.g. dlthub@dlthub.com). - "password": APP password (for third-party clients) from the email provider. -1. Host addresses and APP password procedures vary by provider and can be found via a quick Google search. For Google Mail's app password, read [here](https://support.google.com/mail/answer/185833?hl=en#:~:text=An%20app%20password%20is%20a,2%2DStep%20Verification%20turned%20on). +2. Host addresses and APP password procedures vary by provider and can be found via a quick Google search. For Google Mail's app password, read [here](https://support.google.com/mail/answer/185833?hl=en#:~:text=An%20app%20password%20is%20a,2%2DStep%20Verification%20turned%20on). -1. However, this guide covers Gmail inbox configuration; similar steps apply to other providers. +3. However, this guide covers Gmail inbox configuration; similar steps apply to other providers. ### Accessing Gmail Inbox 1. SMTP server DNS: 'imap.gmail.com' for Gmail. -1. Port: 993 (for internet messaging access protocol over TLS/SSL). +2. Port: 993 (for internet messaging access protocol over TLS/SSL). ### Grab App password for Gmail @@ -52,12 +52,12 @@ Sources and resources that can be loaded using this verified source are: #### Steps to Create and Use App Passwords: 1. Visit your Google Account > Security. -1. Under "How you sign in to Google", enable 2-Step Verification. -1. Choose App passwords at the bottom. -1. Name the device for reference. -1. Click Generate. -1. Input the generated 16-character app password as prompted. -1. Click Done. +2. Under "How you sign in to Google", enable 2-Step Verification. +3. Choose App passwords at the bottom. +4. Name the device for reference. +5. Click Generate. +6. Input the generated 16-character app password as prompted. +7. Click Done. 
Read more in [this article](https://pythoncircle.com/post/727/accessing-gmail-inbox-using-python-imaplib-module/) or [Google official documentation.](https://support.google.com/mail/answer/185833#zippy=%2Cwhy-you-may-need-an-app-password) @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init inbox duckdb ``` @@ -76,10 +76,10 @@ To get started with your data pipeline, follow these steps: with Inbox as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your +2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and +3. After running this command, a new directory will be created with the necessary files and configuration settings to get started. For more information, read the @@ -100,11 +100,11 @@ For more information, read the password = "Please set me up!" # # APP Password for the above email account. ``` -1. Replace the host, email and password value with the [previously copied one](#grab-credentials) +2. Replace the host, email, and password value with the [previously copied one](#grab-credentials) to ensure secure access to your Inbox resources. > When adding the App Password, remove any spaces. For instance, "abcd efgh ijkl mnop" should be "abcdefghijklmnop". -1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to +3. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. @@ -112,7 +112,7 @@ For more information, read the 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` @@ -126,9 +126,9 @@ For more information, read the For pdf parsing: - PyPDF2: `pip install PyPDF2` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +2. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `standard_inbox`, you may also @@ -145,7 +145,7 @@ For more information, read the [Walkthrough: Run a pipeline.](../../walkthroughs This function fetches inbox emails, saves attachments locally, and returns uids, messages, and attachments as resources. -```python +```py @dlt.source def inbox_source( host: str = dlt.secrets.value, @@ -158,6 +158,7 @@ def inbox_source( filter_by_mime_type: Sequence[str] = None, chunksize: int = DEFAULT_CHUNK_SIZE, ) -> Sequence[DltResource]: + ... ``` `host` : IMAP server hostname. Default: 'dlt.secrets.value'. @@ -182,13 +183,14 @@ def inbox_source( This resource collects email message UIDs (Unique IDs) from the mailbox. -```python +```py @dlt.resource(name="uids") def get_messages_uids( initial_message_num: Optional[ dlt.sources.incremental[int] ] = dlt.sources.incremental("message_uid", initial_value=1), ) -> TDataItem: + ... ``` `initial_message_num`: provides incremental loading on UID. 
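A short sketch of how this incremental cursor plays out in practice — the import path and the sender filter mirror `inbox_pipeline.py` and are assumptions, not taken from this diff:

```py
import dlt
from inbox import inbox_source  # import as used in inbox_pipeline.py

pipeline = dlt.pipeline(
    pipeline_name="standard_inbox", destination="duckdb", dataset_name="mailbox"
)

# the "messages" transformer is fed by the "uids" resource above; the
# incremental cursor on "message_uid" is kept in the pipeline state
messages = inbox_source(filter_emails=("mycreditcard@bank.com",)).messages
print(pipeline.run(messages))

# re-running this script later fetches only UIDs greater than the last one
# seen, i.e. only newly arrived emails are loaded
```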
@@ -197,12 +199,13 @@ def get_messages_uids( This resource retrieves emails by UID (Unique IDs), yielding a dictionary with metadata like UID, ID, sender, subject, dates, content type, and body. -```python +```py @dlt.transformer(name="messages", primary_key="message_uid") def get_messages( items: TDataItems, include_body: bool = True, ) -> TDataItem: + ... ``` `items`: An iterable containing dictionaries with 'message_uid' representing the email message UIDs. @@ -214,7 +217,7 @@ def get_messages( Similar to the previous resources, resource `get_attachments` extracts email attachments by UID from the IMAP server. It yields file items with attachments in the file_content field and the original email in the message field. -```python +```py @dlt.transformer( name="attachments", primary_key="file_hash", @@ -222,6 +225,7 @@ It yields file items with attachments in the file_content field and the original def get_attachments( items: TDataItems, ) -> Iterable[List[FileItem]]: + ... ``` `items`: An iterable containing dictionaries with 'message_uid' representing the email message UIDs. @@ -236,7 +240,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="standard_inbox", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -246,11 +250,11 @@ verified source. To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load messages from "mycreditcard@bank.com" starting "2023-10-1": +2. To load messages from "mycreditcard@bank.com" starting "2023-10-1": - Set `DEFAULT_START_DATE = pendulum.datetime(2023, 10, 1)` in `./inbox/settings.py`. - Use the following code: - ```python + ```py # Retrieve messages from the specified email address. messages = inbox_source(filter_emails=("mycreditcard@bank.com",)).messages # Configure messages to exclude body and name the result "my_inbox". @@ -261,18 +265,18 @@ verified source. print(load_info) ``` > Please refer to inbox_source() docstring for email filtering options by sender, date, or mime type. -1. To load messages from multiple emails, including "community@dlthub.com": +3. To load messages from multiple emails, including "community@dlthub.com": - ```python + ```py messages = inbox_source( filter_emails=("mycreditcard@bank.com", "community@dlthub.com.") ).messages ``` -1. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. +4. In `inbox_pipeline.py`, the `pdf_to_text` transformer extracts text from PDFs, treating each page as a separate data item. Using the `pdf_to_text` function to load parsed pdfs from mail to the database: - ```python + ```py filter_emails = ["mycreditcard@bank.com", "community@dlthub.com."] # Email senders attachments = inbox_source( filter_emails=filter_emails, filter_by_mime_type=["application/pdf"] @@ -285,4 +289,4 @@ verified source. ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md index 4588f4f4c6..068251a927 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/jira.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/jira.md @@ -3,7 +3,7 @@ :::info Need help deploying these sources, or figuring out how to run them in your data stack? 
[Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Jira](https://www.atlassian.com/software/jira) by Atlassian helps teams manage projects and tasks @@ -11,16 +11,16 @@ efficiently, prioritize work, and collaborate. This Jira `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira_pipeline.py) -loads data using Jira API to the destination of your choice. +loads data using the Jira API to the destination of your choice. The endpoints that this verified source supports are: | Name | Description | | --------- | ---------------------------------------------------------------------------------------- | -| issues | individual pieces of work to be completed | -| users | administrator of a given project | -| workflows | the key aspect of managing and tracking the progress of issues or tasks within a project | -| projects | a collection of tasks that need to be completed to achieve a certain outcome | +| issues | Individual pieces of work to be completed | +| users | Administrators of a given project | +| workflows | The key aspect of managing and tracking the progress of issues or tasks within a project | +| projects | A collection of tasks that need to be completed to achieve a certain outcome | To get a complete list of sub-endpoints that can be loaded, see [jira/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/jira/settings.py) @@ -51,7 +51,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init jira duckdb ``` @@ -96,25 +96,25 @@ For more information, read the guide on [how to add a verified source](../../wal add credentials for your chosen destination, ensuring proper routing of your data to the final destination. -For more information, read the [General Usage: Credentials.](../../general-usage/credentials) +For more information, read [General Usage: Credentials.](../../general-usage/credentials) ## Run the pipeline 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python jira_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`, you may also + For example, the `pipeline_name` for the above pipeline example is `jira_pipeline`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -134,41 +134,43 @@ it is important to note the complete list of the default endpoints given in This source function creates a list of resources to load data into the destination. -```python +```py @dlt.source def jira( subdomain: str = dlt.secrets.value, email: str = dlt.secrets.value, api_token: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` - `subdomain`: The subdomain of the Jira account. Configured in ".dlt/secrets.toml". - `email`: The email associated with the Jira account. Configured in ".dlt/secrets.toml". 
-- `api_token`: The API token for accessing the Jira account.Configured in ".dlt/secrets.toml". +- `api_token`: The API token for accessing the Jira account. Configured in ".dlt/secrets.toml". ### Source `jira_search` This function returns a resource for querying issues using JQL [(Jira Query Language)](https://support.atlassian.com/jira-service-management-cloud/docs/use-advanced-search-with-jira-query-language-jql/). -```python +```py @dlt.source def jira_search( subdomain: str = dlt.secrets.value, email: str = dlt.secrets.value, api_token: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` -The above function uses the same arguments `subdomain`, `email` and `api_token` as described above -for [jira source](jira.md#source-jira). +The above function uses the same arguments `subdomain`, `email`, and `api_token` as described above +for the [jira source](jira.md#source-jira). ### Resource `issues` The resource function searches issues using JQL queries and then loads them to the destination. -```python +```py @dlt.resource(write_disposition="replace") def issues(jql_queries: List[str]) -> Iterable[TDataItem]: api_path = "rest/api/3/search" @@ -179,14 +181,14 @@ def issues(jql_queries: List[str]) -> Iterable[TDataItem]: ## Customization ### Create your own pipeline -If you wish to create your own pipelines you can leverage source and resource methods as discussed +If you wish to create your own pipelines, you can leverage source and resource methods as discussed above. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset. To read more about pipeline configuration, please refer to our documentation [here](https://dlthub.com/docs/general-usage/pipeline): - ```python + ```py pipeline = dlt.pipeline( pipeline_name="jira_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -196,20 +198,20 @@ above. 2. To load custom endpoints such as “issues” and “users” using the jira source function: - ```python + ```py #Run the pipeline load_info = pipeline.run(jira().with_resources("issues","users")) print(f"Load Information: {load_info}") ``` -3. To load the custom issues using JQL queries, you can use custom queries, here is an example +3. To load the custom issues using JQL queries, you can use custom queries. Here is an example below: - ```python + ```py # Define the JQL queries as follows queries = [ "created >= -30d order by created DESC", - "created >= -30d AND project = DEV AND issuetype = Epic AND status = "In Progress" order by created DESC", + 'created >= -30d AND project = DEV AND issuetype = Epic AND status = "In Progress" order by created DESC', ] # Run the pipeline load_info = pipeline.run(jira_search().issues(jql_queries=queries)) @@ -218,4 +220,4 @@ above. ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md index 694a81ba1f..0cedad6645 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/kafka.md @@ -9,13 +9,13 @@ keywords: [kafka api, kafka verified source, kafka] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://join.slack.com/t/dlthub-community/shared_invite/zt-1n5193dbq-rCBmJ6p~ckpSFK4hCF2dYA) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. 
+or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Kafka](https://www.confluent.io/) is an open-source distributed event streaming platform, organized in the form of a log with message publishers and subscribers. -The Kafka `dlt` verified source loads data using Confluent Kafka API to the destination of your choice, -see a [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kafka_pipeline.py). +The Kafka `dlt` verified source loads data using the Confluent Kafka API to the destination of your choice. +See a [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/kafka_pipeline.py). The resource that can be loaded: @@ -29,7 +29,7 @@ The resource that can be loaded: 1. Follow the [Kafka Setup](https://developer.confluent.io/get-started/python/#kafka-setup) to tweak a project. -1. Follow the [Configuration](https://developer.confluent.io/get-started/python/#configuration) to +2. Follow the [Configuration](https://developer.confluent.io/get-started/python/#configuration) to get the project credentials. ### Initialize the verified source @@ -38,7 +38,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init kafka duckdb ``` @@ -47,10 +47,10 @@ To get started with your data pipeline, follow these steps: with Kafka as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your +2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and +3. After running this command, a new directory will be created with the necessary files and configuration settings to get started. For more information, read the @@ -80,25 +80,28 @@ sasl_password="example_secret" 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python kafka_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/run-a-pipeline). +:::info If you create a topic and start reading from it immediately, the brokers may not yet be synchronized, and the offset from which `dlt` reads messages may be invalid. In this case, the resource will return no messages. Pending messages will be received on the next run (or once the brokers synchronize). +::: + ## Sources and resources `dlt` works on the principle of [sources](../../general-usage/source) and @@ -108,7 +110,7 @@ For more information, read the [Walkthrough: Run a pipeline](../../walkthroughs/ This function retrieves messages from the given Kafka topics.
-```python +```py @dlt.resource(name="kafka_messages", table_name=lambda msg: msg["_kafka"]["topic"], standalone=True) def kafka_consumer( topics: Union[str, List[str]], @@ -118,29 +120,30 @@ def kafka_consumer( batch_timeout: Optional[int] = 3, start_from: Optional[TAnyDateTime] = None, ) -> Iterable[TDataItem]: + ... ``` `topics`: A list of Kafka topics to be extracted. -`credentials`: By default, is initialized with the data from -the `secrets.toml`. May be used explicitly to pass an initialized +`credentials`: By default, it is initialized with the data from +the `secrets.toml`. It may be used explicitly to pass an initialized Kafka Consumer object. -`msg_processor`: A function, which'll be used to process every message +`msg_processor`: A function, which will be used to process every message read from the given topics before saving them in the destination. -Can be used explicitly to pass a custom processor. See the +It can be used explicitly to pass a custom processor. See the [default processor](https://github.com/dlt-hub/verified-sources/blob/fe8ed7abd965d9a0ca76d100551e7b64a0b95744/sources/kafka/helpers.py#L14-L50) as an example of how to implement processors. -`batch_size`: The amount of messages to extract from the cluster -at once. Can be set to tweak performance. +`batch_size`: The number of messages to extract from the cluster +at once. It can be set to tweak performance. -`batch_timeout`: The maximum timeout for a single batch reading -operation. Can be set to tweak performance. +`batch_timeout`: The maximum timeout (in seconds) for a single batch reading +operation. It can be set to tweak performance. -`start_from`: A timestamp, starting with which the messages must +`start_from`: A timestamp, starting from which the messages must be read. When passed, `dlt` asks the Kafka cluster for an offset, -actual for the given timestamp, and starts to read messages from +which is actual for the given timestamp, and starts to read messages from this offset. @@ -151,7 +154,7 @@ this offset. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="kafka", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -159,18 +162,18 @@ this offset. ) ``` -1. To extract several topics: +2. To extract several topics: - ```python + ```py topics = ["topic1", "topic2", "topic3"] source = kafka_consumer(topics) pipeline.run(source, write_disposition="replace") ``` -1. To extract messages and process them in a custom way: +3. To extract messages and process them in a custom way: - ```python + ```py def custom_msg_processor(msg: confluent_kafka.Message) -> Dict[str, Any]: return { "_kafka": { @@ -185,12 +188,12 @@ this offset. pipeline.run(data) ``` -1. To extract messages, starting from a timestamp: +4. To extract messages, starting from a timestamp: - ```python + ```py data = kafka_consumer("topic", start_from=pendulum.datetime(2023, 12, 15)) pipeline.run(data) ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md index 45841850c6..8be748b1a3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/matomo.md @@ -44,7 +44,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```sh dlt init matomo duckdb ``` @@ -102,16 +102,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python matomo_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `matomo`, you may also @@ -128,7 +128,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function executes and loads a set of reports defined in "queries" for a specific Matomo site identified by "site_id". -```python +```py @dlt.source(max_table_nesting=2) def matomo_reports( api_token: str = dlt.secrets.value, @@ -136,6 +136,7 @@ def matomo_reports( queries: List[DictStrAny] = dlt.config.value, site_id: int = dlt.config.value, ) -> Iterable[DltResource]: + ... ``` `api_token`: API access token for Matomo server authentication, defaults to "./dlt/secrets.toml" @@ -152,7 +153,7 @@ def matomo_reports( The function loads visits from current day and the past `initial_load_past_days` in first run. In subsequent runs it continues from last load and skips active visits until closed. -```python +```py def matomo_visits( api_token: str = dlt.secrets.value, url: str = dlt.config.value, @@ -162,6 +163,7 @@ def matomo_visits( visit_max_duration_seconds: int = 3600, get_live_event_visitors: bool = False, ) -> List[DltResource]: + ... ``` `api_token`: API token for authentication, defaulting to "./dlt/secrets.toml". @@ -184,7 +186,7 @@ def matomo_visits( This function retrieves site visits within a specified timeframe. If a start date is given, it begins from that date. If not, it retrieves all visits up until now. -```python +```py @dlt.resource( name="visits", write_disposition="append", primary_key="idVisit", selected=True ) @@ -196,6 +198,7 @@ def get_last_visits( visit_max_duration_seconds: int = 3600, rows_per_page: int = 2000, ) -> Iterator[TDataItem]: + ... ``` `site_id`: Unique ID for each Matomo site. @@ -215,7 +218,7 @@ def get_last_visits( This function, retrieves unique visit information from get_last_visits. -```python +```py @dlt.transformer( data_from=get_last_visits, write_disposition="merge", @@ -225,6 +228,7 @@ This function, retrieves unique visit information from get_last_visits. def get_unique_visitors( visits: List[DictStrAny], client: MatomoAPIClient, site_id: int ) -> Iterator[TDataItem]: + ... ``` `visits`: Recent visit data within the specified timeframe. @@ -242,7 +246,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="matomo", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -255,7 +259,7 @@ verified source. 1. To load the data from reports. - ```python + ```py data_reports = matomo_reports() load_info = pipeline_reports.run(data_reports) print(load_info) @@ -264,7 +268,7 @@ verified source. 1. To load custom data from reports using queries. 
- ```python + ```py queries = [ { "resource_name": "custom_report_name", @@ -285,7 +289,7 @@ verified source. 1. To load data from reports and visits. - ```python + ```py data_reports = matomo_reports() data_events = matomo_visits() load_info = pipeline_reports.run([data_reports, data_events]) @@ -294,7 +298,7 @@ verified source. 1. To load data on live visits and visitors, and only retrieve data from today. - ```python + ```py load_data = matomo_visits(initial_load_past_days=1, get_live_event_visitors=True) load_info = pipeline_events.run(load_data) print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index 9178d2ab6d..a30eb3f248 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -66,30 +66,30 @@ Here are the typical ways to configure MongoDB and their connection URLs: 1. Connect to MongoDB: - ```bash + ```sh mongo "mongodb://dbuser:passwd@your_host:27017" ``` 1. List all Databases: - ```bash + ```sh show dbs ``` 1. View Collections in a Database: 1. Switch to Database: - ```bash + ```sh use your_database_name ``` 1. Display its Collections: - ```bash + ```sh show collections ``` 1. Disconnect: - ```bash + ```sh exit ``` @@ -115,7 +115,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init mongodb duckdb ``` @@ -174,16 +174,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python mongodb_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `local_mongo`, you may also @@ -200,7 +200,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function loads data from a MongoDB database, yielding one or multiple collections to be retrieved. -```python +```py @dlt.source def mongodb( connection_url: str = dlt.secrets.value, @@ -209,6 +209,7 @@ def mongodb( incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg] write_disposition: Optional[str] = dlt.config.value, ) -> Iterable[DltResource]: + ... ``` `connection_url`: MongoDB connection URL. @@ -226,7 +227,7 @@ def mongodb( This function fetches a single collection from a MongoDB database using PyMongo. -```python +```py def mongodb_collection( connection_url: str = dlt.secrets.value, database: Optional[str] = dlt.config.value, @@ -234,6 +235,7 @@ def mongodb_collection( incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg] write_disposition: Optional[str] = dlt.config.value, ) -> Any: + ... ``` `collection`: Name of the collection to load. @@ -247,7 +249,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="mongodb_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -257,7 +259,7 @@ verified source. 1. 
To load all the collections in a database: - ```python + ```py load_data = mongodb() load_info = pipeline.run(load_data, write_disposition="replace") print(load_info) @@ -265,7 +267,7 @@ verified source. 1. To load a specific collections from the database: - ```python + ```py load_data = mongodb().with_resources("collection_1", "collection_2") load_info = pipeline.run(load_data, write_disposition="replace") print(load_info) @@ -273,7 +275,7 @@ verified source. 1. To load specific collections from the source incrementally: - ```python + ```py load_data = mongodb(incremental=dlt.sources.incremental("date")).with_resources("collection_1") load_info = pipeline.run(load_data, write_disposition = "merge") print(load_info) @@ -282,7 +284,7 @@ verified source. 1. To load data from a particular collection say "movies" incrementally: - ```python + ```py load_data = mongodb_collection( collection="movies", incremental=dlt.sources.incremental( @@ -300,7 +302,7 @@ verified source. 1. To incrementally load a table with an append-only disposition using hints: - ```python + ```py # Suitable for tables where new rows are added, but existing rows aren't updated. # Load data from the 'listingsAndReviews' collection in MongoDB, using 'last_scraped' for incremental addition. airbnb = mongodb().with_resources("listingsAndReviews") @@ -317,7 +319,7 @@ verified source. 1. To load a selected collection and rename it in the destination: - ```python + ```py # Create the MongoDB source and select the "collection_1" collection source = mongodb().with_resources("collection_1") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md index a713121f29..338611e657 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md @@ -46,7 +46,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init mux duckdb ``` @@ -88,16 +88,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python mux_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is @@ -115,7 +115,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function yields resources "asset_resource" and "views_resource" to load video assets and views. -```python +```py @dlt.source def mux_source() -> Iterable[DltResource]: yield assets_resource @@ -126,13 +126,14 @@ def mux_source() -> Iterable[DltResource]: The assets_resource function fetches metadata about video assets from the Mux API's "assets" endpoint. -```python +```py @dlt.resource(write_disposition="merge") def assets_resource( mux_api_access_token: str = dlt.secrets.value, mux_api_secret_key: str = dlt.secrets.value, limit: int = DEFAULT_LIMIT, ) -> Iterable[TDataItem]: + ... ``` `mux_api_access_token`: Mux API token for authentication, defaults to ".dlt/secrets.toml". 
@@ -145,13 +146,14 @@ def assets_resource( This function yields data about every video view from yesterday to be loaded. -```python +```py @dlt.resource(write_disposition="append") def views_resource( mux_api_access_token: str = dlt.secrets.value, mux_api_secret_key: str = dlt.secrets.value, limit: int = DEFAULT_LIMIT, ) -> Iterable[DltResource]: + ... ``` The arguments `mux_api_access_token`, `mux_api_secret_key` and `limit` are the same as described [above](#resource-assets_resource) in "asset_resource". @@ -165,7 +167,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="mux_pipeline", # Use a custom name if desired destination="bigquery", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -175,21 +177,21 @@ verified source. 1. To load metadata about every asset to be loaded: - ```python - load_info = pipeline.run(mux_source().with_resources("assets_resource") + ```py + load_info = pipeline.run(mux_source().with_resources("assets_resource")) print(load_info) ``` 1. To load data for each video view from yesterday: - ```python - load_info = pipeline.run(mux_source().with_resources("views_resource") + ```py + load_info = pipeline.run(mux_source().with_resources("views_resource")) print(load_info) ``` 1. To load both metadata about assets and video views from yesterday: - ```python + ```py load_info = pipeline.run(mux_source()) print(load_info) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md index ffb0becfbb..650fc10fde 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/notion.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/notion.md @@ -50,7 +50,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init notion duckdb ``` @@ -93,16 +93,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python notion_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `notion`, you may also use any @@ -119,12 +119,13 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function loads notion databases from notion into the destination. -```python +```py @dlt.source def notion_databases( database_ids: Optional[List[Dict[str, str]]] = None, api_key: str = dlt.secrets.value, ) -> Iterator[DltResource]: + ... ``` `database_ids`: A list of dictionaries each containing a database id and a name. @@ -146,7 +147,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="notion", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -159,7 +160,7 @@ verified source. 1. 
To load all the integrated databases: - ```python + ```py load_data = notion_databases() load_info = pipeline.run(load_data) print(load_info) @@ -167,7 +168,7 @@ verified source. 1. To load the custom databases: - ```python + ```py selected_database_ids = [{"id": "0517dae9409845cba7d","use_name":"db_one"}, {"id": "d8ee2d159ac34cfc"}] load_data = notion_databases(database_ids=selected_database_ids) load_info = pipeline.run(load_data) @@ -176,7 +177,7 @@ verified source. The Database ID can be retrieved from the URL. For example if the URL is: - ```shell + ```sh https://www.notion.so/d8ee2d159ac34cfc85827ba5a0a8ae71?v=c714dec3742440cc91a8c38914f83b6b ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md index 6fae36d0ec..af951bd21a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/personio.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/personio.md @@ -57,7 +57,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init personio duckdb ``` @@ -102,16 +102,16 @@ For more information, read [Credentials](../../general-usage/credentials). 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python personio_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `personio`, you may also use @@ -127,7 +127,7 @@ For more information, read [Run a pipeline.](../../walkthroughs/run-a-pipeline) ### Source `personio_source` This `dlt` source returns data resources like `employees`, `absences`, `absence_types`, etc. -```python +```py @dlt.source(name="personio") def personio_source( client_id: str = dlt.secrets.value, @@ -158,8 +158,8 @@ def personio_source( This resource retrieves data on all the employees in a company. -```python - @dlt.resource(primary_key="id", write_disposition="merge") +```py +@dlt.resource(primary_key="id", write_disposition="merge") def employees( updated_at: dlt.sources.incremental[ pendulum.DateTime @@ -185,9 +185,10 @@ data incrementally from the Personio API to your preferred destination. ### Resource `absence_types` Simple resource, which retrieves a list of various types of employee absences. -```python +```py @dlt.resource(primary_key="id", write_disposition="replace") def absence_types(items_per_page: int = items_per_page) -> Iterable[TDataItem]: + ... ... ``` @@ -209,7 +210,7 @@ The transformer functions transform or process data from resources. The transformer function `employees_absences_balance` process data from the `employees` resource. It fetches and returns a list of the absence balances for each employee. -```python +```py @dlt.transformer( data_from=employees, write_disposition="merge", @@ -232,7 +233,7 @@ verified source. 1. Configure the [pipeline](../../general-usage/pipeline) by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="personio", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -242,14 +243,14 @@ verified source. 1. 
To load employee data: - ```python + ```py load_data = personio_source().with_resources("employees") print(pipeline.run(load_data)) ``` 1. To load data from all supported endpoints: - ```python + ```py load_data = personio_source() print(pipeline.run(load_data)) ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md index 9d1a5a0a02..9b2c8a640f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pipedrive.md @@ -9,7 +9,7 @@ keywords: [pipedrive api, pipedrive verified source, pipedrive] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Pipedrive](https://developers.pipedrive.com/docs/api/v1) is a cloud-based sales Customer @@ -18,7 +18,7 @@ communication, and automate sales processes. This Pipedrive `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/pipedrive_pipeline.py) -loads data using “Pipedrive API” to the destination of your choice. +load data using the “Pipedrive API” to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -53,7 +53,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init pipedrive duckdb ``` @@ -93,19 +93,19 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python pipedrive_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `pipedrive`, you may also use + For example, the `pipeline_name` for the above pipeline example is `pipedrive`, but you may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -138,12 +138,13 @@ Pipedrive API. This function returns a list of resources including activities, deals, custom_fields_mapping and other resources data from Pipedrive API. -```python +```py @dlt.source(name="pipedrive") def pipedrive_source( pipedrive_api_key: str = dlt.secrets.value, since_timestamp: Optional[Union[pendulum.DateTime, str]] = dlt.config.value, ) -> Iterator[DltResource]: + ... ``` `pipedrive_api_key`: Authentication token for Pipedrive, configured in ".dlt/secrets.toml". @@ -151,7 +152,7 @@ def pipedrive_source( `since_timestamp`: Starting timestamp for incremental loading. By default, complete history is loaded on the first run. And new data in subsequent runs. -> Note: Incremental loading can be enabled or disabled depending on user prefrences. +> Note: Incremental loading can be enabled or disabled depending on user preferences. 
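As an illustration of the `since_timestamp` argument described above, here is a minimal sketch of restricting the initial load; the dataset name is an assumption for this example, and a fuller variant with selected resources appears in the customization section below:

```py
import dlt

from pipedrive import pipedrive_source  # assumes the folder created by `dlt init pipedrive duckdb`

pipeline = dlt.pipeline(
    pipeline_name="pipedrive",
    destination="duckdb",
    dataset_name="pipedrive_data",  # illustrative dataset name
)

# Only records changed on or after this timestamp are fetched on the first run;
# later runs continue incrementally from the saved state.
source = pipedrive_source(since_timestamp="2023-03-01 00:00:00Z")

load_info = pipeline.run(source)
print(load_info)
```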
### Resource `iterator RECENTS_ENTITIES` @@ -159,7 +160,7 @@ This code generates resources for each entity in [RECENTS_ENTITIES](https://github.com/dlt-hub/verified-sources/blob/master/sources/pipedrive/settings.py), stores them in endpoints_resources, and then loads data from each endpoint to the destination. -```python +```py endpoints_resources = {} for entity, resource_name in RECENTS_ENTITIES.items(): endpoints_resources[resource_name] = dlt.resource( @@ -186,7 +187,7 @@ for entity, resource_name in RECENTS_ENTITIES.items(): This function gets the participants of deals from the Pipedrive API and yields the result. -```python +```py def pipedrive_source(args): # Rest of function yield endpoints_resources["deals"] | dlt.transformer( @@ -209,12 +210,13 @@ further processing or loading. This function preserves the mapping of custom fields across different pipeline runs. It is used to create and store a mapping of custom fields for different entities in the source state. -```python +```py @dlt.resource(selected=False) def create_state(pipedrive_api_key: str) -> Iterator[Dict[str, Any]]: def _get_pages_for_rename( entity: str, fields_entity: str, pipedrive_api_key: str ) -> Dict[str, Any]: + ... ``` It processes each entity in ENTITY_MAPPINGS, updating the custom fields mapping if a related fields @@ -238,7 +240,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="pipedrive", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -251,7 +253,7 @@ verified source. 1. To print source info: - ```python + ```py pipedrive_data = pipedrive_source() #print source info print(pipedrive_data) @@ -263,15 +265,15 @@ verified source. 1. To load all the data in Pipedrive: - ```python + ```py load_data = pipedrive_source() # calls the source function - load_info = pipeline.run(load_info) #runs the pipeline with selected source configuration + load_info = pipeline.run(load_data) #runs the pipeline with selected source configuration print(load_info) ``` 1. To load data from selected resources: - ```python + ```py #To load custom fields, include custom_fields_mapping for hash to name mapping. load_data = pipedrive_source().with_resources("products", "deals", "deals_participants", "custom_fields_mapping") load_info = pipeline.run(load_data) #runs the pipeline loading selected data @@ -280,7 +282,7 @@ verified source. 1. To load data from a start date: - ```python + ```py # Configure a source for 'activities' starting from the specified date. # The 'custom_fields_mapping' is incorporated to convert custom field hashes into their respective names. activities_source = pipedrive_source( @@ -293,4 +295,4 @@ verified source. ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index aa8fbe10d4..7d6b6e036a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -63,7 +63,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init salesforce duckdb ``` @@ -110,16 +110,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. 
Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python salesforce_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `salesforce`, you may also use @@ -137,13 +137,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources to load users, user_role, opportunity, opportunity_line_item, account etc. data from Salesforce API. -```python +```py @dlt.source(name="salesforce") def salesforce_source( user_name: str = dlt.secrets.value, password: str = dlt.secrets.value, security_token: str = dlt.secrets.value, ) ->Iterable[DltResource]: + ... ``` - `user_name`: Your Salesforce account username. @@ -156,7 +157,7 @@ def salesforce_source( This resource function retrieves records from the Salesforce "User" endpoint. -```python +```py @dlt.resource(write_disposition="replace") def sf_user() -> Iterator[Dict[str, Any]]: yield from get_records(client, "User") @@ -176,7 +177,7 @@ the "user_role" endpoint. This resource function retrieves records from the Salesforce "Opportunity" endpoint in incremental mode. -```python +```py @dlt.resource(write_disposition="merge") def opportunity( last_timestamp: Incremental[str] = dlt.sources.incremental( @@ -215,7 +216,7 @@ To create your data pipeline using single loading and 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="salesforce_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -228,7 +229,7 @@ To create your data pipeline using single loading and 1. To load data from all the endpoints, use the `salesforce_source` method as follows: - ```python + ```py load_data = salesforce_source() source.schema.merge_hints({"not_null": ["id"]}) # Hint for id field not null load_info = pipeline.run(load_data) @@ -241,7 +242,7 @@ To create your data pipeline using single loading and 1. To use the method `pipeline.run()` to load custom endpoints “candidates” and “members”: - ```python + ```py load_info = pipeline.run(load_data.with_resources("opportunity", "contact")) # print the information on data that was loaded print(load_info) @@ -260,7 +261,7 @@ To create your data pipeline using single loading and 1. To load data from the “contact” in replace mode and “task” incrementally merge mode endpoints: - ```python + ```py load_info = pipeline.run(load_data.with_resources("contact", "task")) # pretty print the information on data that was loaded print(load_info) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md index 09dc392c87..af00b17703 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md @@ -61,7 +61,7 @@ To get started with your data pipeline, follow these steps: 1. 
Enter the following command: - ```bash + ```sh dlt init shopify_dlt duckdb ``` @@ -125,16 +125,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python shopify_dlt_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is `shopify_data`, you may also @@ -152,7 +152,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function returns a list of resources to load products, orders, and customers data from Shopify API. -```python +```py def shopify_source( private_app_password: str = dlt.secrets.value, api_version: str = DEFAULT_API_VERSION, @@ -163,6 +163,7 @@ def shopify_source( items_per_page: int = DEFAULT_ITEMS_PER_PAGE, order_status: TOrderStatus = "any", ) -> Iterable[DltResource]: + ... ``` `private_app_password`: App's password for your shop. @@ -188,7 +189,7 @@ incremental loading if unspecified. This resource loads products from your Shopify shop into the destination. It supports incremental loading and pagination. -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def products( updated_at: dlt.sources.incremental[ @@ -202,6 +203,7 @@ def products( created_at_min: pendulum.DateTime = created_at_min_obj, items_per_page: int = items_per_page, ) -> Iterable[TDataItem]: + ... ``` `updated_at`: The saved [state](../../general-usage/state) of the last 'updated_at' value. @@ -212,7 +214,7 @@ support incremental loading and pagination. ### Resource `shopify_partner_query`: This resource can be used to run custom GraphQL queries to load paginated data. -```python +```py @dlt.resource def shopify_partner_query( query: str, @@ -224,6 +226,7 @@ def shopify_partner_query( organization_id: str = dlt.config.value, api_version: str = DEFAULT_PARTNER_API_VERSION, ) -> Iterable[TDataItem]: + ... ``` `query`: The GraphQL query for execution. @@ -251,7 +254,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="shopify", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -264,7 +267,7 @@ verified source. 1. To load data from "products", "orders" and "customers" from 1st Jan 2023. - ```python + ```py # Add your desired resources to the list... resources = ["products", "orders", "customers"] start_date="2023-01-01" @@ -278,7 +281,7 @@ verified source. minimizes potential failure during large data loads. Running chunks and incremental loads in parallel accelerates the initial load. - ```python + ```py # Load all orders from 2023-01-01 to now min_start_date = current_start_date = pendulum.datetime(2023, 1, 1) max_end_date = pendulum.now() @@ -310,7 +313,7 @@ verified source. print(load_info) ``` 1. To load the first 10 transactions via GraphQL query from the Shopify Partner API. 
- ```python + ```py # Construct query to load transactions 100 per page, the `$after` variable is used to paginate query = """query Transactions($after: String) { transactions(after: $after, first: 10) { diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md index 85fd3f2a3a..104eeff388 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/slack.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/slack.md @@ -9,24 +9,24 @@ keywords: [slack api, slack verified source, slack] :::info Need help deploying these sources, or figuring out how to run them in your data stack? [Join our Slack community](https://dlthub.com/community) -or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer Adrian. +or [book a call](https://calendar.app.google/kiLhuMsWKpZUpfho6) with our support engineer, Adrian. ::: [Slack](https://slack.com/) is a popular messaging and collaboration platform for teams and organizations. This Slack `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/slack_pipeline.py) -loads data using “Slack API” to the destination of your choice. +load data using the “Slack API” to the destination of your choice. Sources and resources that can be loaded using this verified source are: | Name | Description | |-----------------------|------------------------------------------------------------------------------------| -| slack | Retrives all the Slack data: channels, messages for selected channels, users, logs | -| channels | Retrives all the channels data | -| users | Retrives all the users info | -| get_messages_resource | Retrives all the messages for a given channel | -| access_logs | Retrives the access logs | +| slack | Retrieves all the Slack data: channels, messages for selected channels, users, logs | +| channels | Retrieves all the channels data | +| users | Retrieves all the users info | +| get_messages_resource | Retrieves all the messages for a given channel | +| access_logs | Retrieves the access logs | ## Setup Guide @@ -67,7 +67,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init slack duckdb ``` @@ -96,7 +96,7 @@ For more information, read the guide on [how to add a verified source](../../wal access_token = "Please set me up!" # please set me up! ``` -1. Copy the user Oauth token you [copied above](#grab-user-oauth-token). +1. Copy the user OAuth token you [copied above](#grab-user-oauth-token). 1. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). @@ -107,20 +107,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python slack_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -138,7 +138,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage It retrieves data from Slack's API and fetches the Slack data such as channels, messages for selected channels, users, logs. 
-```python +```py @dlt.source(name="slack", max_table_nesting=2) def slack_source( page_size: int = MAX_PAGE_SIZE, @@ -147,6 +147,7 @@ def slack_source( end_date: Optional[TAnyDateTime] = None, selected_channels: Optional[List[str]] = dlt.config.value, ) -> Iterable[DltResource]: + ... ``` `page_size`: Maximum items per page (default: 1000). @@ -161,27 +162,29 @@ def slack_source( ### Resource `channels` -This function yields all the channels data as `dlt` resource. +This function yields all the channels data as a `dlt` resource. -```python +```py @dlt.resource(name="channels", primary_key="id", write_disposition="replace") def channels_resource() -> Iterable[TDataItem]: + ... ``` ### Resource `users` -This function yields all the users data as `dlt` resource. +This function yields all the users data as a `dlt` resource. -```python +```py @dlt.resource(name="users", primary_key="id", write_disposition="replace") def users_resource() -> Iterable[TDataItem]: + ... ``` ### Resource `get_messages_resource` -This method fetches messages for a specified channel from the Slack API. It creates a resource for each channel with channel's name. +This method fetches messages for a specified channel from the Slack API. It creates a resource for each channel with the channel's name. -```python +```py def get_messages_resource( channel_data: Dict[str, Any], created_at: dlt.sources.incremental[DateTime] = dlt.sources.incremental( @@ -191,6 +194,7 @@ def get_messages_resource( allow_external_schedulers=True, ), ) -> Iterable[TDataItem]: + ... ``` `channel_data`: A dictionary detailing a specific channel to determine where messages are fetched from. @@ -209,7 +213,7 @@ def get_messages_resource( This method retrieves access logs from the Slack API. -```python +```py @dlt.resource( name="access_logs", selected=False, @@ -218,6 +222,7 @@ This method retrieves access logs from the Slack API. ) # it is not an incremental resource it just has a end_date filter def logs_resource() -> Iterable[TDataItem]: + ... ``` `selected`: A boolean set to False, indicating the resource isn't loaded by default. @@ -235,7 +240,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="slack", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -244,7 +249,7 @@ verified source. ``` 1. To load Slack resources from the specified start date: - ```python + ```py source = slack_source(page_size=1000, start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8)) # Enable below to load only 'access_logs', available for paid accounts only. @@ -258,7 +263,7 @@ verified source. 1. To load data from selected Slack channels from the specified start date: - ```python + ```py # To load data from selected channels. selected_channels=["general", "random"] # Enter the channel names here. @@ -275,7 +280,7 @@ verified source. 1. To load only messages from selected Slack resources: - ```python + ```py # To load data from selected channels. selected_channels=["general", "random"] # Enter the channel names here. @@ -285,10 +290,10 @@ verified source. start_date=datetime(2023, 9, 1), end_date=datetime(2023, 9, 8), ) - # It loads only massages from the channel "general". + # It loads only messages from the channel "general". 
load_info = pipeline.run(source.with_resources("general")) print(load_info) ``` - \ No newline at end of file + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index 67965863ce..56fc826ce8 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -58,8 +58,8 @@ The database above doesn't require a password. The connection URL can be broken down into: -```python -connection_url = "connection_string = f"{drivername}://{username}:{password}@{host}:{port}/{database}" +```py +connection_url = connection_string = f"{drivername}://{username}:{password}@{host}:{port}{database}" ``` `drivername`: Indicates both the database system and driver used. @@ -116,7 +116,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init sql_database duckdb ``` @@ -158,7 +158,7 @@ For more information, read the guide on [how to add a verified source](../../wal 1. You can also pass credentials in the pipeline script the following way: - ```python + ```py credentials = ConnectionStringCredentials( "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ) @@ -176,19 +176,19 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Install the necessary dependencies by running the following command: - ```bash + ```sh pip install -r requirements.txt ``` 1. Run the verified source by entering: - ```bash + ```sh python sql_database_pipeline.py ``` 1. Make sure that everything is loaded as expected with: - ```bash + ```sh dlt pipeline show ``` @@ -208,7 +208,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage This function loads data from an SQL database via SQLAlchemy and auto-creates resources for each table or from a specified list of tables. -```python +```py @dlt.source def sql_database( credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, @@ -220,6 +220,7 @@ def sql_database( defer_table_reflect: Optional[bool] = dlt.config.value, table_adapter_callback: Callable[[Table], None] = None, ) -> Iterable[DltResource]: + ... ``` `credentials`: Database details or an 'sqlalchemy.Engine' instance. @@ -244,7 +245,7 @@ remove certain columns to be selected. This function loads data from specific database tables. -```python +```py @dlt.common.configuration.with_config( sections=("sources", "sql_database"), spec=SqlTableResourceConfiguration ) @@ -259,6 +260,7 @@ def sql_table( defer_table_reflect: Optional[bool] = dlt.config.value, table_adapter_callback: Callable[[Table], None] = None, ) -> DltResource: + ... ``` `incremental`: Optional, enables incremental loading. @@ -284,7 +286,7 @@ certain range. 1. Consider a table with a `last_modified` timestamp column. By setting this column as your cursor and specifying an initial value, the loader generates a SQL query filtering rows with `last_modified` values greater than the specified initial value. - ```python + ```py from sql_database import sql_table from datetime import datetime @@ -303,7 +305,7 @@ certain range. 1. 
To incrementally load the "family" table using the sql_database source method: - ```python + ```py source = sql_database().with_resources("family") #using the "updated" field as an incremental field using initial value of January 1, 2022, at midnight source.family.apply_hints(incremental=dlt.sources.incremental("updated"),initial_value=pendulum.DateTime(2022, 1, 1, 0, 0, 0)) @@ -315,7 +317,7 @@ certain range. 1. To incrementally load the "family" table using the 'sql_table' resource. - ```python + ```py family = sql_table( table="family", incremental=dlt.sources.incremental( @@ -342,7 +344,7 @@ When running on Airflow ### Parallel extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. -```python +```py database = sql_database().parallelize() table = sql_table().parallelize() ``` @@ -358,7 +360,7 @@ To create your own pipeline, use source and resource methods from this verified 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="rfam", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -370,7 +372,7 @@ To create your own pipeline, use source and resource methods from this verified 1. To load the entire database, use the `sql_database` source as: - ```python + ```py source = sql_database() info = pipeline.run(source, write_disposition="replace") print(info) @@ -378,7 +380,7 @@ To create your own pipeline, use source and resource methods from this verified 1. If you just need the "family" table, use: - ```python + ```py source = sql_database().with_resources("family") #running the pipeline info = pipeline.run(source, write_disposition="replace") @@ -389,7 +391,7 @@ To create your own pipeline, use source and resource methods from this verified [documentation](https://dlthub.com/docs/general-usage/customising-pipelines/pseudonymizing_columns). As an example, here's how to pseudonymize the "rfam_acc" column in the "family" table: - ```python + ```py import hashlib def pseudonymize_name(doc): @@ -421,7 +423,7 @@ To create your own pipeline, use source and resource methods from this verified 1. To exclude columns, such as the "rfam_id" column from the "family" table before loading: - ```python + ```py def remove_columns(doc): del doc["rfam_id"] return doc diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index 4ddf20aa78..0ac1fe7acf 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -50,7 +50,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init strapi duckdb ``` @@ -73,7 +73,7 @@ For more information, read the guide on [how to add a verified source](../../wal information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py # put your secret values and credentials here. do not share this file and do not push it to github [sources.strapi] api_secret_key = "api_secret_key" # please set me up! @@ -96,13 +96,13 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. 
Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python strapi_pipeline.py ``` @@ -113,7 +113,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -131,13 +131,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrives data from Strapi. -```python +```py @dlt.source def strapi_source( endpoints: List[str], api_secret_key: str = dlt.secrets.value, domain: str = dlt.secrets.value, ) -> Iterable[DltResource]: + ... ``` `endpoints`: Collections to fetch data from. @@ -155,7 +156,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="strapi", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -165,7 +166,7 @@ verified source. 1. To load the specified endpoints: - ```python + ```py endpoints = ["athletes"] load_data = strapi_source(endpoints=endpoints) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 0b172dc3be..118c0e6511 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -56,7 +56,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init stripe_analytics duckdb ``` @@ -96,20 +96,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python stripe_analytics_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -127,7 +127,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug You can write your own pipelines to load data to a destination using this verified source. However, it is important to note is how the `ENDPOINTS` and `INCREMENTAL_ENDPOINTS` tuples are defined in `stripe_analytics/settings.py`. -```python +```py # The most popular Stripe API's endpoints ENDPOINTS = ("Subscription", "Account", "Coupon", "Customer", "Product", "Price") # Possible incremental endpoints @@ -140,7 +140,7 @@ INCREMENTAL_ENDPOINTS = ("Event", "Invoice", "BalanceTransaction") This function retrieves data from the Stripe API for the specified endpoint: -```python +```py @dlt.source def stripe_source( endpoints: Tuple[str, ...] = ENDPOINTS, @@ -148,6 +148,7 @@ def stripe_source( start_date: Optional[DateTime] = None, end_date: Optional[DateTime] = None, ) -> Iterable[DltResource]: + ... ``` - `endpoints`: Tuple containing endpoint names. @@ -159,7 +160,7 @@ def stripe_source( This source loads data in 'append' mode from incremental endpoints. 
-```python +```py @dlt.source def incremental_stripe_source( endpoints: Tuple[str, ...] = INCREMENTAL_ENDPOINTS, @@ -167,6 +168,7 @@ def incremental_stripe_source( initial_start_date: Optional[DateTime] = None, end_date: Optional[DateTime] = None, ) -> Iterable[DltResource]: + ... ``` `endpoints`: Tuple containing incremental endpoint names. @@ -183,9 +185,10 @@ For more information, read the [General Usage: Incremental loading](../../genera This function loads a dictionary with calculated metrics, including MRR and Churn rate, along with the current timestamp. -```python +```py @dlt.resource(name="Metrics", write_disposition="append", primary_key="created") def metrics_resource() -> Iterable[TDataItem]: + ... ``` Abrevations MRR and Churn rate are as follows: @@ -203,7 +206,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="stripe_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -213,7 +216,7 @@ verified source. 1. To load endpoints like "Plan" and "Charge" in replace mode, retrieve all data for the year 2022: - ```python + ```py source_single = stripe_source( endpoints=("Plan", "Charge"), start_date=datetime(2022, 1, 1), @@ -225,7 +228,7 @@ verified source. 1. To load data from the "Invoice" endpoint, which has static data, using incremental loading: - ```python + ```py # Load all data on the first run that was created after start_date and before end_date source_incremental = incremental_stripe_source( endpoints=("Invoice", ), @@ -239,7 +242,7 @@ verified source. 1. To load data created after December 31, 2022, adjust the data range for stripe_source to prevent redundant loading. For incremental_stripe_source, the initial_start_date will auto-update to the last loaded date from the previous run. - ```python + ```py source_single = stripe_source( endpoints=("Plan", "Charge"), start_date=datetime(2022, 12, 31), @@ -254,7 +257,7 @@ verified source. 1. To load important metrics and store them in database: - ```python + ```py # Event is an endpoint with uneditable data, so we can use 'incremental_stripe_source'. source_event = incremental_stripe_source(endpoints=("Event",)) # Subscription is an endpoint with editable data, use stripe_source. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 8701db7db8..dc4c1936f9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -65,7 +65,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init workable duckdb ``` @@ -117,20 +117,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python workable_pipeline.py ``` 1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -146,7 +146,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug Note the default definitions of DEFAULT_ENDPOINTS and DEFAULT_DETAILS in "workable/settings.py". -```python +```py DEFAULT_ENDPOINTS = ("members", "recruiters", "stages", "requisitions", "jobs", "custom_attributes","events") DEFAULT_DETAILS = { @@ -164,7 +164,7 @@ endpoints allow incremental 'merge' mode loading. This source returns a sequence of dltResources that correspond to the endpoints. -```python +```py @dlt.source(name="workable") def workable_source( access_token: str = dlt.secrets.value, @@ -172,6 +172,7 @@ def workable_source( start_date: Optional[DateTime] = None, load_details: bool = False, ) -> Iterable[DltResource]: + ... ``` `access_token`: Authenticate the Workable API using the token specified in ".dlt/secrets.toml". @@ -187,13 +188,14 @@ def workable_source( This function is used to retrieve "candidates" endpoints. -```python +```py @dlt.resource(name="candidates", write_disposition="merge", primary_key="id") def candidates_resource( updated_at: Optional[Any] = dlt.sources.incremental( "updated_at", initial_value=workable.start_date_iso ) ) -> Iterable[TDataItem]: + ... ``` `updated_at`: Uses the dlt.sources.incremental method. Defaults to the function's start_date or Jan @@ -211,7 +213,7 @@ To create your data pipeline using single loading and 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="workable", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -221,7 +223,7 @@ To create your data pipeline using single loading and 1. To load all data: - ```python + ```py load_data = workable_source() load_info = pipeline.run(load_data) print(load_info) @@ -232,7 +234,7 @@ To create your data pipeline using single loading and 1. To load data from a specific date, including dependent endpoints: - ```python + ```py load_data = workable_source(start_date=datetime(2022, 1, 1), load_details=True) load_info = pipeline.run(load_data) print(load_info) @@ -244,8 +246,8 @@ To create your data pipeline using single loading and 1. To load custom endpoints “candidates” and “members”: - ```python - load_info = pipeline.run(load_data.with_resources("candidates", "members") + ```py + load_info = pipeline.run(load_data.with_resources("candidates", "members")) # print the information on data that was loaded print(load_info) ``` @@ -255,7 +257,7 @@ To create your data pipeline using single loading and 1. To load data from the “jobs” endpoint and its dependent endpoints like "activities" and "application_form": - ```python + ```py load_data = workable_source(start_date=datetime(2022, 2, 1), load_details=True) # Set the load_details as True to load all the dependent endpoints. load_info = pipeline.run(load_data.with_resources("jobs","jobs_activities","jobs_application_form")) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index 234483dca0..11567306d9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -84,7 +84,7 @@ Here's a summarized version: 1. 
To get full token using the client id obtained above, you can follow the [instructions here.](https://developer.zendesk.com/documentation/ticketing/working-with-oauth/creating-and-using-oauth-tokens-with-the-api/#creating-the-access-token) - ```curl + ```sh curl https://{subdomain}.zendesk.com/api/v2/oauth/tokens.json \ -X POST \ -v -u {email_address}:{password} \ @@ -129,7 +129,7 @@ To generate Zendesk chat OAuth token, please refer to this 1. Record the "CLIENT_ID" and "SUBDOMAIN". 1. Format the below URL with your own CLIENT_ID and SUBDOMAIN, paste it into a new browser tab, and press Enter. - ```bash + ```sh https://www.zopim.com/oauth2/authorizations/new?response_type=token&client_id=CLIENT_ID&scope=read%20write&subdomain=SUBDOMAIN ``` 1. The call will be made, possibly asking you to log in and select 'Allow' to generate the token. @@ -160,7 +160,7 @@ To get started with your data pipeline, follow these steps: 1. Enter the following command: - ```bash + ```sh dlt init zendesk duckdb ``` @@ -183,7 +183,7 @@ For more information, read the guide on [how to add a verified source.](../../wa information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py #Zendesk support credentials [sources.zendesk.credentials] subdomain = "subdomain" # Zendesk subdomain @@ -215,20 +215,20 @@ For more information, read the [General Usage: Credentials.](../../general-usage 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: - ```bash + ```sh pip install -r requirements.txt ``` 1. You're now ready to run the pipeline! To get started, run the following command: - ```bash + ```sh python zendesk_pipeline.py ``` 1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: - ```bash + ```sh dlt pipeline show ``` @@ -246,13 +246,14 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug This function retrieves data from Zendesk Talk for phone calls and voicemails. -```python +```py @dlt.source(max_table_nesting=2) def zendesk_talk( credentials: TZendeskCredentials = dlt.secrets.value, start_date: Optional[TAnyDateTime] = DEFAULT_START_DATE, end_date: Optional[TAnyDateTime] = None, ) -> Iterable[DltResource]: + ... ``` `credentials`: Authentication credentials. @@ -266,13 +267,14 @@ run. This function loads data from Zendesk talk endpoint. -```python +```py def talk_resource( zendesk_client: ZendeskAPIClient, talk_endpoint_name: str, talk_endpoint: str, pagination_type: PaginationType, ) -> Iterator[TDataItem]: + ... ``` `zendesk_client`: An instance of ZendeskAPIClient for making API calls to Zendesk Talk. @@ -305,7 +307,7 @@ verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: - ```python + ```py pipeline = dlt.pipeline( pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) @@ -315,7 +317,7 @@ verified source. 1. To load data related to support, talk and chat: - ```python + ```py #zendesk support source function data_support = zendesk_support(load_all=True) # zendesk chat source function @@ -324,23 +326,23 @@ verified source. data_talk = zendesk_talk() # run pipeline with all 3 sources info = pipeline.run([data_support,data_chat,data_talk]) - return info + print(info) ``` 1. 
To load data related to support, chat and talk in incremental mode: - ```python - pipeline = dlt.pipeline( - pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired - destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) - full_refresh = Fasle - dataset_name="sample_zendesk_data" # Use a custom name if desired + ```py + pipeline = dlt.pipeline( + pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired + destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) + full_refresh = False, + dataset_name="sample_zendesk_data" # Use a custom name if desired ) - data = zendesk_support(load_all=True, start_date=start_date) - data_chat = zendesk_chat(start_date=start_date) - data_talk = zendesk_talk(start_date=start_date) - info = pipeline.run(data=[data, data_chat, data_talk]) - return info + data = zendesk_support(load_all=True, start_date=start_date) + data_chat = zendesk_chat(start_date=start_date) + data_talk = zendesk_talk(start_date=start_date) + info = pipeline.run(data=[data, data_chat, data_talk]) + print(info) ``` > Supports incremental loading for Support, Chat, and Talk Endpoints. By default, it fetches data @@ -350,7 +352,7 @@ verified source. 1. To load historical data in weekly ranges from Jan 1st, 2023, then switch to incremental loading for new tickets. - ```python + ```py # Load ranges of dates to load between January 1st 2023 and today min_start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") max_end_date = pendulum.today() diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md index c61805423b..ffe0abd082 100644 --- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md +++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md @@ -12,7 +12,7 @@ To do so, run the [cli command](../../reference/command-line-interface.md#show-t below with your pipeline name. The pipeline name is the name of the Python file where your pipeline is defined and also displayed in your terminal when loading: -```bash +```sh dlt pipeline {pipeline_name} show ``` @@ -33,7 +33,7 @@ pipeline and hide many intricacies of correctly setting up the connection to you Execute any SQL query and get results following the Python [dbapi](https://peps.python.org/pep-0249/) spec. Below we fetch data from the customers table: -```python +```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") with pipeline.sql_client() as client: with client.execute_query( @@ -54,7 +54,7 @@ natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to tha frames may be really fast! The example below reads GitHub reactions data from the `issues` table and counts reaction types. -```python +```py pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", @@ -79,14 +79,14 @@ The native connection to your destination like BigQuery `Client` or DuckDB `Duck available in case you want to do anything special. 
Below we take the native connection to `duckdb` to get `DuckDBPyRelation` from a query: -```python +```py import dlt import duckdb pipeline = dlt.pipeline(destination="duckdb", dataset_name="github_reactions") with pipeline.sql_client() as client: conn = client.native_connection - rel = conn.sql('SELECT * FROM issues'); + rel = conn.sql('SELECT * FROM issues') rel.limit(3).show() ``` diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md index d80558e745..ac305e943b 100644 --- a/docs/website/docs/examples/chess_production/index.md +++ b/docs/website/docs/examples/chess_production/index.md @@ -179,7 +179,7 @@ def load_data_with_retry(pipeline, data): :::warning To run this example you need to provide Slack incoming hook in `.dlt/secrets.toml`: -```python +```py [runtime] slack_incoming_hook="https://hooks.slack.com/services/***" ``` diff --git a/docs/website/docs/examples/custom_destination_bigquery/__init__.py b/docs/website/docs/examples/custom_destination_bigquery/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml b/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml new file mode 100644 index 0000000000..be627e6c11 --- /dev/null +++ b/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml @@ -0,0 +1,2 @@ +# @@@DLT_SNIPPET_START example +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml b/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml new file mode 100644 index 0000000000..71f41f9878 --- /dev/null +++ b/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml @@ -0,0 +1,10 @@ +# @@@DLT_SNIPPET_START example +[destination.bigquery.credentials] +client_email = "" +private_key = "" +project_id = "" +token_uri = "" +refresh_token = "" +client_id = "" +client_secret = "" +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/__init__.py b/docs/website/docs/examples/custom_destination_bigquery/code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py b/docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py new file mode 100644 index 0000000000..16ff9c22b8 --- /dev/null +++ b/docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py @@ -0,0 +1,81 @@ +from tests.utils import skipifgithubfork +from tests.pipeline.utils import assert_load_info + + +@skipifgithubfork +def custom_destination_biquery_snippet() -> None: + # @@@DLT_SNIPPET_START example + import dlt + import pandas as pd + import pyarrow as pa + from google.cloud import bigquery + + from dlt.common.configuration.specs import GcpServiceAccountCredentials + + # constants + OWID_DISASTERS_URL = ( + "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" + ) + # this table needs to be manually created in your gc account + # format: "your-project.your_dataset.your_table" + BIGQUERY_TABLE_ID = "chat-analytics-rasa-ci.ci_streaming_insert.natural-disasters" + + # dlt sources + 
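+    # the resource below reads the OWID disasters CSV with pandas, converts it to a
+    # pyarrow table and appends list and struct columns to exercise complex BigQuery types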
@dlt.resource(name="natural_disasters") + def resource(url: str): + # load pyarrow table with pandas + table = pa.Table.from_pandas(pd.read_csv(url)) + # we add a list type column to demontrate bigquery lists + table = table.append_column( + "tags", + pa.array( + [["disasters", "earthquakes", "floods", "tsunamis"]] * len(table), + pa.list_(pa.string()), + ), + ) + # we add a struct type column to demonstrate bigquery structs + table = table.append_column( + "meta", + pa.array( + [{"loaded_by": "dlt"}] * len(table), + pa.struct([("loaded_by", pa.string())]), + ), + ) + yield table + + # dlt biquery custom destination + # we can use the dlt provided credentials class + # to retrieve the gcp credentials from the secrets + @dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) + def bigquery_insert( + items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value + ) -> None: + client = bigquery.Client( + credentials.project_id, credentials.to_native_credentials(), location="US" + ) + job_config = bigquery.LoadJobConfig( + autodetect=True, + source_format=bigquery.SourceFormat.PARQUET, + schema_update_options=bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, + ) + # since we have set the batch_size to 0, we get a filepath and can load the file directly + with open(items, "rb") as f: + load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) + load_job.result() # Waits for the job to complete. + + __name__ = "__main__" # @@@DLT_REMOVE + if __name__ == "__main__": + # run the pipeline and print load results + pipeline = dlt.pipeline( + pipeline_name="csv_to_bigquery_insert", + destination=bigquery_insert, + dataset_name="mydata", + full_refresh=True, + ) + load_info = pipeline.run(resource(url=OWID_DISASTERS_URL)) + + print(load_info) + # @@@DLT_SNIPPET_END example + assert_load_info(load_info) diff --git a/docs/website/docs/examples/custom_destination_bigquery/index.md b/docs/website/docs/examples/custom_destination_bigquery/index.md new file mode 100644 index 0000000000..0531da23b1 --- /dev/null +++ b/docs/website/docs/examples/custom_destination_bigquery/index.md @@ -0,0 +1,119 @@ +--- +title: Custom destination with BigQuery +description: Learn how use the custom destination to load to bigquery and use credentials +keywords: [destination, credentials, example, bigquery, custom destination] +--- + +import Header from '../_examples-header.md'; + +
+ +## Custom destination BigQuery pipeline + +In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the `dlt` library. + +We'll learn how to: +- use [built-in credentials](../../general-usage/credentials/config_specs#gcp-credentials) +- use the [custom destination](../../dlt-ecosystem/destinations/destination.md) +- Use pyarrow tables to create complex column types on bigquery +- Use bigquery autodetect=True for schema inference from parquet files + +### Your bigquery credentials in secrets.toml + +```toml +# you can just paste services.json as credentials +[destination.bigquery.credentials] +client_email = "" +private_key = "" +project_id = "" +token_uri = "" +refresh_token = "" +client_id = "" +client_secret = "" +``` + + + +### Pipeline code + + +```py +import dlt +import pandas as pd +import pyarrow as pa +from google.cloud import bigquery + +from dlt.common.configuration.specs import GcpServiceAccountCredentials + +# constants +OWID_DISASTERS_URL = ( + "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/" + "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" +) +# this table needs to be manually created in your gc account +# format: "your-project.your_dataset.your_table" +BIGQUERY_TABLE_ID = "chat-analytics-rasa-ci.ci_streaming_insert.natural-disasters" + +# dlt sources +@dlt.resource(name="natural_disasters") +def resource(url: str): + # load pyarrow table with pandas + table = pa.Table.from_pandas(pd.read_csv(url)) + # we add a list type column to demontrate bigquery lists + table = table.append_column( + "tags", + pa.array( + [["disasters", "earthquakes", "floods", "tsunamis"]] * len(table), + pa.list_(pa.string()), + ), + ) + # we add a struct type column to demonstrate bigquery structs + table = table.append_column( + "meta", + pa.array( + [{"loaded_by": "dlt"}] * len(table), + pa.struct([("loaded_by", pa.string())]), + ), + ) + yield table + +# dlt biquery custom destination +# we can use the dlt provided credentials class +# to retrieve the gcp credentials from the secrets +@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) +def bigquery_insert( + items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value +) -> None: + client = bigquery.Client( + credentials.project_id, credentials.to_native_credentials(), location="US" + ) + job_config = bigquery.LoadJobConfig( + autodetect=True, + source_format=bigquery.SourceFormat.PARQUET, + schema_update_options=bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, + ) + # since we have set the batch_size to 0, we get a filepath and can load the file directly + with open(items, "rb") as f: + load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) + load_job.result() # Waits for the job to complete. 
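+        # note: result() raises if the load job failed, so errors surface in the dlt load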
+ +if __name__ == "__main__": + # run the pipeline and print load results + pipeline = dlt.pipeline( + pipeline_name="csv_to_bigquery_insert", + destination=bigquery_insert, + dataset_name="mydata", + full_refresh=True, + ) + load_info = pipeline.run(resource(url=OWID_DISASTERS_URL)) + + print(load_info) + + assert_load_info(load_info) +``` + diff --git a/docs/website/docs/examples/google_sheets/index.md b/docs/website/docs/examples/google_sheets/index.md index 4af35f6dac..3bf3f858d8 100644 --- a/docs/website/docs/examples/google_sheets/index.md +++ b/docs/website/docs/examples/google_sheets/index.md @@ -27,7 +27,7 @@ This example is for educational purposes. For best practices, we recommend using ### Install Google client library -```shell +```sh pip install google-api-python-client ``` diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index ff12a00fca..05ea18cb9e 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -140,4 +140,4 @@ def get_pages( # check that stuff was loaded row_counts = pipeline.last_trace.last_normalize_info.row_counts - assert row_counts["ticket_events"] == 17 \ No newline at end of file + assert row_counts["ticket_events"] == 17 diff --git a/docs/website/docs/examples/nested_data/index.md b/docs/website/docs/examples/nested_data/index.md index b2b5ee2792..8a5c17604c 100644 --- a/docs/website/docs/examples/nested_data/index.md +++ b/docs/website/docs/examples/nested_data/index.md @@ -26,7 +26,7 @@ We'll learn how to: ### Install pymongo -```shell +```sh pip install pymongo>=4.3.3 ``` diff --git a/docs/website/docs/examples/pdf_to_weaviate/index.md b/docs/website/docs/examples/pdf_to_weaviate/index.md index cc2ef01e33..5b889b858d 100644 --- a/docs/website/docs/examples/pdf_to_weaviate/index.md +++ b/docs/website/docs/examples/pdf_to_weaviate/index.md @@ -14,7 +14,7 @@ import Header from '../_examples-header.md'; Additionally we'll use PyPDF2 to extract text from PDFs. Make sure you have it installed: -```shell +```sh pip install PyPDF2 ``` diff --git a/docs/website/docs/examples/qdrant_zendesk/index.md b/docs/website/docs/examples/qdrant_zendesk/index.md index 7920619b26..b71840073b 100644 --- a/docs/website/docs/examples/qdrant_zendesk/index.md +++ b/docs/website/docs/examples/qdrant_zendesk/index.md @@ -28,7 +28,7 @@ First, configure the destination credentials for [Qdrant](https://dlthub.com/doc Next, make sure you have the following dependencies installed: -```commandline +```sh pip install qdrant-client>=1.6.9 pip install fastembed>=0.1.1 ``` @@ -170,13 +170,13 @@ response = qdrant_client.query( The query above gives stores the following results in the `response` variable: -```json +```py [QueryResponse(id='6aeacd21-b3d0-5174-97ef-5aaa59486414', embedding=None, metadata={'_dlt_id': 'Nx3wBiL29xTgaQ', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-09-01T11:19:25+00:00', 'custom_status_id': 12765028278545, 'description': 'I have been trying to cancel my subscription but the system won’t let me do it. 
Can you please help?', 'from_messaging_channel': False, 'generated_timestamp': 1693567167, 'group_id': 12765036328465, 'has_incidents': False, 'id': 12, 'is_public': True, 'organization_id': 12765041119505, 'raw_subject': 'Unable to Cancel Subscription', 'requester_id': 12765072569105, 'status': 'open', 'subject': 'Unable to Cancel Subscription', 'submitter_id': 12765072569105, 'tags': ['test1'], 'test_field': 'test1', 'ticket_form_id': 12765054772497, 'updated_at': '2023-09-01T11:19:25+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/12.json', 'via__channel': 'web'}, document='', score=0.89545774), QueryResponse(id='a22189c1-70ab-5421-938b-1caae3e7d6d8', embedding=None, metadata={'_dlt_id': 'bc/xloksL89EUg', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-07-18T17:23:42+00:00', 'custom_status_id': 12765028278545, 'description': 'ABCDEF', 'from_messaging_channel': False, 'generated_timestamp': 1689701023, 'group_id': 12765036328465, 'has_incidents': False, 'id': 4, 'is_public': True, 'organization_id': 12765041119505, 'raw_subject': 'What is this ticket', 'requester_id': 12765072569105, 'status': 'open', 'subject': 'What is this ticket', 'submitter_id': 12765072569105, 'tags': ['test1'], 'test_field': 'test1', 'ticket_form_id': 12765054772497, 'updated_at': '2023-07-18T17:23:42+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/4.json', 'via__channel': 'web'}, document='', score=0.8643349), QueryResponse(id='ce2f1c5c-41c3-56c3-a31d-2399a7a9239d', embedding=None, metadata={'_dlt_id': 'ZMuFJZo0AJxV4A', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-03-14T10:52:28+00:00', 'custom_status_id': 12765028278545, 'description': 'X', 'from_messaging_channel': False, 'generated_timestamp': 1696163084, 'group_id': 12765036328465, 'has_incidents': False, 'id': 2, 'is_public': True, 'priority': 'high', 'raw_subject': 'SCRUBBED', 'requester_id': 13726460510097, 'status': 'deleted', 'subject': 'SCRUBBED', 'submitter_id': 12765072569105, 'tags': [], 'ticket_form_id': 13726337882769, 'type': 'question', 'updated_at': '2023-09-01T12:10:35+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/2.json', 'via__channel': 'web'}, document='', score=0.8467072)] ``` To get a closer look at what the Zendesk ticket was, and how dlt dealt with it, we can index into the metadata of the first `QueryResponse` object: -```json lines +```py {'_dlt_id': 'Nx3wBiL29xTgaQ', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, diff --git a/docs/website/docs/general-usage/credentials/config_providers.md b/docs/website/docs/general-usage/credentials/config_providers.md index c0dc459da0..cf23b5d5dc 100644 --- a/docs/website/docs/general-usage/credentials/config_providers.md +++ b/docs/website/docs/general-usage/credentials/config_providers.md @@ -38,7 +38,7 @@ providers. ### Example -```python +```py @dlt.source def google_sheets( spreadsheet_id=dlt.config.value, @@ -99,6 +99,19 @@ the `private_key` for Google credentials. It will look 1. first in env variable `MY_SECTION__GCP_CREDENTIALS__PRIVATE_KEY` and if not found, 1. in `secrets.toml` with key `my_section.gcp_credentials.private_key`. 
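+
+For example, the same value can be requested explicitly in code. A minimal sketch, assuming the `my_section` layout above:
+
+```py
+import dlt
+
+# dlt.secrets looks through all configured providers (environment, secrets.toml, ...) for this key
+private_key = dlt.secrets["my_section.gcp_credentials.private_key"]
+```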
+ +:::info +While using Google secrets provider please make sure your pipeline name +contains no whitespace or any other punctuation characters except "-" and "_". + +Per Google the secret name can contain + + 1. Uppercase and lowercase letters, + 2. Numerals, + 3. Hyphens, + 4. Underscores. +::: + ### Environment provider Looks for the values in the environment variables. @@ -120,7 +133,7 @@ current Working Directory**. Example: If your working directory is `my_dlt_project` and your project has the following structure: -``` +```text my_dlt_project: | pipelines/ diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index 07e56b3e14..e93e1c466a 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -21,7 +21,7 @@ service account credentials, while `ConnectionStringCredentials` handles databas As an example, let's use `ConnectionStringCredentials` which represents a database connection string. -```python +```py from dlt.sources.credentials import ConnectionStringCredentials @dlt.source @@ -60,17 +60,17 @@ dsn.password="loader" You can explicitly provide credentials in various forms: -```python +```py query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data") # or -query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"...}) +query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"}) ``` ## Built in credentials We have some ready-made credentials you can reuse: -```python +```py from dlt.sources.credentials import ConnectionStringCredentials from dlt.sources.credentials import OAuth2Credentials from dlt.sources.credentials import GcpServiceAccountCredentials, GcpOAuthCredentials @@ -87,7 +87,7 @@ and additional query parameters. This class provides methods for parsing and generating connection strings. #### Usage -```python +```py credentials = ConnectionStringCredentials() # Set the necessary attributes @@ -117,7 +117,7 @@ client secret, refresh token, and access token. It also allows for the addition of scopes and provides methods for client authentication. Usage: -```python +```py credentials = OAuth2Credentials( client_id="CLIENT_ID", client_secret="CLIENT_SECRET", @@ -153,7 +153,7 @@ This class provides methods to retrieve native credentials for Google clients. - You may just pass the `service.json` as string or dictionary (in code and via config providers). - Or default credentials will be used. -```python +```py credentials = GcpServiceAccountCredentials() # Parse a native value (ServiceAccountCredentials) # Accepts a native value, which can be either an instance of ServiceAccountCredentials @@ -163,7 +163,7 @@ native_value = {"private_key": ".."} # or "path/to/services.json" credentials.parse_native_representation(native_value) ``` or more preferred use: -```python +```py import dlt from dlt.sources.credentials import GcpServiceAccountCredentials @@ -204,7 +204,7 @@ serialized OAuth client secrets JSON. This class provides methods for authentication and obtaining access tokens. 
##### Usage -```python +```py oauth_credentials = GcpOAuthCredentials() # Accepts a native value, which can be either an instance of GoogleOAuth2Credentials @@ -214,7 +214,7 @@ native_value_oauth = {"client_secret": ...} oauth_credentials.parse_native_representation(native_value_oauth) ``` or more preferred use: -```python +```py import dlt from dlt.sources.credentials import GcpOAuthCredentials @@ -277,7 +277,7 @@ It inherits the ability to manage default credentials and extends it with method for handling partial credentials and converting credentials to a botocore session. #### Usage -```python +```py credentials = AwsCredentials() # Set the necessary attributes credentials.aws_access_key_id = "ACCESS_KEY_ID" @@ -285,7 +285,7 @@ credentials.aws_secret_access_key = "SECRET_ACCESS_KEY" credentials.region_name = "us-east-1" ``` or -```python +```py # Imports an external boto3 session and sets the credentials properties accordingly. import botocore.session @@ -295,7 +295,7 @@ credentials.parse_native_representation(session) print(credentials.aws_access_key_id) ``` or more preferred use: -```python +```py @dlt.source def aws_readers( bucket_url: str = dlt.config.value, @@ -340,14 +340,14 @@ handling partial credentials and converting credentials to a format suitable for interacting with Azure Blob Storage using the adlfs library. #### Usage -```python +```py credentials = AzureCredentials() # Set the necessary attributes credentials.azure_storage_account_name = "ACCOUNT_NAME" credentials.azure_storage_account_key = "ACCOUNT_KEY" ``` or more preferred use: -```python +```py @dlt.source def azure_readers( bucket_url: str = dlt.config.value, @@ -388,7 +388,7 @@ decorated function. Example: -```python +```py @dlt.source def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): # depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in `credentials` argument @@ -432,7 +432,7 @@ This is used a lot in the `dlt` core and may become useful for complicated sourc In fact, for each decorated function a spec is synthesized. In case of `google_sheets` following class is created: -```python +```py from dlt.sources.config import configspec, with_config @configspec diff --git a/docs/website/docs/general-usage/credentials/configuration.md b/docs/website/docs/general-usage/credentials/configuration.md index 9b2d392883..ec8e5fe32a 100644 --- a/docs/website/docs/general-usage/credentials/configuration.md +++ b/docs/website/docs/general-usage/credentials/configuration.md @@ -25,7 +25,7 @@ When done right you'll be able to run the same pipeline script during developmen In the example below, the `google_sheets` source function is used to read selected tabs from Google Sheets. It takes several arguments that specify the spreadsheet, the tab names and the Google credentials to be used when extracting data. -```python +```py @dlt.source def google_sheets( spreadsheet_id=dlt.config.value, @@ -68,14 +68,14 @@ You are free to call the function above as usual and pass all the arguments in t Instead let `dlt` to do the work and leave it to [injection mechanism](#injection-mechanism) that looks for function arguments in the config files or environment variables and adds them to your explicit arguments during a function call. Below are two most typical examples: 1. 
Pass spreadsheet id and tab names in the code, inject credentials from the secrets: - ```python + ```py data_source = google_sheets("23029402349032049", ["tab1", "tab2"]) ``` `credentials` value will be injected by the `@source` decorator (e.g. from `secrets.toml`). `spreadsheet_id` and `tab_names` take values from the call arguments. 2. Inject all the arguments from config / secrets - ```python + ```py data_source = google_sheets() ``` `credentials` value will be injected by the `@source` decorator (e.g. from **secrets.toml**). @@ -97,16 +97,16 @@ Where do the configs and secrets come from? By default, `dlt` looks in two **con Secrets in **.dlt/secrets.toml**. `dlt` will look for `credentials`, ```toml [credentials] - client_email = - private_key = - project_id = + client_email = "" + private_key = "" + project_id = "" ``` Note that **credentials** will be evaluated as dictionary containing **client_email**, **private_key** and **project_id** as keys. It is standard TOML behavior. - [Environment Variables](config_providers#environment-provider): - ```python - CREDENTIALS= - SPREADSHEET_ID=1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580 - TAB_NAMES=tab1,tab2 + ```toml + CREDENTIALS="" + SPREADSHEET_ID="1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" + TAB_NAMES=["tab1", "tab2"] ``` We pass the JSON contents of `service.json` file to `CREDENTIALS` and we specify tab names as comma-delimited values. Environment variables are always in **upper case**. @@ -123,7 +123,7 @@ There are many ways you can organize your configs and secrets. The example above ### Do not hardcode secrets You should never do that. Sooner or later your private key will leak. -```python +```py # WRONG!: # provide all values directly - wrong but possible. # secret values should never be present in the code! @@ -137,7 +137,7 @@ data_source = google_sheets( ### Pass secrets in code from external providers You can get the secret values from your own providers. Below we take **credentials** for our `google_sheets` source from Airflow base hook: -```python +```py from airflow.hooks.base_hook import BaseHook # get it from airflow connections or other credential store @@ -163,7 +163,7 @@ Doing so provides several benefits: 1. You can request [built-in and custom credentials](config_specs.md) (i.e. connection strings, AWS / GCP / Azure credentials). 1. You can specify a set of possible types via `Union` i.e. OAuth or API Key authorization. -```python +```py @dlt.source def google_sheets( spreadsheet_id: str = dlt.config.value, @@ -171,7 +171,7 @@ def google_sheets( credentials: GcpServiceAccountCredentials = dlt.secrets.value, only_strings: bool = False ): - ... + ... ``` Now: @@ -189,7 +189,7 @@ In case of `GcpServiceAccountCredentials`: ## Read configs and secrets yourself `dlt.secrets` and `dlt.config` provide dictionary-like access to configuration values and secrets, respectively. -```python +```py # use `dlt.secrets` and `dlt.config` to explicitly take # those values from providers from the explicit keys data_source = google_sheets( @@ -202,14 +202,14 @@ data_source.run(destination="bigquery") ``` `dlt.config` and `dlt.secrets` behave like dictionaries from which you can request a value with any key name. `dlt` will look in all [config providers](#injection-mechanism) - TOML files, env variables etc. just like it does with the standard section layout. You can also use `dlt.config.get()` or `dlt.secrets.get()` to request value cast to a desired type. 
For example: -```python +```py credentials = dlt.secrets.get("my_section.gcp_credentials", GcpServiceAccountCredentials) ``` Creates `GcpServiceAccountCredentials` instance out of values (typically a dictionary) under **my_section.gcp_credentials** key. ### Write configs and secrets in code **dlt.config** and **dlt.secrets** can be also used as setters. For example: -```python +```py dlt.config["sheet_id"] = "23029402349032049" dlt.secrets["destination.postgres.credentials"] = BaseHook.get_connection('postgres_dsn').extra ``` @@ -263,9 +263,9 @@ Here is the simplest default layout for our `google_sheets` example. ```toml [credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` **config.toml** @@ -284,9 +284,9 @@ This makes sure that `google_sheets` source does not share any secrets and confi ```toml [sources.google_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` **config.toml** @@ -305,9 +305,9 @@ Use this if you want to read and pass the config/secrets yourself ```toml [my_section] - [my_section.gcp_credentials] - client_email = - private_key = +[my_section.gcp_credentials] +client_email = "" +private_key = "" ``` **config.toml** @@ -316,9 +316,9 @@ Use this if you want to read and pass the config/secrets yourself [my_section] tabs=["tab1", "tab2"] - [my_section.gcp_credentials] - # I prefer to keep my project id in config file and private key in secrets - project_id = +[my_section.gcp_credentials] +# I prefer to keep my project id in config file and private key in secrets +project_id = "" ``` ### Default layout and default key lookup during injection @@ -328,7 +328,7 @@ makes it easy to configure simple cases but also provides a room for more explic complex cases i.e. having several sources with different credentials or even hosting several pipelines in the same project sharing the same config and credentials. -``` +```text pipeline_name | |-sources @@ -368,15 +368,15 @@ Example: We use the `bigquery` destination and the `google_sheets` source. They ```toml # google sheet credentials [sources.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" # bigquery credentials [destination.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` Now when `dlt` looks for destination credentials, it will start with `destination.bigquery.credentials`, eliminate `bigquery` and stop at `destination.credentials`. @@ -388,21 +388,21 @@ Example: let's be even more explicit and use a full section path possible. ```toml # google sheet credentials [sources.google_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" # google analytics credentials [sources.google_analytics.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" # bigquery credentials [destination.bigquery.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` Now we can separate credentials for different sources as well. @@ -418,18 +418,18 @@ Example: the pipeline is named `ML_sheets`. 
```toml [ML_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` or maximum path: ```toml [ML_sheets.sources.google_sheets.credentials] -client_email = -private_key = -project_id = +client_email = "" +private_key = "" +project_id = "" ``` ### The `sources` section @@ -455,7 +455,7 @@ Now we can finally understand the `ConfigFieldMissingException`. Let's run `chess.py` example without providing the password: -``` +```sh $ CREDENTIALS="postgres://loader@localhost:5432/dlt_data" python chess.py ... dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['password'] in configuration with spec PostgresCredentials diff --git a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md index 3f665bd0fb..ba0b13636b 100644 --- a/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/pseudonymizing_columns.md @@ -11,7 +11,7 @@ consistently achieve the same mapping. If instead you wish to anonymize, you can replace it with a constant. In the example below, we create a dummy source with a PII column called "name", which we replace with deterministic hashes (i.e. replacing the German umlaut). -```python +```py import dlt import hashlib diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md index 8493ffaec5..3163062ced 100644 --- a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md @@ -14,7 +14,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Create a source function that creates dummy data as follows: - ```python + ```py import dlt # This function creates a dummy data source. @@ -31,7 +31,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Next, create a function to filter out columns from the data before loading it into a database as follows: - ```python + ```py from typing import Dict, List, Optional def remove_columns(doc: Dict, remove_columns: Optional[List[str]] = None) -> Dict: @@ -53,7 +53,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. Next, declare the columns to be removed from the table, and then modify the source as follows: - ```python + ```py # Example columns to remove: remove_columns_list = ["country_code"] @@ -67,7 +67,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. ``` 1. You can optionally inspect the result: - ```python + ```py for row in data_source: print(row) #{'id': 0, 'name': 'Jane Washington 0'} @@ -77,7 +77,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. 
At last, create a pipeline: - ```python + ```py # Integrating with a DLT pipeline pipeline = dlt.pipeline( pipeline_name='example', diff --git a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md index e58dae6d9d..04e4d33b13 100644 --- a/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/renaming_columns.md @@ -12,7 +12,7 @@ In the example below, we create a dummy source with special characters in the na function that we intend to apply to the resource to modify its output (i.e. replacing the German umlaut): `replace_umlauts_in_dict_keys`. -```python +```py import dlt # create a dummy source with umlauts (special characters) in key names (um) diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md index 6b09510f68..f8bd179422 100644 --- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -77,7 +77,7 @@ currency_conversion_enrichment/ 1. Here's the resource that yields the sample data as discussed above: - ```python + ```py @dlt.resource() def enriched_data_part_two(): data_enrichment_part_one = [ @@ -113,14 +113,14 @@ API token. information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py [sources] api_key= "Please set me up!" #ExchangeRate-API key ``` 1. Create the `converted_amount` function as follows: - ```python + ```py # @transformer(data_from=enriched_data_part_two) def converted_amount(record): """ @@ -210,7 +210,7 @@ API token. 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_two", @@ -229,7 +229,7 @@ API token. To do so, you need to add the transformer decorator at the top of the `converted_amount` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( enriched_data_part_two | converted_amount, @@ -246,19 +246,19 @@ API token. 1. Install necessary dependencies for the preferred [destination](../../dlt-ecosystem/destinations/), For example, duckdb: - ``` + ```sh pip install dlt[duckdb] ``` 1. Run the pipeline with the following command: - ``` + ```sh python currency_enrichment_pipeline.py ``` 1. To ensure that everything loads as expected, use the command: - ``` + ```sh dlt pipeline show ``` diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md index f4578d065f..ab71d3d1d0 100644 --- a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -29,7 +29,7 @@ you can use any API you prefer. 
By default the URL Parse API will return a JSON response like: -```text +```json { "authority": "urlparse.com", "domain": "urlparse.com", @@ -73,7 +73,7 @@ understanding, you may explore all three enrichments sequentially in the noteboo Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure: -```python +```text url_parser_enrichment/ ├── .dlt/ │ └── secrets.toml @@ -100,41 +100,41 @@ Let's examine a synthetic dataset created for this article. It includes: Here's the resource that yields the sample data as discussed above: -```python - import dlt +```py + import dlt - @dlt.resource(write_disposition="append") - def tracked_data(): - """ - A generator function that yields a series of dictionaries, each representing - user tracking data. + @dlt.resource(write_disposition="append") + def tracked_data(): + """ + A generator function that yields a series of dictionaries, each representing + user tracking data. - This function is decorated with `dlt.resource` to integrate into the DLT (Data - Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to - ensure that data from this generator is appended to the existing data in the - destination table. + This function is decorated with `dlt.resource` to integrate into the DLT (Data + Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to + ensure that data from this generator is appended to the existing data in the + destination table. - Yields: - dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', - representing the user's tracking data including their device and the page - they were referred from. - """ + Yields: + dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', + representing the user's tracking data including their device and the page + they were referred from. + """ - # Sample data representing tracked user data - sample_data = [ + # Sample data representing tracked user data + sample_data = [ { "user_id": 1, "device_name": "Sony Experia XZ", "page_referer": "https://b2venture.lightning.force.com/" }, - """ - Data for other users - """ - ] - - # Yielding each user's data as a dictionary - for user_data in sample_data: - yield user_data + """ + Data for other users + """ + ] + + # Yielding each user's data as a dictionary + for user_data in sample_data: + yield user_data ``` ### 2. Create `url_parser` function @@ -143,7 +143,7 @@ We use a free service called [URL Parse API](https://urlparse.com/), to parse th need to register to use this service neither get an API key. 1. Create a `url_parser` function as follows: - ```python + ```py # @dlt.transformer(data_from=tracked_data) def url_parser(record): """ @@ -195,7 +195,7 @@ need to register to use this service neither get an API key. 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_three", @@ -214,7 +214,7 @@ need to register to use this service neither get an API key. do so, you need to add the transformer decorator at the top of the `url_parser` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( tracked_data | url_parser, @@ -230,19 +230,19 @@ need to register to use this service neither get an API key. 1. 
Install necessary dependencies for the preferred [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: - ``` + ```sh pip install dlt[duckdb] ``` 1. Run the pipeline with the following command: - ``` + ```sh python url_enrichment_pipeline.py ``` 1. To ensure that everything loads as expected, use the command: - ``` + ```sh dlt pipeline show ``` diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 8b33a852a8..6b07845689 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -41,7 +41,7 @@ Here's the link to the notebook: ### B. Create a pipeline Alternatively, to create a data enrichment pipeline, you can start by creating the following directory structure: -```python +```text user_device_enrichment/ ├── .dlt/ │ └── secrets.toml @@ -67,42 +67,42 @@ user_device_enrichment/ Here's the resource that yields the sample data as discussed above: - ```python - import dlt - - @dlt.resource(write_disposition="append") - def tracked_data(): - """ - A generator function that yields a series of dictionaries, each representing - user tracking data. - - This function is decorated with `dlt.resource` to integrate into the DLT (Data - Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to - ensure that data from this generator is appended to the existing data in the - destination table. - - Yields: - dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', - representing the user's tracking data including their device and the page - they were referred from. - """ - - # Sample data representing tracked user data - sample_data = [ - {"user_id": 1, "device_name": "Sony Experia XZ", "page_referer": - "https://b2venture.lightning.force.com/"}, - {"user_id": 2, "device_name": "Samsung Galaxy S23 Ultra 5G", - "page_referer": "https://techcrunch.com/2023/07/20/can-dlthub-solve-the-python-library-problem-for-ai-dig-ventures-thinks-so/"}, - {"user_id": 3, "device_name": "Apple iPhone 14 Pro Max", - "page_referer": "https://dlthub.com/success-stories/freelancers-perspective/"}, - {"user_id": 4, "device_name": "OnePlus 11R", - "page_referer": "https://www.reddit.com/r/dataengineering/comments/173kp9o/ideas_for_data_validation_on_data_ingestion/"}, - {"user_id": 5, "device_name": "Google Pixel 7 Pro", "page_referer": "https://pypi.org/"}, - ] - - # Yielding each user's data as a dictionary - for user_data in sample_data: - yield user_data + ```py + import dlt + + @dlt.resource(write_disposition="append") + def tracked_data(): + """ + A generator function that yields a series of dictionaries, each representing + user tracking data. + + This function is decorated with `dlt.resource` to integrate into the DLT (Data + Loading Tool) pipeline. The `write_disposition` parameter is set to "append" to + ensure that data from this generator is appended to the existing data in the + destination table. + + Yields: + dict: A dictionary with keys 'user_id', 'device_name', and 'page_referer', + representing the user's tracking data including their device and the page + they were referred from. 
+ """ + + # Sample data representing tracked user data + sample_data = [ + {"user_id": 1, "device_name": "Sony Experia XZ", "page_referer": + "https://b2venture.lightning.force.com/"}, + {"user_id": 2, "device_name": "Samsung Galaxy S23 Ultra 5G", + "page_referer": "https://techcrunch.com/2023/07/20/can-dlthub-solve-the-python-library-problem-for-ai-dig-ventures-thinks-so/"}, + {"user_id": 3, "device_name": "Apple iPhone 14 Pro Max", + "page_referer": "https://dlthub.com/success-stories/freelancers-perspective/"}, + {"user_id": 4, "device_name": "OnePlus 11R", + "page_referer": "https://www.reddit.com/r/dataengineering/comments/173kp9o/ideas_for_data_validation_on_data_ingestion/"}, + {"user_id": 5, "device_name": "Google Pixel 7 Pro", "page_referer": "https://pypi.org/"}, + ] + + # Yielding each user's data as a dictionary + for user_data in sample_data: + yield user_data ``` ### 2. Create `fetch_average_price` function @@ -118,7 +118,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the information securely, like access tokens. Keep this file safe. Here's its format for service account authentication: - ```python + ```py [sources] api_key= "Please set me up!" #Serp Api key. ``` @@ -126,7 +126,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Replace the value of the `api_key`. 1. Create `fetch_average_price()` function as follows: - ```python + ```py import datetime import requests @@ -247,7 +247,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Here, we create the pipeline and use the `add_map` functionality: - ```python + ```py # Create the pipeline pipeline = dlt.pipeline( pipeline_name="data_enrichment_one", @@ -266,7 +266,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the do so, you need to add the transformer decorator at the top of the `fetch_average_price` function. For `pipeline.run`, you can use the following code: - ```python + ```py # using fetch_average_price as a transformer function load_info = pipeline.run( tracked_data | fetch_average_price, @@ -283,19 +283,19 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the 1. Install necessary dependencies for the preferred [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: - ``` + ```sh pip install dlt[duckdb] ``` 1. Run the pipeline with the following command: - ``` + ```sh python device_enrichment_pipeline.py ``` 1. To ensure that everything loads as expected, use the command: - ``` + ```sh dlt pipeline show ``` diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index c20aa62d16..3f5eab479e 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -75,7 +75,7 @@ azure_storage_account_key="storage key" ``` or via environment variables: -``` +```sh DESTINATION__FILESYSTEM__BUCKET_URL=az://dlt-azure-bucket DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME=dltdata DESTINATION__FILESYSTEM__CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY="storage key" @@ -171,5 +171,7 @@ load_info.raise_on_failed_jobs() ::: -## Declare external destination -You can implement [your own destination](../walkthroughs/create-new-destination.md) and pass the destination class type or instance to `dlt` pipeline. \ No newline at end of file +## Create new destination +You have two ways to implement a new destination: +1. 
You can use the `@dlt.destination` decorator and [implement a sink function](../dlt-ecosystem/destinations/destination.md). This is a perfect way to implement reverse ETL destinations that push data back to REST APIs. +2. You can implement [a full destination](../walkthroughs/create-new-destination.md) where you have full control over load jobs and schema migration. diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md index 4651d156f0..320d0664f5 100644 --- a/docs/website/docs/general-usage/full-loading.md +++ b/docs/website/docs/general-usage/full-loading.md @@ -13,7 +13,7 @@ that are not selected while performing a full load will not replace any data in To perform a full load on one or more of your resources, choose the `write_disposition='replace'` for this resource: -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="github") issues = [] reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"] diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 37b5963431..fe3bb8b61d 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -64,7 +64,7 @@ child tables. Example below loads all the GitHub events and updates them in the destination using "id" as primary key, making sure that only a single copy of event is present in `github_repo_events` table: -```python +```py @dlt.resource(primary_key="id", write_disposition="merge") def github_repo_events(): yield from _get_event_pages() @@ -72,26 +72,28 @@ You can use compound primary keys: -```python +```py @dlt.resource(primary_key=("id", "url"), write_disposition="merge") -... +def resource(): + ... ``` By default, `primary_key` deduplication is arbitrary. You can pass the `dedup_sort` column hint with a value of `desc` or `asc` to influence which record remains after deduplication. Using `desc`, the records sharing the same `primary_key` are sorted in descending order before deduplication, making sure the record with the highest value for the column with the `dedup_sort` hint remains. `asc` has the opposite behavior. -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", columns={"created_at": {"dedup_sort": "desc"}} # select "latest" record ) -... +def resource(): + ... ``` Example below merges on a column `batch_day` that holds the day for which given record is valid. Merge keys also can be compound: -```python +```py @dlt.resource(merge_key="batch_day", write_disposition="merge") def get_daily_batch(day): yield _get_batch_from_bucket(day) @@ -101,7 +103,7 @@ As with any other write disposition you can use it to load data ad hoc. Below we top reactions for `duckdb` repo. The lists have, obviously, many overlapping issues, but we want to keep just one instance of each. -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="github") issues = [] reactions = ["%2B1", "-1", "smile", "tada", "thinking_face", "heart", "rocket", "eyes"] @@ -117,7 +119,7 @@ Example below dispatches GitHub events to several tables by event type, keeps on by "id" and skips loading of past records using "last value" incremental. As you can see, all of this we can just declare in our resource.
-```python +```py @dlt.resource(primary_key="id", write_disposition="merge", table_name=lambda i: i['type']) def github_repo_events(last_created_at = dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z")): """A resource taking a stream of github events and dispatching them to tables named by event type. Deduplicates be 'id'. Loads incrementally by 'created_at' """ @@ -134,7 +136,7 @@ Each record in the destination table with the same `primary_key` or `merge_key` Deletes are propagated to any child table that might exist. For each record that gets deleted in the root table, all corresponding records in the child table(s) will also be deleted. Records in parent and child tables are linked through the `root key` that is explained in the next section. #### Example: with primary key and boolean delete column -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", @@ -157,11 +159,11 @@ def resource(): ``` #### Example: with merge key and non-boolean delete column -```python +```py @dlt.resource( merge_key="id", write_disposition="merge", - columns={"deleted_at_ts": {"hard_delete": True}}} + columns={"deleted_at_ts": {"hard_delete": True}}) def resource(): # this will insert two records yield [ @@ -175,11 +177,11 @@ def resource(): ``` #### Example: with primary key and "dedup_sort" hint -```python +```py @dlt.resource( primary_key="id", write_disposition="merge", - columns={"deleted_flag": {"hard_delete": True}, "lsn": {"dedup_sort": "desc"}} + columns={"deleted_flag": {"hard_delete": True}, "lsn": {"dedup_sort": "desc"}}) def resource(): # this will insert one record (the one with lsn = 3) yield [ @@ -204,7 +206,7 @@ tables. This concept is similar to foreign key which references a parent table, set. We do not enable it everywhere because it takes storage space. Nevertheless, is some cases you may want to permanently enable root key propagation. -```python +```py pipeline = dlt.pipeline( pipeline_name='facebook_insights', destination='duckdb', @@ -243,7 +245,7 @@ Once you've figured that out, `dlt` takes care of finding maximum/minimum cursor duplicates and managing the state with last values of cursor. Take a look at GitHub example below, where we request recently created issues. -```python +```py @dlt.resource(primary_key="id") def repo_issues( access_token, @@ -280,7 +282,7 @@ In the example below we incrementally load the GitHub events, where API does not let us filter for the newest events - it always returns all of them. Nevertheless, `dlt` will load only the new items, filtering out all the duplicates and past issues. -```python +```py # use naming function in table name to generate separate tables for each event @dlt.resource(primary_key="id", table_name=lambda i: i['type']) # type: ignore def repo_events( @@ -320,7 +322,7 @@ and lets you select nested and complex data (including the whole data item when Example below creates last value which is a dictionary holding a max `created_at` value for each created table name: -```python +```py def by_event_type(event): last_value = None if len(event) == 1: @@ -344,7 +346,7 @@ def get_events(last_created_at = dlt.sources.incremental("$", last_value_func=by ### Using `end_value` for backfill You can specify both initial and end dates when defining incremental loading. 
Let's go back to our Github example: -```python +```py @dlt.resource(primary_key="id") def repo_issues( access_token, @@ -365,7 +367,7 @@ Please note that when `end_date` is specified, `dlt` **will not modify the exist To define specific ranges to load, you can simply override the incremental argument in the resource, for example: -```python +```py july_issues = repo_issues( created_at=dlt.sources.incremental( initial_value='2022-07-01T00:00:00Z', end_value='2022-08-01T00:00:00Z' @@ -410,7 +412,7 @@ The github events example is exactly such case. The results are ordered on curso In the same fashion the `row_order` can be used to **optimize backfill** so we don't continue making unnecessary API requests after the end of range is reached. For example: -```python +```py @dlt.resource(primary_key="id") def tickets( zendesk_client, @@ -443,7 +445,7 @@ incremental and exit yield loop when true. The `dlt.sources.incremental` instance provides `start_out_of_range` and `end_out_of_range` attributes which are set when the resource yields an element with a higher/lower cursor value than the initial or end values. If you do not want `dlt` to stop processing automatically and instead to handle such events yourself, do not specify `row_order`: -```python +```py @dlt.transformer(primary_key="id") def tickets( zendesk_client, @@ -465,16 +467,25 @@ def tickets( ``` ::: -### Deduplication primary_key +### Deduplicate overlapping ranges with primary key -`dlt.sources.incremental` will inherit the primary key that is set on the resource. +`Incremental` **does not** deduplicate datasets like **merge** write disposition does. It however +makes sure than when another portion of data is extracted, records that were previously loaded won't be +included again. `dlt` assumes that you load a range of data, where the lower bound is inclusive (ie. greater than equal). +This makes sure that you never lose any data but will also re-acquire some rows. +For example: you have a database table with an cursor field on `updated_at` which has a day resolution, then there's a high +chance that after you extract data on a given day, still more records will be added. When you extract on the next day, you +should reacquire data from the last day to make sure all records are present, this will however create overlap with data +from previous extract. - let's you optionally set a `primary_key` that is used exclusively to +By default, content hash (a hash of `json` representation of a row) will be used to deduplicate. +This may be slow so`dlt.sources.incremental` will inherit the primary key that is set on the resource. +You can optionally set a `primary_key` that is used exclusively to deduplicate and which does not become a table hint. The same setting lets you disable the deduplication altogether when empty tuple is passed. Below we pass `primary_key` directly to `incremental` to disable deduplication. That overrides `delta` primary_key set in the resource: -```python +```py @dlt.resource(primary_key="delta") # disable the unique value check by passing () as primary key to incremental def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())): @@ -487,7 +498,7 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) When resources are [created dynamically](source.md#create-resources-dynamically) it is possible to use `dlt.sources.incremental` definition as well. 
-```python +```py @dlt.source def stripe(): # declare a generator function @@ -523,7 +534,7 @@ result in `IncrementalUnboundError` exception. ### Using Airflow schedule for backfill and incremental loading When [running in Airflow task](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file), you can opt-in your resource to get the `initial_value`/`start_value` and `end_value` from Airflow schedule associated with your DAG. Let's assume that **Zendesk tickets** resource contains a year of data with thousands of tickets. We want to backfill the last year of data week by week and then continue incremental loading daily. -```python +```py @dlt.resource(primary_key="id") def tickets( zendesk_client, @@ -542,7 +553,7 @@ We opt-in to Airflow scheduler by setting `allow_external_schedulers` to `True`: 2. In all other environments, the `incremental` behaves as usual, maintaining `dlt` state. Let's generate a deployment with `dlt deploy zendesk_pipeline.py airflow-composer` and customize the dag: -```python +```py @dag( schedule_interval='@weekly', start_date=pendulum.datetime(2023, 2, 1), @@ -579,7 +590,7 @@ When you enable the DAG in Airflow, it will generate several runs and start exec subsequent weekly intervals starting with `2023-02-12, 00:00:00 UTC` to `2023-02-19, 00:00:00 UTC`. You can repurpose the DAG above to start loading new data incrementally after (or during) the backfill: -```python +```py @dag( schedule_interval='@daily', start_date=pendulum.datetime(2023, 2, 1), @@ -626,7 +637,7 @@ You may force a full refresh of a `merge` and `append` pipelines: Example: -```python +```py p = dlt.pipeline(destination="bigquery", dataset_name="dataset_name") # do a full refresh p.run(merge_source(), write_disposition="replace") @@ -657,7 +668,7 @@ is loaded, the yielded resource data will be loaded at the same time with the up In the two examples below you see how the `dlt.sources.incremental` is working under the hood. -```python +```py @resource() def tweets(): # Get a last value from loaded metadata. If not exist, get None @@ -672,7 +683,7 @@ def tweets(): If we keep a list or a dictionary in the state, we can modify the underlying values in the objects, and thus we do not need to set the state back explicitly. -```python +```py @resource() def tweets(): # Get a last value from loaded metadata. 
If not exist, get None @@ -710,7 +721,7 @@ data twice - even if the user makes a mistake and requests the same months range In the following example, we initialize a variable with an empty list as a default: -```python +```py @dlt.resource(write_disposition="append") def players_games(chess_url, players, start_month=None, end_month=None): loaded_archives_cache = dlt.current.resource_state().setdefault("archives", []) @@ -736,7 +747,7 @@ def players_games(chess_url, players, start_month=None, end_month=None): ### Advanced state usage: tracking the last value for all search terms in Twitter API -```python +```py @dlt.resource(write_disposition="append") def search_tweets(twitter_bearer_token=dlt.secrets.value, search_terms=None, start_time=None, end_time=None, last_value=None): headers = _headers(twitter_bearer_token) diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md index 095e03e96d..53eca2e59a 100644 --- a/docs/website/docs/general-usage/pipeline.md +++ b/docs/website/docs/general-usage/pipeline.md @@ -15,7 +15,7 @@ Example: This pipeline will load a list of objects into `duckdb` table with a name "three": -```python +```py import dlt pipeline = dlt.pipeline(destination="duckdb", dataset_name="sequence") @@ -53,7 +53,7 @@ Arguments: Example: This pipeline will load the data the generator `generate_rows(10)` produces: -```python +```py import dlt def generate_rows(nr): @@ -110,7 +110,7 @@ pipeline run is progressing. `dlt` supports 4 progress monitors out of the box: You pass the progress monitor in `progress` argument of the pipeline. You can use a name from the list above as in the following example: -```python +```py # create a pipeline loading chess data that dumps # progress to stdout each 10 seconds (the default) pipeline = dlt.pipeline( @@ -123,7 +123,7 @@ pipeline = dlt.pipeline( You can fully configure the progress monitor. See two examples below: -```python +```py # log each minute to Airflow task logger ti = get_current_context()["ti"] pipeline = dlt.pipeline( @@ -134,7 +134,7 @@ pipeline = dlt.pipeline( ) ``` -```python +```py # set tqdm bar color to yellow pipeline = dlt.pipeline( pipeline_name="chess_pipeline", diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 9b8d45982d..e2e95d937f 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -19,7 +19,7 @@ Commonly used arguments: Example: -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(): for i in range(10): @@ -32,7 +32,7 @@ def source_name(): To get the data of a resource, we could do: -```python +```py for row in generate_rows(): print(row) @@ -57,7 +57,7 @@ accepts following arguments: `dlt` that column `tags` (containing a list of tags) in `user` table should have type `complex` which means that it will be loaded as JSON/struct and not as child table. - ```python + ```py @dlt.resource(name="user", columns={"tags": {"data_type": "complex"}}) def get_users(): ... @@ -82,7 +82,7 @@ You can alternatively use a [Pydantic](https://pydantic-docs.helpmanual.io/) mod For example: -```python +```py from pydantic import BaseModel @@ -119,7 +119,7 @@ Things to note: You can override this by configuring the Pydantic model -```python +```py from typing import ClassVar from dlt.common.libs.pydantic import DltConfig @@ -146,7 +146,7 @@ argument and the `table_name` string as a return value. 
For example, a resource that loads GitHub repository events wants to send `issue`, `pull request`, and `comment` events to separate tables. The type of the event is in the "type" field. -```python +```py # send item to a table with name item["type"] @dlt.resource(table_name=lambda event: event['type']) def repo_events() -> Iterator[TDataItems]: @@ -154,13 +154,13 @@ def repo_events() -> Iterator[TDataItems]: # the `table_schema` method gets table schema generated by a resource and takes optional # data item to evaluate dynamic hints -print(repo_events().table_schema({"type": "WatchEvent", id=...})) +print(repo_events().table_schema({"type": "WatchEvent", id:...})) ``` In more advanced cases, you can dispatch data to different tables directly in the code of the resource function: -```python +```py @dlt.resource def repo_events() -> Iterator[TDataItems]: # mark the "item" to be sent to table with name item["type"] @@ -172,7 +172,7 @@ def repo_events() -> Iterator[TDataItems]: You can add arguments to your resource functions like to any other. Below we parametrize our `generate_rows` resource to generate the number of rows we request: -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): for i in range(nr): @@ -195,7 +195,7 @@ that returns a list of objects (i.e. users) in one endpoint and user details in with this by declaring a resource that obtains a list of users and another resource that receives items from the list and downloads the profiles. -```python +```py @dlt.resource(write_disposition="replace") def users(limit=None): for u in _get_users(limit): @@ -215,7 +215,7 @@ pipeline.run(user_details) ``` In the example above, `user_details` will receive data from default instance of `users` resource (with `limit` set to `None`). You can also use **pipe |** operator to bind resources dynamically -```python +```py # you can be more explicit and use a pipe operator. # with it you can create dynamic pipelines where the dependencies # are set at run time and resources are parametrized i.e. @@ -225,7 +225,7 @@ pipeline.run(users(limit=100) | user_details) :::tip Transformers are allowed not only to **yield** but also to **return** values and can decorate **async** functions and [**async generators**](../reference/performance.md#extract). Below we decorate an async function and request details on two pokemons. Http calls are made in parallel via httpx library. -```python +```py import dlt import httpx @@ -245,7 +245,7 @@ print(list([1,2] | pokemon())) A standalone resource is defined on a function that is top level in a module (not inner function) that accepts config and secrets values. Additionally if `standalone` flag is specified, the decorated function signature and docstring will be preserved. `dlt.resource` will just wrap the decorated function and user must call the wrapper to get the actual resource. Below we declare a `filesystem` resource that must be called before use. -```python +```py @dlt.resource(standalone=True) def filesystem(bucket_url=dlt.config.value): """list and yield files in `bucket_url`""" @@ -256,7 +256,7 @@ pipeline.run(filesystem("s3://my-bucket/reports"), table_name="reports") ``` Standalone may have dynamic name that depends on the arguments passed to the decorated function. For example:: -```python +```py @dlt.resource(standalone=True, name=lambda args: args["stream_name"]) def kinesis(stream_name: str): ... @@ -271,7 +271,7 @@ You can extract multiple resources in parallel threads or with async IO. 
To enable this for a sync resource you can set the `parallelized` flag to `True` in the resource decorator: -```python +```py @dlt.resource(parallelized=True) def get_users(): for u in _get_users(): @@ -288,7 +288,7 @@ pipeline.run(get_users(), get_orders()) Async generators are automatically extracted concurrently with other resources: -```python +```py @dlt.resource async def get_users(): async for u in _get_users(): # Assuming _get_users is an async generator @@ -317,7 +317,7 @@ so: Here's our resource: -```python +```py import dlt @dlt.resource(write_disposition="replace") @@ -330,7 +330,7 @@ def users(): Here's our script that defines transformations and loads the data: -```python +```py from pipedrive import users def anonymize_user(user_data): @@ -351,7 +351,7 @@ example data and test your transformations etc. In order to do that, you limit h be yielded by a resource by calling `resource.add_limit` method. In the example below we load just 10 first items from and infinite counter - that would otherwise never end. -```python +```py r = dlt.resource(itertools.count(), name="infinity").add_limit(10) assert list(r) == list(range(10)) ``` @@ -375,7 +375,7 @@ that will keep just one updated record per `user_id`. It also adds ["last value" incremental loading](incremental-loading.md#incremental_loading-with-last-value) on `created_at` column to prevent requesting again the already loaded records: -```python +```py tables = sql_database() tables.users.apply_hints( write_disposition="merge", @@ -386,7 +386,7 @@ pipeline.run(tables) ``` To just change a name of a table to which resource will load data, do the following: -```python +```py tables = sql_database() tables.users.table_name = "other_users" ``` @@ -398,7 +398,7 @@ with the existing schema in the same way `apply_hints` method above works. There should avoid lengthy operations (ie. reflecting database tables) during creation of the DAG so it is better do do it when DAG executes. You may also emit partial hints (ie. precision and scale for decimal types) for column to help `dlt` type inference. -```python +```py @dlt.resource def sql_table(credentials, schema, table): # create sql alchemy engine @@ -432,7 +432,7 @@ You can emit columns as Pydantic model and use dynamic hints (ie. lambda for tab ### Duplicate and rename resources There are cases when you your resources are generic (ie. bucket filesystem) and you want to load several instances of it (ie. files from different folders) to separate tables. In example below we use `filesystem` source to load csvs from two different folders into separate tables: -```python +```py @dlt.resource(standalone=True) def filesystem(bucket_url): # list and yield files in bucket_url @@ -463,7 +463,7 @@ You can pass individual resources or list of resources to the `dlt.pipeline` obj loaded outside the source context, will be added to the [default schema](schema.md) of the pipeline. -```python +```py @dlt.resource(name='table_name', write_disposition='replace') def generate_rows(nr): for i in range(nr): @@ -485,6 +485,6 @@ To do a full refresh of an `append` or `merge` resources you temporarily change disposition to replace. 
You can use `apply_hints` method of a resource or just provide alternative write disposition when loading: -```python +```py p.run(merge_source(), write_disposition="replace") ``` diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md index 764b565beb..1b5e67357a 100644 --- a/docs/website/docs/general-usage/schema-contracts.md +++ b/docs/website/docs/general-usage/schema-contracts.md @@ -49,7 +49,7 @@ The `schema_contract` argument accepts two forms: 2. **shorthand** a contract mode (string) that will be applied to all schema entities. For example setting `schema_contract` to *freeze* will expand to the full form: -```python +```py {"tables": "freeze", "columns": "freeze", "data_type": "freeze"} ``` @@ -65,7 +65,7 @@ You can change the contract on the **source** instance via `schema_contract` pro Pydantic models can be used to [define table schemas and validate incoming data](resource.md#define-a-schema-with-pydantic). You can use any model you already have. `dlt` will internally synthesize (if necessary) new models that conform with the **schema contract** on the resource. Just passing a model in `column` argument of the [dlt.resource](resource.md#define-a-schema-with-pydantic) sets a schema contract that conforms to default Pydantic behavior: -```python +```py { "tables": "evolve", "columns": "discard_value", @@ -121,10 +121,10 @@ Here's how `dlt` deals with column modes: When contract is violated in freeze mode, `dlt` raises `DataValidationError` exception. This exception gives access to the full context and passes the evidence to the caller. As with any other exception coming from pipeline run, it will be re-raised via `PipelineStepFailed` exception which you should catch in except: -```python +```py try: pipeline.run() -except as pip_ex: +except Exception as pip_ex: if pip_ex.step == "normalize": if isinstance(pip_ex.__context__.__context__, DataValidationError): ... @@ -195,7 +195,7 @@ def items(): def other_items(): ... -@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}): +@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}) def source(): return [items(), other_items()] diff --git a/docs/website/docs/general-usage/schema-evolution.md b/docs/website/docs/general-usage/schema-evolution.md index 71a6b66521..377df0e47f 100644 --- a/docs/website/docs/general-usage/schema-evolution.md +++ b/docs/website/docs/general-usage/schema-evolution.md @@ -7,14 +7,14 @@ keywords: [schema evolution, schema, dlt schema] # Schema evolution ## When to use schema evolution? -Schema evolution is a best practice when ingesting most data. It’s simply a way to get data across a format barrier. +Schema evolution is a best practice when ingesting most data. It’s simply a way to get data across a format barrier. It separates the technical challenge of “loading” data, from the business challenge of “curating” data. This enables us to have pipelines that are maintainable by different individuals at different stages. However, for cases where schema evolution might be triggered by malicious events, such as in web tracking, data contracts are advised. Read more about how to implement data contracts [here](https://dlthub.com/docs/general-usage/schema-contracts). ## Schema evolution with `dlt` -`dlt` automatically infers the initial schema for your first pipeline run. However, in most cases, the schema tends to change over time, which makes it critical for downstream consumers to adapt to schema changes. 
+`dlt` automatically infers the initial schema for your first pipeline run. However, in most cases, the schema tends to change over time, which makes it critical for downstream consumers to adapt to schema changes. As the structure of data changes, such as the addition of new columns, changing data types, etc., `dlt` handles these schema changes, enabling you to adapt to changes without losing velocity. @@ -23,11 +23,11 @@ The first run of a pipeline will scan the data that goes through it and generate We’ll review some examples here and figure out how `dlt` creates initial schema and how normalisation works. Consider a pipeline that loads the following schema: -```python +```py data = [{ "organization": "Tech Innovations Inc.", "address": { - 'building': 'r&d', + 'building': 'r&d', "room": 7890, }, "Inventory": [ @@ -62,22 +62,22 @@ Let’s add the following 4 cases: - A column is renamed: a field “building” was renamed to “main_block”. Please update the pipeline for the cases discussed above. -```python +```py data = [{ "organization": "Tech Innovations Inc.", # Column added: - "CEO": "Alice Smith", + "CEO": "Alice Smith", "address": { # 'building' renamed to 'main_block' - 'main_block': 'r&d', + 'main_block': 'r&d', # Removed room column - # "room": 7890, + # "room": 7890, }, "Inventory": [ # Type change: 'inventory_nr' changed to string from int - {"name": "Plasma ray", "inventory nr": "AR2411"}, - {"name": "Self-aware Roomba", "inventory nr": "AR268"}, - {"name": "Type-inferrer", "inventory nr": "AR3621"} + {"name": "Plasma ray", "inventory nr": "AR2411"}, + {"name": "Self-aware Roomba", "inventory nr": "AR268"}, + {"name": "Type-inferrer", "inventory nr": "AR3621"} ] }] @@ -110,7 +110,7 @@ The column lineage can be tracked by loading the 'load_info' to the destination. **Getting notifications** We can read the load outcome and send it to slack webhook with `dlt`. -```python +```py # Import the send_slack_message function from the dlt library from dlt.common.runtime.slack import send_slack_message @@ -123,7 +123,7 @@ for package in info.load_packages: for table_name, table in package.schema_update.items(): # Iterate over each column in the current table for column_name, column in table["columns"].items(): - # Send a message to the Slack channel with the table + # Send a message to the Slack channel with the table # and column update information send_slack_message( hook, @@ -142,16 +142,16 @@ This script sends Slack notifications for schema updates using the `send_slack_m ### How to test for removed columns - applying “not null” constraint -A column not existing, and a column being null, are two different things. However, when it comes to APIs and json, it’s usually all treated the same - the key-value pair will simply not exist. +A column not existing, and a column being null, are two different things. However, when it comes to APIs and json, it’s usually all treated the same - the key-value pair will simply not exist. To remove a column, exclude it from the output of the resource function. Subsequent data inserts will treat this column as null. Verify column removal by applying a not null constraint. For instance, after removing the "room" column, apply a not null constraint to confirm its exclusion. 
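A minimal sketch of that verification step, not taken from the changed files, could look like the snippet below. It assumes the removed nested field ends up as a flattened column named `address__room`, that the data is yielded by a hypothetical `org_data` resource loaded into duckdb, and that a `nullable: False` column hint is enough to make the load fail once the column only receives nulls:

```py
import dlt

@dlt.resource(
    name="org",
    write_disposition="append",
    # assumed flattened column name; declaring it non-nullable should make a null-only load fail
    columns={"address__room": {"data_type": "bigint", "nullable": False}},
)
def org_data():
    # "room" is intentionally no longer part of the payload
    yield {
        "organization": "Tech Innovations Inc.",
        "address": {"main_block": "r&d"},
    }

pipeline = dlt.pipeline(
    pipeline_name="schema_evolution_check",
    destination="duckdb",
    dataset_name="org_data",
)
# the load is expected to fail because "address__room" is required but never provided
load_info = pipeline.run(org_data())
```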
-```python +```py data = [{ "organization": "Tech Innovations Inc.", "address": { - 'building': 'r&d' + 'building': 'r&d' #"room": 7890, }, "Inventory": [ @@ -171,20 +171,20 @@ During pipeline execution a data validation error indicates that a removed colum The data in the pipeline mentioned above is modified. -```python +```py data = [{ "organization": "Tech Innovations Inc.", "CEO": "Alice Smith", "address": {'main_block': 'r&d'}, "Inventory": [ - {"name": "Plasma ray", "inventory nr": "AR2411"}, - {"name": "Self-aware Roomba", "inventory nr": "AR268"}, + {"name": "Plasma ray", "inventory nr": "AR2411"}, + {"name": "Self-aware Roomba", "inventory nr": "AR268"}, { "name": "Type-inferrer", "inventory nr": "AR3621", "details": { - "category": "Computing Devices", + "category": "Computing Devices", "id": 369, - "specifications": [{ + "specifications": [{ "processor": "Quantum Core", "memory": "512PB" }] @@ -201,7 +201,7 @@ The schema of the data above is loaded to the destination as follows: ## What did the schema evolution engine do? -The schema evolution engine in the `dlt` library is designed to handle changes in the structure of your data over time. For example: +The schema evolution engine in the `dlt` library is designed to handle changes in the structure of your data over time. For example: - As above in continuation of the inferred schema, the “specifications” are nested in "details”, which are nested in “Inventory”, all under table name “org”. So the table created for projects is `org__inventory__details__specifications`. @@ -209,6 +209,6 @@ These is a simple examples of how schema evolution works. ## Schema evolution using schema and data contracts -Demonstrating schema evolution without talking about schema and data contracts is only one side of the coin. Schema and data contracts dictate the terms of how the schema being written to destination should evolve. +Demonstrating schema evolution without talking about schema and data contracts is only one side of the coin. Schema and data contracts dictate the terms of how the schema being written to destination should evolve. Schema and data contracts can be applied to entities ‘tables’ , ‘columns’ and ‘data_types’ using contract modes ‘evolve’, freeze’, ‘discard_rows’ and ‘discard_columns’ to tell `dlt` how to apply contract for a particular entity. To read more about **schema and data contracts** read our [documentation](https://dlthub.com/docs/general-usage/schema-contracts). \ No newline at end of file diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 7ce1d959c9..164814010d 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -149,7 +149,7 @@ Now imagine the data has changed and `id` field also contains strings ```py data = [ - {"id": 1, "human_name": "Alice"} + {"id": 1, "human_name": "Alice"}, {"id": "idx-nr-456", "human_name": "Bob"} ] ``` @@ -308,7 +308,7 @@ schema available via `dlt.current.source_schema()`. Example: -```python +```py @dlt.source def textual(nesting_level: int): # get the source schema from the `current` context diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index 1b3d1ce0cc..bcdd137dce 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -26,7 +26,7 @@ You declare source by decorating an (optionally async) function that return or y You can create resources by using `dlt.resource` as a function. 
In an example below we reuse a single generator function to create a list of resources for several Hubspot endpoints. -```python +```py @dlt.source def hubspot(api_key=dlt.secrets.value): @@ -59,7 +59,7 @@ If this is impractical (for example you want to reflect a database to create res You can access resources present in a source and select which of them you want to load. In case of `hubspot` resource above we could select and load "companies", "deals" and "products" resources: -```python +```py from hubspot import hubspot source = hubspot() @@ -73,7 +73,7 @@ pipeline.run(source.with_resources("companies", "deals")) Resources can be individually accessed and selected: -```python +```py # resources are accessible as attributes of a source for c in source.companies: # enumerate all data in companies resource print(c) @@ -89,7 +89,7 @@ source.deals.selected = False You can modify and filter data in resources, for example if we want to keep only deals after certain date: -```python +```py source.deals.add_filter(lambda deal: deal["created_at"] > yesterday) ``` @@ -103,7 +103,7 @@ You can easily get your test dataset in a few minutes, when otherwise you'd need the full loading to complete. Below we limit the `pipedrive` source to just get 10 pages of data from each endpoint. Mind that the transformers will be evaluated fully: -```python +```py from pipedrive import pipedrive_source pipeline = dlt.pipeline(pipeline_name='pipedrive', destination='duckdb', dataset_name='pipedrive_data') @@ -121,7 +121,7 @@ declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source. -```python +```py import dlt from hubspot import hubspot @@ -140,11 +140,11 @@ source.resources.add(source.deals | deal_scores) pipeline.run(source) ``` You can also set the resources in the source as follows -```python +```py source.deal_scores = source.deals | deal_scores ``` or -```python +```py source.resources["deal_scores"] = source.deals | deal_scores ``` :::note @@ -156,7 +156,7 @@ When adding resource to the source, `dlt` clones the resource so your existing i You can limit how deep `dlt` goes when generating child tables. By default, the library will descend and generate child tables for all nested lists, without limit. -```python +```py @dlt.source(max_table_nesting=1) def mongo_db(): ... @@ -172,7 +172,7 @@ tables of child tables). Typical settings: You can achieve the same effect after the source instance is created: -```python +```py from mongo_db import mongo_db source = mongo_db() @@ -202,7 +202,7 @@ You are also free to decompose a single source into several ones. For example, y down a 50 table copy job into an airflow dag with high parallelism to load the data faster. 
To do so, you could get the list of resources as: -```python +```py # get a list of resources' names resource_list = sql_source().resources.keys() @@ -216,12 +216,12 @@ for res in resource_list: You can temporarily change the "write disposition" to `replace` on all (or selected) resources within a source to force a full refresh: -```python +```py p.run(merge_source(), write_disposition="replace") ``` With selected resources: -```python +```py p.run(tables.with_resources("users"), write_disposition="replace") ``` diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 23625db27c..0ab2b8a658 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -15,7 +15,7 @@ You read and write the state in your resources. Below we use the state to create game archives which we then use to [prevent requesting duplicates](incremental-loading.md#advanced-state-usage-storing-a-list-of-processed-entities). -```python +```py @dlt.resource(write_disposition="append") def players_games(chess_url, player, start_month=None, end_month=None): # create or request a list of archives from resource scoped state diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md index cd121b0ad5..ecaa78c949 100644 --- a/docs/website/docs/getting-started.md +++ b/docs/website/docs/getting-started.md @@ -20,13 +20,13 @@ Let's get started! Install dlt using `pip`: -```bash +```sh pip install -U dlt ``` The command above installs (or upgrades) the library core, in the example below we use DuckDB as a destination so let's add a `duckdb` dependency: -```bash +```sh pip install "dlt[duckdb]" ``` @@ -63,13 +63,13 @@ When you look at the code above, you can see that we: Save this Python script with the name `quick_start_pipeline.py` and run the following command: -```bash +```sh python quick_start_pipeline.py ``` The output should look like: -```bash +```sh Pipeline quick_start completed in 0.59 seconds 1 load package(s) were loaded to destination duckdb and into dataset mydata The duckdb destination used duckdb:////home/user-name/quick_start/quick_start.duckdb location to store data @@ -82,13 +82,13 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](reference/command-line-interface#show-tables-and-data-in-the-destination): -```bash +```sh dlt pipeline quick_start show ``` **quick_start** is the name of the pipeline from the script above. If you do not have Streamlit installed yet do: -```bash +```sh pip install streamlit ``` diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index 6df0dad82d..d6d823ad47 100644 --- a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -17,9 +17,9 @@ from various and often messy data sources into well-structured, live datasets. T ```sh pip install dlt ``` -Unlike other solutions, with dlt, there's no need to use any backends or containers. Simply import `dlt` in a Python file or a Jupyter Notebook cell, and create a pipeline to load data into any of the [supported destinations](dlt-ecosystem/destinations/). You can load data from any source that produces Python data structures, including APIs, files, databases, and more. +Unlike other solutions, with dlt, there's no need to use any backends or containers. 
Simply import `dlt` in a Python file or a Jupyter Notebook cell, and create a pipeline to load data into any of the [supported destinations](dlt-ecosystem/destinations/). You can load data from any source that produces Python data structures, including APIs, files, databases, and more. `dlt` also supports building a [custom destination](dlt-ecosystem/destinations/destination.md), which you can use as reverse ETL. -The library will create or update tables, infer data types and handle nested data automatically. Here are a few example pipelines: +The library will create or update tables, infer data types, and handle nested data automatically. Here are a few example pipelines: ``` This command creates new dlt pipeline script that loads data from `source` to `destination` to it. When you run the command: @@ -26,7 +26,7 @@ version if run again with existing `source` name. You are warned if files will b You can use `--location ` option to specify your own repository with sources. Typically you would [fork ours](https://github.com/dlt-hub/verified-sources) and start customizing and adding sources ie. to use them for your team or organization. You can also specify a branch with `--branch ` ie. to test a version being developed. ### List all verified sources -```shell +```sh dlt init --list-verified-sources ``` Shows all available verified sources and their short descriptions. For each source, checks if your local `dlt` version requires update @@ -43,7 +43,7 @@ that will add additional packages to current environment. ### github-action -```shell +```sh dlt deploy