From 682d9b31a2937cb5d6220a13af0fa80ef637d4fd Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Tue, 19 Sep 2023 22:29:56 +0200
Subject: [PATCH] reorganizes api reference

---
 dlt/__init__.py | 19 +-
 .../configuration/specs/base_configuration.py | 2 +-
 dlt/common/pipeline.py | 52 +-
 dlt/extract/decorators.py | 18 +-
 dlt/extract/source.py | 1 -
 dlt/helpers/streamlit_helper.py | 2 +-
 dlt/pipeline/__init__.py | 16 +-
 dlt/pipeline/pipeline.py | 12 +-
 dlt/sources/helpers/requests/retry.py | 6 +-
 dlt/sources/helpers/requests/session.py | 2 +-
 docs/website/.gitignore | 1 +
 docs/website/docs/.dlt/config.toml | 3 +
 .../docs/api_reference/__init__/__init__.md | 35 --
 docs/website/docs/api_reference/cli/echo.md | 17 -
 .../docs/api_reference/cli/pipeline_files.md | 20 -
 .../docs/api_reference/cli/requirements.md | 57 --
 .../common/configuration/accessors.md | 85 ---
 .../common/configuration/container.md | 33 --
 .../common/configuration/exceptions.md | 95 ----
 .../common/configuration/inject.md | 48 --
 .../common/configuration/paths.md | 51 --
 .../common/configuration/providers/toml.md | 80 ---
 .../common/configuration/resolve.md | 43 --
 .../configuration/specs/api_credentials.md | 39 --
 .../configuration/specs/aws_credentials.md | 53 --
 .../configuration/specs/azure_credentials.md | 30 --
 .../configuration/specs/base_configuration.md | 213 --------
 .../specs/config_providers_context.md | 26 -
 .../specs/config_section_context.md | 68 ---
 .../configuration/specs/gcp_credentials.md | 129 -----
 .../configuration/specs/known_sections.md | 37 --
 .../configuration/specs/run_configuration.md | 38 --
 .../common/configuration/utils.md | 28 -
 .../common/destination/capabilities.md | 16 -
 .../common/destination/reference.md | 416 ---------------
 .../docs/api_reference/common/exceptions.md | 81 ---
 docs/website/docs/api_reference/common/git.md | 43 --
 .../api_reference/common/json/__init__.md | 25 -
 .../docs/api_reference/common/jsonpath.md | 49 --
 .../api_reference/common/libs/pydantic.md | 27 -
 .../common/normalizers/configuration.md | 18 -
 .../common/normalizers/json/__init__.md | 29 --
 .../common/normalizers/json/relational.md | 21 -
 .../common/normalizers/naming/naming.md | 115 -----
 .../common/normalizers/typing.md | 17 -
 .../api_reference/common/normalizers/utils.md | 36 --
 .../docs/api_reference/common/pipeline.md | 335 ------------
 .../api_reference/common/reflection/utils.md | 69 ---
 .../common/runners/configuration.md | 26 -
 .../api_reference/common/runners/runnable.md | 55 --
 .../api_reference/common/runners/stdout.md | 28 -
 .../common/runners/synth_pickle.md | 15 -
 .../docs/api_reference/common/runners/venv.md | 120 -----
 .../api_reference/common/runtime/collector.md | 159 ------
 .../api_reference/common/runtime/exec_info.md | 91 ----
 .../api_reference/common/runtime/logger.md | 25 -
 .../api_reference/common/runtime/segment.md | 46 --
 .../api_reference/common/runtime/sentry.md | 16 -
 .../api_reference/common/runtime/signals.md | 26 -
 .../api_reference/common/runtime/slack.md | 17 -
 .../api_reference/common/runtime/telemetry.md | 16 -
 .../api_reference/common/schema/schema.md | 212 --------
 .../api_reference/common/schema/typing.md | 61 ---
 .../docs/api_reference/common/schema/utils.md | 213 --------
 .../docs/api_reference/common/source.md | 45 --
 .../common/storages/configuration.md | 113 ----
 .../common/storages/file_storage.md | 102 ----
 .../common/storages/filesystem.md | 50 --
 .../common/storages/load_storage.md | 80 ---
 .../common/storages/normalize_storage.md | 17 -
 .../common/storages/transactional_file.md | 180 -------
 .../website/docs/api_reference/common/time.md | 87 ----
 .../docs/api_reference/common/typing.md | 131 -----
 .../docs/api_reference/common/utils.md | 246 ---------
 .../docs/api_reference/common/validation.md | 42 --
 .../destinations/athena/athena.md | 34 --
 .../destinations/athena/configuration.md | 18 -
 .../destinations/bigquery/bigquery.md | 34 --
 .../destinations/bigquery/configuration.md | 37 --
 .../destinations/bigquery/sql_client.md | 19 -
 .../destinations/duckdb/configuration.md | 48 --
 .../destinations/duckdb/sql_client.md | 19 -
 .../api_reference/destinations/dummy/dummy.md | 26 -
 .../destinations/filesystem/configuration.md | 19 -
 .../destinations/filesystem/filesystem.md | 15 -
 .../destinations/insert_job_client.md | 35 --
 .../destinations/job_client_impl.md | 96 ----
 .../api_reference/destinations/job_impl.md | 23 -
 .../destinations/motherduck/configuration.md | 50 --
 .../destinations/mssql/configuration.md | 41 --
 .../api_reference/destinations/mssql/mssql.md | 27 -
 .../api_reference/destinations/path_utils.md | 32 --
 .../destinations/postgres/configuration.md | 42 --
 .../destinations/redshift/configuration.md | 28 -
 .../destinations/redshift/redshift.md | 48 --
 .../destinations/snowflake/configuration.md | 50 --
 .../destinations/snowflake/sql_client.md | 17 -
 .../api_reference/destinations/sql_client.md | 46 --
 .../api_reference/destinations/sql_jobs.md | 99 ----
 .../docs/api_reference/destinations/typing.md | 43 --
 .../destinations/weaviate/configuration.md | 51 --
 .../destinations/weaviate/naming.md | 38 --
 .../destinations/weaviate/weaviate_adapter.md | 62 ---
 .../destinations/weaviate/weaviate_client.md | 195 -------
 .../docs/api_reference/extract/decorators.md | 224 --------
 .../docs/api_reference/extract/incremental.md | 161 ------
 .../docs/api_reference/extract/pipe.md | 204 --------
 .../docs/api_reference/extract/schema.md | 72 ---
 .../docs/api_reference/extract/source.md | 487 ------------------
 .../docs/api_reference/extract/typing.md | 38 --
 .../docs/api_reference/extract/utils.md | 46 --
 .../api_reference/helpers/airflow_helper.md | 115 -----
 .../helpers/dbt/configuration.md | 18 -
 .../docs/api_reference/helpers/dbt/runner.md | 125 -----
 .../api_reference/helpers/pandas_helper.md | 50 --
 .../api_reference/helpers/streamlit_helper.md | 40 --
 .../docs/api_reference/load/configuration.md | 30 --
 docs/website/docs/api_reference/load/load.md | 53 --
 .../api_reference/normalize/configuration.md | 18 -
 .../docs/api_reference/normalize/normalize.md | 9 -
 .../docs/api_reference/pipeline/__init__.md | 164 ------
 .../api_reference/pipeline/configuration.md | 30 --
 .../docs/api_reference/pipeline/current.md | 15 -
 .../docs/api_reference/pipeline/dbt.md | 71 ---
 .../docs/api_reference/pipeline/helpers.md | 28 -
 .../docs/api_reference/pipeline/pipeline.md | 418 ---------------
 .../docs/api_reference/pipeline/trace.md | 102 ----
 .../docs/api_reference/pipeline/track.md | 18 -
 .../reflection/script_inspector.md | 28 -
 docs/website/docs/api_reference/sidebar.json | 362 -------
 .../sources/helpers/requests/__init__.md | 15 -
 .../sources/helpers/requests/retry.md | 67 ---
 .../sources/helpers/requests/session.md | 20 -
 .../sources/helpers/transform.md | 25 -
 docs/website/docs/api_reference/version.md | 13 -
 docs/{ => website}/pydoc-markdown.yml | 7 +-
 docs/website/pydoc_markdown_dlt.py | 25 +
 137 files changed, 97 insertions(+), 9168 deletions(-)
 create mode 100644 docs/website/docs/.dlt/config.toml
 delete mode 100644
docs/website/docs/api_reference/__init__/__init__.md delete mode 100644 docs/website/docs/api_reference/cli/echo.md delete mode 100644 docs/website/docs/api_reference/cli/pipeline_files.md delete mode 100644 docs/website/docs/api_reference/cli/requirements.md delete mode 100644 docs/website/docs/api_reference/common/configuration/accessors.md delete mode 100644 docs/website/docs/api_reference/common/configuration/container.md delete mode 100644 docs/website/docs/api_reference/common/configuration/exceptions.md delete mode 100644 docs/website/docs/api_reference/common/configuration/inject.md delete mode 100644 docs/website/docs/api_reference/common/configuration/paths.md delete mode 100644 docs/website/docs/api_reference/common/configuration/providers/toml.md delete mode 100644 docs/website/docs/api_reference/common/configuration/resolve.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/api_credentials.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/aws_credentials.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/azure_credentials.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/base_configuration.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/config_providers_context.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/config_section_context.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/gcp_credentials.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/known_sections.md delete mode 100644 docs/website/docs/api_reference/common/configuration/specs/run_configuration.md delete mode 100644 docs/website/docs/api_reference/common/configuration/utils.md delete mode 100644 docs/website/docs/api_reference/common/destination/capabilities.md delete mode 100644 docs/website/docs/api_reference/common/destination/reference.md delete mode 100644 docs/website/docs/api_reference/common/exceptions.md delete mode 100644 docs/website/docs/api_reference/common/git.md delete mode 100644 docs/website/docs/api_reference/common/json/__init__.md delete mode 100644 docs/website/docs/api_reference/common/jsonpath.md delete mode 100644 docs/website/docs/api_reference/common/libs/pydantic.md delete mode 100644 docs/website/docs/api_reference/common/normalizers/configuration.md delete mode 100644 docs/website/docs/api_reference/common/normalizers/json/__init__.md delete mode 100644 docs/website/docs/api_reference/common/normalizers/json/relational.md delete mode 100644 docs/website/docs/api_reference/common/normalizers/naming/naming.md delete mode 100644 docs/website/docs/api_reference/common/normalizers/typing.md delete mode 100644 docs/website/docs/api_reference/common/normalizers/utils.md delete mode 100644 docs/website/docs/api_reference/common/pipeline.md delete mode 100644 docs/website/docs/api_reference/common/reflection/utils.md delete mode 100644 docs/website/docs/api_reference/common/runners/configuration.md delete mode 100644 docs/website/docs/api_reference/common/runners/runnable.md delete mode 100644 docs/website/docs/api_reference/common/runners/stdout.md delete mode 100644 docs/website/docs/api_reference/common/runners/synth_pickle.md delete mode 100644 docs/website/docs/api_reference/common/runners/venv.md delete mode 100644 docs/website/docs/api_reference/common/runtime/collector.md delete mode 100644 
docs/website/docs/api_reference/common/runtime/exec_info.md delete mode 100644 docs/website/docs/api_reference/common/runtime/logger.md delete mode 100644 docs/website/docs/api_reference/common/runtime/segment.md delete mode 100644 docs/website/docs/api_reference/common/runtime/sentry.md delete mode 100644 docs/website/docs/api_reference/common/runtime/signals.md delete mode 100644 docs/website/docs/api_reference/common/runtime/slack.md delete mode 100644 docs/website/docs/api_reference/common/runtime/telemetry.md delete mode 100644 docs/website/docs/api_reference/common/schema/schema.md delete mode 100644 docs/website/docs/api_reference/common/schema/typing.md delete mode 100644 docs/website/docs/api_reference/common/schema/utils.md delete mode 100644 docs/website/docs/api_reference/common/source.md delete mode 100644 docs/website/docs/api_reference/common/storages/configuration.md delete mode 100644 docs/website/docs/api_reference/common/storages/file_storage.md delete mode 100644 docs/website/docs/api_reference/common/storages/filesystem.md delete mode 100644 docs/website/docs/api_reference/common/storages/load_storage.md delete mode 100644 docs/website/docs/api_reference/common/storages/normalize_storage.md delete mode 100644 docs/website/docs/api_reference/common/storages/transactional_file.md delete mode 100644 docs/website/docs/api_reference/common/time.md delete mode 100644 docs/website/docs/api_reference/common/typing.md delete mode 100644 docs/website/docs/api_reference/common/utils.md delete mode 100644 docs/website/docs/api_reference/common/validation.md delete mode 100644 docs/website/docs/api_reference/destinations/athena/athena.md delete mode 100644 docs/website/docs/api_reference/destinations/athena/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/bigquery/bigquery.md delete mode 100644 docs/website/docs/api_reference/destinations/bigquery/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/bigquery/sql_client.md delete mode 100644 docs/website/docs/api_reference/destinations/duckdb/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/duckdb/sql_client.md delete mode 100644 docs/website/docs/api_reference/destinations/dummy/dummy.md delete mode 100644 docs/website/docs/api_reference/destinations/filesystem/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/filesystem/filesystem.md delete mode 100644 docs/website/docs/api_reference/destinations/insert_job_client.md delete mode 100644 docs/website/docs/api_reference/destinations/job_client_impl.md delete mode 100644 docs/website/docs/api_reference/destinations/job_impl.md delete mode 100644 docs/website/docs/api_reference/destinations/motherduck/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/mssql/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/mssql/mssql.md delete mode 100644 docs/website/docs/api_reference/destinations/path_utils.md delete mode 100644 docs/website/docs/api_reference/destinations/postgres/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/redshift/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/redshift/redshift.md delete mode 100644 docs/website/docs/api_reference/destinations/snowflake/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/snowflake/sql_client.md delete mode 100644 
docs/website/docs/api_reference/destinations/sql_client.md delete mode 100644 docs/website/docs/api_reference/destinations/sql_jobs.md delete mode 100644 docs/website/docs/api_reference/destinations/typing.md delete mode 100644 docs/website/docs/api_reference/destinations/weaviate/configuration.md delete mode 100644 docs/website/docs/api_reference/destinations/weaviate/naming.md delete mode 100644 docs/website/docs/api_reference/destinations/weaviate/weaviate_adapter.md delete mode 100644 docs/website/docs/api_reference/destinations/weaviate/weaviate_client.md delete mode 100644 docs/website/docs/api_reference/extract/decorators.md delete mode 100644 docs/website/docs/api_reference/extract/incremental.md delete mode 100644 docs/website/docs/api_reference/extract/pipe.md delete mode 100644 docs/website/docs/api_reference/extract/schema.md delete mode 100644 docs/website/docs/api_reference/extract/source.md delete mode 100644 docs/website/docs/api_reference/extract/typing.md delete mode 100644 docs/website/docs/api_reference/extract/utils.md delete mode 100644 docs/website/docs/api_reference/helpers/airflow_helper.md delete mode 100644 docs/website/docs/api_reference/helpers/dbt/configuration.md delete mode 100644 docs/website/docs/api_reference/helpers/dbt/runner.md delete mode 100644 docs/website/docs/api_reference/helpers/pandas_helper.md delete mode 100644 docs/website/docs/api_reference/helpers/streamlit_helper.md delete mode 100644 docs/website/docs/api_reference/load/configuration.md delete mode 100644 docs/website/docs/api_reference/load/load.md delete mode 100644 docs/website/docs/api_reference/normalize/configuration.md delete mode 100644 docs/website/docs/api_reference/normalize/normalize.md delete mode 100644 docs/website/docs/api_reference/pipeline/__init__.md delete mode 100644 docs/website/docs/api_reference/pipeline/configuration.md delete mode 100644 docs/website/docs/api_reference/pipeline/current.md delete mode 100644 docs/website/docs/api_reference/pipeline/dbt.md delete mode 100644 docs/website/docs/api_reference/pipeline/helpers.md delete mode 100644 docs/website/docs/api_reference/pipeline/pipeline.md delete mode 100644 docs/website/docs/api_reference/pipeline/trace.md delete mode 100644 docs/website/docs/api_reference/pipeline/track.md delete mode 100644 docs/website/docs/api_reference/reflection/script_inspector.md delete mode 100644 docs/website/docs/api_reference/sidebar.json delete mode 100644 docs/website/docs/api_reference/sources/helpers/requests/__init__.md delete mode 100644 docs/website/docs/api_reference/sources/helpers/requests/retry.md delete mode 100644 docs/website/docs/api_reference/sources/helpers/requests/session.md delete mode 100644 docs/website/docs/api_reference/sources/helpers/transform.md delete mode 100644 docs/website/docs/api_reference/version.md rename docs/{ => website}/pydoc-markdown.yml (76%) create mode 100644 docs/website/pydoc_markdown_dlt.py diff --git a/dlt/__init__.py b/dlt/__init__.py index 634b537005..c3725af118 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -2,21 +2,20 @@ How to create a data loading pipeline with dlt in 3 seconds: - 1. Write a pipeline script - >>> import dlt - >>> from dlt.sources.helpers import requests - >>> dlt.run(requests.get("https://pokeapi.co/api/v2/pokemon/").json()["results"], destination="duckdb", table_name="pokemon") +1. 
Write a pipeline script +>>> import dlt +>>> from dlt.sources.helpers import requests +>>> dlt.run(requests.get("https://pokeapi.co/api/v2/pokemon/").json()["results"], destination="duckdb", table_name="pokemon") - 2. Run your pipeline script - $ python pokemon.py - - 3. See and query your data with autogenerated Streamlit app - $ dlt pipeline dlt_pokemon show +2. Run your pipeline script + > $ python pokemon.py +3. See and query your data with autogenerated Streamlit app + > $ dlt pipeline dlt_pokemon show Or start with our pipeline template with sample PokeAPI (pokeapi.co) data loaded to bigquery - $ dlt init pokemon bigquery + > $ dlt init pokemon bigquery For more detailed info, see https://dlthub.com/docs/getting-started """ diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 59168024bf..08940ffe31 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -186,7 +186,7 @@ def parse_native_representation(self, native_value: Any) -> None: """Initialize the configuration fields by parsing the `native_value` which should be a native representation of the configuration or credentials, for example database connection string or JSON serialized GCP service credentials file. - ### Args: + #### Args: native_value (Any): A native representation of the configuration Raises: diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 858f2a0957..a5baf6f637 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -325,7 +325,7 @@ def source_state() -> DictStrAny: """Returns a dictionary with the source-scoped state. Source-scoped state may be shared across the resources of a particular source. Please avoid using source scoped state. Check the `resource_state` function for resource-scoped state that is visible within particular resource. Dlt state is preserved across pipeline runs and may be used to implement incremental loads. - ### Summary + #### Note: The source state is a python dictionary-like object that is available within the `@dlt.source` and `@dlt.resource` decorated functions and may be read and written to. The data within the state is loaded into destination together with any other extracted data and made automatically available to the source/resource extractor functions when they are run next time. When using the state: @@ -374,31 +374,31 @@ def resource_state(resource_name: str = None, source_state_: Optional[DictStrAny Note that this function accepts the resource name as optional argument. There are rare cases when `dlt` is not able to resolve resource name due to requesting function working in different thread than the main. You'll need to pass the name explicitly when you request resource_state from async functions or functions decorated with @defer. - ### Summary - The resource state is a python dictionary-like object that is available within the `@dlt.resource` decorated functions and may be read and written to. - The data within the state is loaded into destination together with any other extracted data and made automatically available to the source/resource extractor functions when they are run next time. - When using the state: - * The resource state is scoped to a particular resource requesting it. - * Any JSON-serializable values can be written and the read from the state. `dlt` dumps and restores instances of Python bytes, DateTime, Date and Decimal types. 
-    * The state available in the resource decorated function is writable and written values will be available on the next pipeline run
-
-    ### Example
-    The most typical use case for the state is to implement incremental load.
-    >>> @dlt.resource(write_disposition="append")
-    >>> def players_games(chess_url, players, start_month=None, end_month=None):
-    >>>     checked_archives = dlt.current.resource_state().setdefault("archives", [])
-    >>>     archives = players_archives(chess_url, players)
-    >>>     for url in archives:
-    >>>         if url in checked_archives:
-    >>>             print(f"skipping archive {url}")
-    >>>             continue
-    >>>         else:
-    >>>             print(f"getting archive {url}")
-    >>>             checked_archives.append(url)
-    >>>         # get the filtered archive
-    >>>         r = requests.get(url)
-    >>>         r.raise_for_status()
-    >>>         yield r.json().get("games", [])
+    Summary:
+        The resource state is a python dictionary-like object that is available within the `@dlt.resource` decorated functions and may be read and written to.
+        The data within the state is loaded into destination together with any other extracted data and made automatically available to the source/resource extractor functions when they are run next time.
+        When using the state:
+        * The resource state is scoped to a particular resource requesting it.
+        * Any JSON-serializable values can be written and read from the state. `dlt` dumps and restores instances of Python bytes, DateTime, Date and Decimal types.
+        * The state available in the resource decorated function is writable and written values will be available on the next pipeline run
+
+    Example:
+        The most typical use case for the state is to implement incremental load.
+        >>> @dlt.resource(write_disposition="append")
+        >>> def players_games(chess_url, players, start_month=None, end_month=None):
+        >>>     checked_archives = dlt.current.resource_state().setdefault("archives", [])
+        >>>     archives = players_archives(chess_url, players)
+        >>>     for url in archives:
+        >>>         if url in checked_archives:
+        >>>             print(f"skipping archive {url}")
+        >>>             continue
+        >>>         else:
+        >>>             print(f"getting archive {url}")
+        >>>             checked_archives.append(url)
+        >>>         # get the filtered archive
+        >>>         r = requests.get(url)
+        >>>         r.raise_for_status()
+        >>>         yield r.json().get("games", [])
     Here we store all the urls with game archives in the state and we skip loading them on next run. The archives are immutable. The state will grow with the coming months (and more players). Up to few thousand archives we should be good though.
diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py
index dd756b1e6b..c6dddf6b19 100644
--- a/dlt/extract/decorators.py
+++ b/dlt/extract/decorators.py
@@ -81,13 +81,13 @@ def source(
 ) -> Any:
     """A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`.
 
-    ### Summary
+    #### Note:
     A `dlt source` is a logical grouping of resources that are often extracted and loaded together. A source is associated with a schema, which describes the structure of the loaded data and provides instructions how to load it.
     Such schema contains table schemas that describe the structure of the data coming from the resources.
 
     Please refer to https://dlthub.com/docs/general-usage/source for a complete documentation.
 
-    ### Passing credentials
+    #### Credentials:
     Another important function of the source decorator is to provide credentials and other configuration to the code that extracts data.
     The decorator may automatically bind the source function arguments to the secret and config values.
     >>> @dlt.source
     >>> def chess(username, chess_url: str = dlt.config.value, api_secret = dlt.secrets.value, title: str = "GM"):
@@ -98,7 +98,7 @@ def source(
     Here `username` is a required, explicit python argument, `chess_url` is a required argument, that if not explicitly passed will be taken from configuration ie. `config.toml`, `api_secret` is a required argument, that if not explicitly passed will be taken from dlt secrets ie. `secrets.toml`.
     See https://dlthub.com/docs/general-usage/credentials for details.
 
-    ### Args:
+    #### Args:
        func: A function that returns a dlt resource or a list of those or a list of any data items that can be loaded by `dlt`.

        name (str, optional): A name of the source which is also the name of the associated schema. If not present, the function name will be used.
@@ -251,14 +251,14 @@ def resource(
 ) -> Any:
     """When used as a decorator, transforms any generator (yielding) function into a `dlt resource`. When used as a function, it transforms data in `data` argument into a `dlt resource`.
 
-    ### Summary
+    #### Note:
     A `resource`is a location within a `source` that holds the data with specific structure (schema) or coming from specific origin. A resource may be a rest API endpoint, table in the database or a tab in Google Sheets.
     A `dlt resource` is python representation of a `resource` that combines both data and metadata (table schema) that describes the structure and instructs the loading of the data. A `dlt resource` is also an `Iterable` and can used like any other iterable object ie. list or tuple.
 
     Please refer to https://dlthub.com/docs/general-usage/resource for a complete documentation.
 
-    ### Passing credentials
+    #### Credentials:
     If used as a decorator (`data` argument is a `Generator`), it may automatically bind the source function arguments to the secret and config values.
     >>> @dlt.resource
     >>> def user_games(username, chess_url: str = dlt.config.value, api_secret = dlt.secrets.value):
@@ -270,7 +270,7 @@ def resource(
     See https://dlthub.com/docs/general-usage/credentials for details.
     Note that if decorated function is an inner function, passing of the credentials will be disabled.
 
-    ### Args:
+    #### Args:
        data (Callable | Any, optional): a function to be decorated or a data compatible with `dlt` `run`.

        name (str, optional): A name of the resource that by default also becomes the name of the table to which the data is loaded.
@@ -297,7 +297,7 @@ def resource(

        depends_on (TUnboundDltResource, optional): Allows to pipe data from one resource to another to build multi-step pipelines.

-    ### Raises
+    Raises:
        ResourceNameMissing: indicates that name of the resource cannot be inferred from the `data` being passed.
        InvalidResourceDataType: indicates that the `data` argument cannot be converted into `dlt resource`
@@ -428,7 +428,7 @@ def transformer( # type: ignore
     You can bind the transformer early by specifying resource in `data_from` when the transformer is created or create dynamic bindings later with | operator which is demonstrated in example below:
 
-    ### Example
+    Example:
     >>> @dlt.resource
     >>> def players(title, chess_url=dlt.config.value):
     >>>     r = requests.get(f"{chess_url}titled/{title}")
@@ -444,7 +444,7 @@ def transformer( # type: ignore
     >>> # pipes the data from players into player profile to produce a list of player profiles
     >>> list(players("GM") | player_profile)

-    ### Args:
+    Args:
        f: (Callable): a function taking minimum one argument of TDataItems type which will receive data yielded from `data_from` resource.
data_from (Callable | Any, optional): a resource that will send data to the decorated function `f` diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 52a0381dfe..3f4104aa36 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -553,7 +553,6 @@ def __delitem__(self, resource_name: str) -> None: class DltSource(Iterable[TDataItem]): """Groups several `dlt resources` under a single schema and allows to perform operations on them. - ### Summary The instance of this class is created whenever you call the `dlt.source` decorated function. It automates several functions for you: * You can pass this instance to `dlt` `run` method in order to load all data present in the `dlt resources`. * You can select and deselect resources that you want to load via `with_resources` method diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py index 05dadf67b5..52584996cf 100644 --- a/dlt/helpers/streamlit_helper.py +++ b/dlt/helpers/streamlit_helper.py @@ -205,7 +205,7 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: def write_data_explorer_page(pipeline: Pipeline, schema_name: str = None, show_dlt_tables: bool = False, example_query: str = "", show_charts: bool = True) -> None: """Writes Streamlit app page with a schema and live data preview. - ### Args: + #### Args: pipeline (Pipeline): Pipeline instance to use. schema_name (str, optional): Name of the schema to display. If None, default schema is used. show_dlt_tables (bool, optional): Should show DLT internal tables. Defaults to False. diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index df4314cf0d..b6539c1b78 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -32,7 +32,7 @@ def pipeline( ) -> Pipeline: """Creates a new instance of `dlt` pipeline, which moves the data from the source ie. a REST API to a destination ie. database or a data lake. - ### Summary + #### Note: The `pipeline` functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data. The created `Pipeline` object lets you load the data from any source with `run` method or to have more granular control over the loading process with `extract`, `normalize` and `load` methods. @@ -41,7 +41,7 @@ def pipeline( - Pipeline architecture and data loading steps: https://dlthub.com/docs/reference - List of supported destinations: https://dlthub.com/docs/dlt-ecosystem/destinations - ### Args: + #### Args: pipeline_name (str, optional): A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs. Defaults to the file name of pipeline script with `dlt_` prefix added. @@ -73,7 +73,7 @@ def pipeline( `extract`, `normalize` and `load` stage. Pass a string with a collector name or configure your own by choosing from `dlt.progress` module. We support most of the progress libraries: try passing `tqdm`, `enlighten` or `alive_progress` or `log` to write to console/log. - ### Returns: + #### Returns: Pipeline: An instance of `Pipeline` class with. Please check the documentation of `run` method for information on what to do with it. """ @@ -179,7 +179,7 @@ def run( ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. 
- ### Summary + #### Note: This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (ie. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. The data may be supplied in several forms: @@ -190,12 +190,12 @@ def run( Please note that `dlt` deals with `bytes`, `datetime`, `decimal` and `uuid` objects so you are free to load binary data or documents containing dates. - ### Execution + #### Execution: The `run` method will first use `sync_destination` method to synchronize pipeline state and schemas with the destination. You can disable this behavior with `restore_from_destination` configuration option. Next it will make sure that data from the previous is fully processed. If not, `run` method normalizes and loads pending data items. Only then the new data from `data` argument is extracted, normalized and loaded. - ### Args: + #### Args: data (Any): Data to be loaded to destination destination (str | DestinationReference, optional): A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`. @@ -220,9 +220,9 @@ def run( schema (Schema, optional): An explicit `Schema` object in which all table schemas will be grouped. By default `dlt` takes the schema from the source (if passed in `data` argument) or creates a default one itself. - ### Raises: + Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. - ### Returns: + Returns: LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please not that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. """ destination = DestinationReference.from_name(destination) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 98eb6d408f..080d19d892 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -393,7 +393,7 @@ def run( ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. - ### Summary + #### Note: This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (ie. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. The data may be supplied in several forms: @@ -404,12 +404,12 @@ def run( Please note that `dlt` deals with `bytes`, `datetime`, `decimal` and `uuid` objects so you are free to load documents containing ie. binary data or dates. - ### Execution + #### Execution: The `run` method will first use `sync_destination` method to synchronize pipeline state and schemas with the destination. You can disable this behavior with `restore_from_destination` configuration option. Next it will make sure that data from the previous is fully processed. If not, `run` method normalizes, loads pending data items and **exits** If there was no pending data, new data from `data` argument is extracted, normalized and loaded. - ### Args: + #### Args: data (Any): Data to be loaded to destination destination (str | DestinationReference, optional): A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`. @@ -439,9 +439,9 @@ def run( loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional). The file format the loader will use to create the load package. 
Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. - ### Raises: + Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. - ### Returns: + Returns: LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please not that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. """ signals.raise_if_signalled() @@ -475,7 +475,7 @@ def run( def sync_destination(self, destination: TDestinationReferenceArg = None, staging: TDestinationReferenceArg = None, dataset_name: str = None) -> None: """Synchronizes pipeline state with the `destination`'s state kept in `dataset_name` - ### Summary + #### Note: Attempts to restore pipeline state and schemas from the destination. Requires the state that is present at the destination to have a higher version number that state kept locally in working directory. In such a situation the local state, schemas and intermediate files with the data will be deleted and replaced with the state and schema present in the destination. diff --git a/dlt/sources/helpers/requests/retry.py b/dlt/sources/helpers/requests/retry.py index d1e7a1a7f3..8f824e0c4f 100644 --- a/dlt/sources/helpers/requests/retry.py +++ b/dlt/sources/helpers/requests/retry.py @@ -116,11 +116,11 @@ def _make_retry( class Client: """Wrapper for `requests` to create a `Session` with configurable retry functionality. - ### Summary + #### Note: Create a `requests.Session` which automatically retries requests in case of error. By default retries are triggered for `5xx` and `429` status codes and when the server is unreachable or drops connection. - ### Custom retry condition + #### Custom retry condition You can provide one or more custom predicates for specific retry condition. The predicate is called after every request with the resulting response and/or exception. For example, this will trigger a retry when the response text is `error`: @@ -134,7 +134,7 @@ class Client: The retry is triggered when either any of the predicates or the default conditions based on status code/exception are `True`. - ### Args: + #### Args: request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds. max_connections: Max connections per host in the HTTPAdapter pool raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`) diff --git a/dlt/sources/helpers/requests/session.py b/dlt/sources/helpers/requests/session.py index e23361a3ab..b12d8da73f 100644 --- a/dlt/sources/helpers/requests/session.py +++ b/dlt/sources/helpers/requests/session.py @@ -21,7 +21,7 @@ def _timeout_to_seconds(timeout: TRequestTimeout) -> Optional[Union[Tuple[float, class Session(BaseSession): """Requests session which by default adds a timeout to all requests and calls `raise_for_status()` on response - ### Args + #### Args: timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds. May be a single value or a tuple for separate (connect, read) timeout. 
raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`) diff --git a/docs/website/.gitignore b/docs/website/.gitignore index b2d6de3062..fe0be8784d 100644 --- a/docs/website/.gitignore +++ b/docs/website/.gitignore @@ -7,6 +7,7 @@ # Generated files .docusaurus .cache-loader +docs/api_reference # Misc .DS_Store diff --git a/docs/website/docs/.dlt/config.toml b/docs/website/docs/.dlt/config.toml new file mode 100644 index 0000000000..d38b85c1a1 --- /dev/null +++ b/docs/website/docs/.dlt/config.toml @@ -0,0 +1,3 @@ +[destination.weaviate] +vectorizer="text2vec-contextionary" +module_config={text2vec-contextionary = { vectorizeClassName = false, vectorizePropertyName = true}} \ No newline at end of file diff --git a/docs/website/docs/api_reference/__init__/__init__.md b/docs/website/docs/api_reference/__init__/__init__.md deleted file mode 100644 index 0915aa965a..0000000000 --- a/docs/website/docs/api_reference/__init__/__init__.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -sidebar_label: __init__ -title: __init__ ---- - -data load tool (dlt) — the open-source Python library for data loading - -How to create a data loading pipeline with dlt in 3 seconds: - - 1. Write a pipeline script - >>> import dlt - >>> from dlt.sources.helpers import requests - >>> dlt.run(requests.get("https://pokeapi.co/api/v2/pokemon/").json()["results"], destination="duckdb", table_name="pokemon") - - 2. Run your pipeline script - $ python pokemon.py - - 3. See and query your data with autogenerated Streamlit app - $ dlt pipeline dlt_pokemon show - - -Or start with our pipeline template with sample PokeAPI (pokeapi.co) data loaded to bigquery - - $ dlt init pokemon bigquery - -For more detailed info, see https://dlthub.com/docs/getting-started - -#### TSecretValue - -When typing source/resource function arguments it indicates that a given argument is a secret and should be taken from dlt.secrets. - -#### TCredentials - -When typing source/resource function arguments it indicates that a given argument represents credentials and should be taken from dlt.secrets. Credentials may be a string, dictionary or any other type. 
- diff --git a/docs/website/docs/api_reference/cli/echo.md b/docs/website/docs/api_reference/cli/echo.md deleted file mode 100644 index 04185a6c37..0000000000 --- a/docs/website/docs/api_reference/cli/echo.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -sidebar_label: echo -title: cli.echo ---- - -#### always\_choose - -```python -@contextlib.contextmanager -def always_choose(always_choose_default: bool, - always_choose_value: Any) -> Iterator[None] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/cli/echo.py#L11) - -Temporarily answer all confirmations and prompts with the values specified in arguments - diff --git a/docs/website/docs/api_reference/cli/pipeline_files.md b/docs/website/docs/api_reference/cli/pipeline_files.md deleted file mode 100644 index cbb5fe5a5a..0000000000 --- a/docs/website/docs/api_reference/cli/pipeline_files.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -sidebar_label: pipeline_files -title: cli.pipeline_files ---- - -#### find\_conflict\_files - -```python -def find_conflict_files( - local_index: TVerifiedSourceFileIndex, - remote_new: Dict[str, TVerifiedSourceFileEntry], - remote_modified: Dict[str, TVerifiedSourceFileEntry], - remote_deleted: Dict[str, TVerifiedSourceFileEntry], - dest_storage: FileStorage) -> Tuple[List[str], List[str]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/cli/pipeline_files.py#L221) - -Use files index from .sources to identify modified files via sha3 content hash - diff --git a/docs/website/docs/api_reference/cli/requirements.md b/docs/website/docs/api_reference/cli/requirements.md deleted file mode 100644 index 3bf96e6f90..0000000000 --- a/docs/website/docs/api_reference/cli/requirements.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -sidebar_label: requirements -title: cli.requirements ---- - -## SourceRequirements Objects - -```python -class SourceRequirements() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/cli/requirements.py#L8) - -Helper class to parse and manipulate entries in source's requirements.txt - -#### dlt\_requirement - -Final dlt requirement that may be updated with destination extras - -#### dlt\_requirement\_base - -Original dlt requirement without extras - -#### from\_string - -```python -@classmethod -def from_string(cls, requirements: str) -> "SourceRequirements" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/cli/requirements.py#L23) - -Initialize from requirements.txt string, one dependency per line - -#### update\_dlt\_extras - -```python -def update_dlt_extras(destination_name: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/cli/requirements.py#L36) - -Update the dlt requirement to include destination - -#### is\_installed\_dlt\_compatible - -```python -def is_installed_dlt_compatible() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/cli/requirements.py#L48) - -Check whether currently installed version is compatible with dlt requirement - -For example, requirements.txt of the source may specify dlt>=0.3.5,<0.4.0 -and we check whether the installed dlt version (e.g. 0.3.6) falls within this range. 
- diff --git a/docs/website/docs/api_reference/common/configuration/accessors.md b/docs/website/docs/api_reference/common/configuration/accessors.md deleted file mode 100644 index a2533d8a73..0000000000 --- a/docs/website/docs/api_reference/common/configuration/accessors.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -sidebar_label: accessors -title: common.configuration.accessors ---- - -## \_ConfigAccessor Objects - -```python -class _ConfigAccessor(_Accessor) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/accessors.py#L88) - -Provides direct access to configured values that are not secrets. - -#### config\_providers - -```python -@property -def config_providers() -> Sequence[ConfigProvider] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/accessors.py#L92) - -Return a list of config providers, in lookup order - -#### writable\_provider - -```python -@property -def writable_provider() -> ConfigProvider -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/accessors.py#L101) - -find first writable provider that does not support secrets - should be config.toml - -#### value - -A placeholder that tells dlt to replace it with actual config value during the call to a source or resource decorated function. - -## \_SecretsAccessor Objects - -```python -class _SecretsAccessor(_Accessor) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/accessors.py#L109) - -Provides direct access to secrets. - -#### config\_providers - -```python -@property -def config_providers() -> Sequence[ConfigProvider] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/accessors.py#L113) - -Return a list of config providers that can hold secrets, in lookup order - -#### writable\_provider - -```python -@property -def writable_provider() -> ConfigProvider -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/accessors.py#L122) - -find first writable provider that supports secrets - should be secrets.toml - -#### value - -A placeholder that tells dlt to replace it with actual secret during the call to a source or resource decorated function. - -#### config - -Dictionary-like access to all config values to dlt - -#### secrets - -Dictionary-like access to all secrets known known to dlt - diff --git a/docs/website/docs/api_reference/common/configuration/container.md b/docs/website/docs/api_reference/common/configuration/container.md deleted file mode 100644 index 4b7c953240..0000000000 --- a/docs/website/docs/api_reference/common/configuration/container.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -sidebar_label: container -title: common.configuration.container ---- - -## Container Objects - -```python -class Container() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/container.py#L10) - -A singleton injection container holding several injection contexts. Implements basic dictionary interface. - -Injection context is identified by its type and available via dict indexer. The common pattern is to instantiate default context value -if it is not yet present in container. 
- -The indexer is settable and allows to explicitly set the value. This is required by for context that needs to be explicitly instantiated. - -The `injectable_context` allows to set a context with a `with` keyword and then restore the previous one after it gets out of scope. - -#### injectable\_context - -```python -@contextmanager -def injectable_context(config: TConfiguration) -> Iterator[TConfiguration] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/container.py#L65) - -A context manager that will insert `config` into the container and restore the previous value when it gets out of scope. - diff --git a/docs/website/docs/api_reference/common/configuration/exceptions.md b/docs/website/docs/api_reference/common/configuration/exceptions.md deleted file mode 100644 index 0f697a158d..0000000000 --- a/docs/website/docs/api_reference/common/configuration/exceptions.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -sidebar_label: exceptions -title: common.configuration.exceptions ---- - -## ContainerException Objects - -```python -class ContainerException(DltException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L21) - -base exception for all exceptions related to injectable container - -## ConfigProviderException Objects - -```python -class ConfigProviderException(ConfigurationException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L26) - -base exceptions for all exceptions raised by config providers - -## ConfigFieldMissingException Objects - -```python -class ConfigFieldMissingException(KeyError, ConfigurationException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L36) - -raises when not all required config fields are present - -## UnmatchedConfigHintResolversException Objects - -```python -class UnmatchedConfigHintResolversException(ConfigurationException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L55) - -Raised when using `@resolve_type` on a field that doesn't exist in the spec - -## FinalConfigFieldException Objects - -```python -class FinalConfigFieldException(ConfigurationException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L70) - -rises when field was annotated as final ie Final[str] and the value is modified by config provider - -## ConfigValueCannotBeCoercedException Objects - -```python -class ConfigValueCannotBeCoercedException(ConfigurationValueError) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L76) - -raises when value returned by config provider cannot be coerced to hinted type - -## ConfigFileNotFoundException Objects - -```python -class ConfigFileNotFoundException(ConfigurationException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L96) - -thrown when configuration file cannot be found in config folder - -## ConfigFieldMissingTypeHintException Objects - -```python -class ConfigFieldMissingTypeHintException(ConfigurationException) -``` - 
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L103) - -thrown when configuration specification does not have type hint - -## ConfigFieldTypeHintNotSupported Objects - -```python -class ConfigFieldTypeHintNotSupported(ConfigurationException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/exceptions.py#L112) - -thrown when configuration specification uses not supported type in hint - diff --git a/docs/website/docs/api_reference/common/configuration/inject.md b/docs/website/docs/api_reference/common/configuration/inject.md deleted file mode 100644 index f4bf672f56..0000000000 --- a/docs/website/docs/api_reference/common/configuration/inject.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -sidebar_label: inject -title: common.configuration.inject ---- - -#### with\_config - -```python -def with_config(func: Optional[AnyFun] = None, - spec: Type[BaseConfiguration] = None, - sections: Tuple[str, ...] = (), - sections_merge_style: ConfigSectionContext. - TMergeFunc = ConfigSectionContext.prefer_incoming, - auto_pipeline_section: bool = False, - include_defaults: bool = True) -> Callable[[TFun], TFun] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/inject.py#L53) - -Injects values into decorated function arguments following the specification in `spec` or by deriving one from function's signature. - -The synthesized spec contains the arguments marked with `dlt.secrets.value` and `dlt.config.value` which are required to be injected at runtime. -Optionally (and by default) arguments with default values are included in spec as well. - -**Arguments**: - -- `func` _Optional[AnyFun], optional_ - A function with arguments to be injected. Defaults to None. -- `spec` _Type[BaseConfiguration], optional_ - A specification of injectable arguments. Defaults to None. -- `sections` _Tuple[str, ...], optional_ - A set of config sections in which to look for arguments values. Defaults to (). -- `prefer_existing_sections` - (bool, optional): When joining existing section context, the existing context will be preferred to the one in `sections`. Default: False -- `auto_pipeline_section` _bool, optional_ - If True, a top level pipeline section will be added if `pipeline_name` argument is present . Defaults to False. -- `include_defaults` _bool, optional_ - If True then arguments with default values will be included in synthesized spec. 
If False only the required arguments marked with `dlt.secrets.value` and `dlt.config.value` are included - - -**Returns**: - - Callable[[TFun], TFun]: A decorated function - -#### last\_config - -```python -def last_config(**kwargs: Any) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/inject.py#L178) - -Get configuration instance used to inject function arguments - diff --git a/docs/website/docs/api_reference/common/configuration/paths.md b/docs/website/docs/api_reference/common/configuration/paths.md deleted file mode 100644 index 4ac4fa89ec..0000000000 --- a/docs/website/docs/api_reference/common/configuration/paths.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -sidebar_label: paths -title: common.configuration.paths ---- - -#### get\_dlt\_project\_dir - -```python -def get_dlt_project_dir() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/paths.py#L11) - -The dlt project dir is the current working directory but may be overridden by DLT_PROJECT_DIR env variable. - -#### get\_dlt\_settings\_dir - -```python -def get_dlt_settings_dir() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/paths.py#L16) - -Returns a path to dlt settings directory. If not overridden it resides in current working directory - -The name of the setting folder is '.dlt'. The path is current working directory '.' but may be overridden by DLT_PROJECT_DIR env variable. - -#### make\_dlt\_settings\_path - -```python -def make_dlt_settings_path(path: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/paths.py#L24) - -Returns path to file in dlt settings folder. - -#### get\_dlt\_data\_dir - -```python -def get_dlt_data_dir() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/paths.py#L29) - -Gets default directory where pipelines' data will be stored -1. in user home directory: ~/.dlt/ -2. if current user is root: in /var/dlt/ -3. if current user does not have a home directory: in /tmp/dlt/ -4. if DLT_DATA_DIR is set in env then it is used - diff --git a/docs/website/docs/api_reference/common/configuration/providers/toml.md b/docs/website/docs/api_reference/common/configuration/providers/toml.md deleted file mode 100644 index bd950390e9..0000000000 --- a/docs/website/docs/api_reference/common/configuration/providers/toml.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -sidebar_label: toml -title: common.configuration.providers.toml ---- - -## VaultTomlProvider Objects - -```python -class VaultTomlProvider(BaseTomlProvider) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/providers/toml.py#L111) - -A toml-backed Vault abstract config provider. - -This provider allows implementation of providers that store secrets in external vaults: like Hashicorp, Google Secrets or Airflow Metadata. -The basic working principle is obtain config and secrets values from Vault keys and reconstitute a `secrets.toml` like document that is then used -as a cache. - -The implemented must provide `_look_vault` method that returns a value from external vault from external key. 
- -To reduce number of calls to external vaults the provider is searching for a known configuration fragments which should be toml documents and merging -them with the -- only keys with secret type hint (CredentialsConfiguration, TSecretValue) will be looked up by default. -- provider gathers `toml` document fragments that contain source and destination credentials in path specified below -- single values will not be retrieved, only toml fragments by default - -#### \_\_init\_\_ - -```python -def __init__(only_secrets: bool, only_toml_fragments: bool) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/providers/toml.py#L128) - -Initializes the toml backed Vault provider by loading a toml fragment from `dlt_secrets_toml` key and using it as initial configuration. - -_extended_summary_ - -**Arguments**: - -- `only_secrets` _bool_ - Only looks for secret values (CredentialsConfiguration, TSecretValue) by returning None (not found) -- `only_toml_fragments` _bool_ - Only load the known toml fragments and ignore any other lookups by returning None (not found) - -## TomlFileProvider Objects - -```python -class TomlFileProvider(BaseTomlProvider) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/providers/toml.py#L219) - -#### \_\_init\_\_ - -```python -def __init__(file_name: str, - project_dir: str = None, - add_global_config: bool = False) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/providers/toml.py#L220) - -Creates config provider from a `toml` file - -The provider loads the `toml` file with specified name and from specified folder. If `add_global_config` flags is specified, -it will look for `file_name` in `dlt` home dir. The "project" (`project_dir`) values overwrite the "global" values. - -If none of the files exist, an empty provider is created. - -**Arguments**: - -- `file_name` _str_ - The name of `toml` file to load -- `project_dir` _str, optional_ - The location of `file_name`. If not specified, defaults to $cwd/.dlt -- `add_global_config` _bool, optional_ - Looks for `file_name` in `dlt` home directory which in most cases is $HOME/.dlt - - -**Raises**: - -- `TomlProviderReadException` - File could not be read, most probably `toml` parsing error - diff --git a/docs/website/docs/api_reference/common/configuration/resolve.md b/docs/website/docs/api_reference/common/configuration/resolve.md deleted file mode 100644 index bc66a02ed5..0000000000 --- a/docs/website/docs/api_reference/common/configuration/resolve.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -sidebar_label: resolve -title: common.configuration.resolve ---- - -#### initialize\_credentials - -```python -def initialize_credentials(hint: Any, - initial_value: Any) -> CredentialsConfiguration -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/resolve.py#L39) - -Instantiate credentials of type `hint` with `initial_value`. The initial value must be a native representation (typically string) -or a dictionary corresponding to credential's fields. In case of union of credentials, the first configuration in the union fully resolved by -initial value will be instantiated. 
- -#### inject\_section - -```python -def inject_section( - section_context: ConfigSectionContext, - merge_existing: bool = True) -> ContextManager[ConfigSectionContext] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/resolve.py#L66) - -Context manager that sets section specified in `section_context` to be used during configuration resolution. Optionally merges the context already in the container with the one provided - -**Arguments**: - -- `section_context` _ConfigSectionContext_ - Instance providing a pipeline name and section context -- `merge_existing` _bool, optional_ - Merges existing section context with `section_context` in the arguments by executing `merge_style` function on `section_context`. Defaults to True. - - Default Merge Style: - Gets `pipeline_name` and `sections` from existing context if they are not provided in `section_context` argument. - - -**Yields**: - -- `Iterator[ConfigSectionContext]` - Context manager with current section context - diff --git a/docs/website/docs/api_reference/common/configuration/specs/api_credentials.md b/docs/website/docs/api_reference/common/configuration/specs/api_credentials.md deleted file mode 100644 index 94d8406117..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/api_credentials.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -sidebar_label: api_credentials -title: common.configuration.specs.api_credentials ---- - -## OAuth2Credentials Objects - -```python -@configspec -class OAuth2Credentials(CredentialsConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/api_credentials.py#L8) - -#### token - -Access token - -#### auth - -```python -def auth(scopes: Union[str, List[str]] = None, - redirect_url: str = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/api_credentials.py#L21) - -Authorizes the client using the available credentials - -Uses the `refresh_token` grant if refresh token is available. Note that `scopes` and `redirect_url` are ignored in this flow. -Otherwise obtains refresh_token via web flow and authorization code grant. - -Sets `token` and `access_token` fields in the credentials on successful authorization. - -**Arguments**: - -- `scopes` _Union[str, List[str]], optional_ - Additional scopes to add to configured scopes. To be used in web flow. Defaults to None. -- `redirect_url` _str, optional_ - Redirect url in case of web flow. Defaults to None. 
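
A short sketch of `inject_section` in use, assuming `ConfigSectionContext` can be constructed with the `pipeline_name` and `sections` fields mentioned above; the section names are placeholders.

```python
from dlt.common.configuration.resolve import inject_section
from dlt.common.configuration.specs.config_section_context import ConfigSectionContext

# scope configuration resolution to a hypothetical "sources" / "my_module" section
section = ConfigSectionContext(pipeline_name="my_pipeline", sections=("sources", "my_module"))
with inject_section(section):
    # configuration resolved inside this block searches the injected sections first
    ...
```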
- diff --git a/docs/website/docs/api_reference/common/configuration/specs/aws_credentials.md b/docs/website/docs/api_reference/common/configuration/specs/aws_credentials.md deleted file mode 100644 index 4effb63c72..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/aws_credentials.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -sidebar_label: aws_credentials -title: common.configuration.specs.aws_credentials ---- - -## AwsCredentialsWithoutDefaults Objects - -```python -@configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/aws_credentials.py#L11) - -#### to\_s3fs\_credentials - -```python -def to_s3fs_credentials() -> Dict[str, Optional[str]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/aws_credentials.py#L19) - -Dict of keyword arguments that can be passed to s3fs - -#### to\_native\_representation - -```python -def to_native_representation() -> Dict[str, Optional[str]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/aws_credentials.py#L28) - -Return a dict that can be passed as kwargs to boto3 session - -## AwsCredentials Objects - -```python -@configspec -class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/aws_credentials.py#L34) - -#### parse\_native\_representation - -```python -def parse_native_representation(native_value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/aws_credentials.py#L82) - -Import external boto3 session - diff --git a/docs/website/docs/api_reference/common/configuration/specs/azure_credentials.md b/docs/website/docs/api_reference/common/configuration/specs/azure_credentials.md deleted file mode 100644 index ab22393dc9..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/azure_credentials.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -sidebar_label: azure_credentials -title: common.configuration.specs.azure_credentials ---- - -## AzureCredentialsWithoutDefaults Objects - -```python -@configspec -class AzureCredentialsWithoutDefaults(CredentialsConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/azure_credentials.py#L14) - -Credentials for azure blob storage, compatible with adlfs - -#### azure\_sas\_token\_permissions - -Permissions to use when generating a SAS token. 
Ignored when sas token is provided directly - -#### to\_adlfs\_credentials - -```python -def to_adlfs_credentials() -> Dict[str, Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/azure_credentials.py#L23) - -Return a dict that can be passed as kwargs to adlfs - diff --git a/docs/website/docs/api_reference/common/configuration/specs/base_configuration.md b/docs/website/docs/api_reference/common/configuration/specs/base_configuration.md deleted file mode 100644 index fce3cead3f..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/base_configuration.md +++ /dev/null @@ -1,213 +0,0 @@ ---- -sidebar_label: base_configuration -title: common.configuration.specs.base_configuration ---- - -#### configspec - -```python -def configspec( - cls: Optional[Type[Any]] = None -) -> Union[Type[TAnyClass], Callable[[Type[TAnyClass]], Type[TAnyClass]]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L94) - -Converts (via derivation) any decorated class to a Python dataclass that may be used as a spec to resolve configurations - -In comparison the Python dataclass, a spec implements full dictionary interface for its attributes, allows instance creation from ie. strings -or other types (parsing, deserialization) and control over configuration resolution process. See `BaseConfiguration` and CredentialsConfiguration` for -more information. - -## BaseConfiguration Objects - -```python -@configspec -class BaseConfiguration(MutableMapping[str, Any]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L170) - -#### \_\_is\_resolved\_\_ - -True when all config fields were resolved and have a specified value type - -#### \_\_section\_\_ - -Obligatory section used by config providers when searching for keys, always present in the search path - -#### \_\_exception\_\_ - -Holds the exception that prevented the full resolution - -#### \_\_config\_gen\_annotations\_\_ - -Additional annotations for config generator, currently holds a list of fields of interest that have defaults - -#### \_\_dataclass\_fields\_\_ - -Typing for dataclass fields - -#### parse\_native\_representation - -```python -def parse_native_representation(native_value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L185) - -Initialize the configuration fields by parsing the `native_value` which should be a native representation of the configuration -or credentials, for example database connection string or JSON serialized GCP service credentials file. - -### Args: -native_value (Any): A native representation of the configuration - -**Raises**: - -- `NotImplementedError` - This configuration does not have a native representation -- `ValueError` - The value provided cannot be parsed as native representation - -#### to\_native\_representation - -```python -def to_native_representation() -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L198) - -Represents the configuration instance in its native form ie. database connection string or JSON serialized GCP service credentials file. 
- -**Raises**: - -- `NotImplementedError` - This configuration does not have a native representation - - -**Returns**: - -- `Any` - A native representation of the configuration - -#### get\_resolvable\_fields - -```python -@classmethod -def get_resolvable_fields(cls) -> Dict[str, type] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L219) - -Returns a mapping of fields to their type hints. Dunders should not be resolved and are not returned - -#### is\_partial - -```python -def is_partial() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L226) - -Returns True when any required resolvable field has its value missing. - -#### copy - -```python -def copy() -> _T -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L239) - -Returns a deep copy of the configuration instance - -#### \_\_iter\_\_ - -```python -def __iter__() -> Iterator[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L266) - -Iterator or valid key names - -## CredentialsConfiguration Objects - -```python -@configspec -class CredentialsConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L307) - -Base class for all credentials. Credentials are configurations that may be stored only by providers supporting secrets. - -#### \_\_init\_\_ - -```python -def __init__(init_value: Any = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L312) - -Initializes credentials from `init_value` - -Init value may be a native representation of the credentials or a dict. In case of native representation (for example a connection string or JSON with service account credentials) -a `parse_native_representation` method will be used to parse it. In case of a dict, the credentials object will be updated with key: values of the dict. -Unexpected values in the dict will be ignored. - -Credentials will be marked as resolved if all required fields are set. - -#### to\_native\_credentials - -```python -def to_native_credentials() -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L330) - -Returns native credentials object. - -By default calls `to_native_representation` method. - -#### \_\_str\_\_ - -```python -def __str__() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L337) - -Get string representation of credentials to be displayed, with all secret parts removed - -## CredentialsWithDefault Objects - -```python -class CredentialsWithDefault() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L342) - -A mixin for credentials that can be instantiated from default ie. 
from well known env variable with credentials - -## ContainerInjectableContext Objects - -```python -@configspec -class ContainerInjectableContext(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L358) - -Base class for all configurations that may be injected from a Container. Injectable configuration is called a context - -#### can\_create\_default - -If True, `Container` is allowed to create default context instance, if none exists - -#### add\_extras - -```python -def add_extras() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/base_configuration.py#L364) - -Called right after context was added to the container. Benefits mostly the config provider injection context which adds extra providers using the initial ones. - diff --git a/docs/website/docs/api_reference/common/configuration/specs/config_providers_context.md b/docs/website/docs/api_reference/common/configuration/specs/config_providers_context.md deleted file mode 100644 index e75dcd49ba..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/config_providers_context.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -sidebar_label: config_providers_context -title: common.configuration.specs.config_providers_context ---- - -## ConfigProvidersContext Objects - -```python -@configspec -class ConfigProvidersContext(ContainerInjectableContext) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_providers_context.py#L22) - -Injectable list of providers used by the configuration `resolve` module - -#### add\_extras - -```python -def add_extras() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_providers_context.py#L34) - -Adds extra providers. 
Extra providers may use initial providers when setting up - diff --git a/docs/website/docs/api_reference/common/configuration/specs/config_section_context.md b/docs/website/docs/api_reference/common/configuration/specs/config_section_context.md deleted file mode 100644 index 98971befc5..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/config_section_context.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -sidebar_label: config_section_context -title: common.configuration.specs.config_section_context ---- - -## ConfigSectionContext Objects - -```python -@configspec -class ConfigSectionContext(ContainerInjectableContext) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_section_context.py#L7) - -#### merge - -```python -def merge(existing: "ConfigSectionContext") -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_section_context.py#L17) - -Merges existing context into incoming using a merge style function - -#### source\_name - -```python -def source_name() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_section_context.py#L22) - -Gets name of a source from `sections` - -#### source\_section - -```python -def source_section() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_section_context.py#L28) - -Gets section of a source from `sections` - -#### prefer\_existing - -```python -@staticmethod -def prefer_existing(incoming: "ConfigSectionContext", - existing: "ConfigSectionContext") -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_section_context.py#L41) - -Prefer existing section context when merging this context before injecting - -#### resource\_merge\_style - -```python -@staticmethod -def resource_merge_style(incoming: "ConfigSectionContext", - existing: "ConfigSectionContext") -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/config_section_context.py#L48) - -If top level section is same and there are 3 sections it replaces second element (source module) from existing and keeps the 3rd element (name) - diff --git a/docs/website/docs/api_reference/common/configuration/specs/gcp_credentials.md b/docs/website/docs/api_reference/common/configuration/specs/gcp_credentials.md deleted file mode 100644 index 0a67f76d4b..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/gcp_credentials.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -sidebar_label: gcp_credentials -title: common.configuration.specs.gcp_credentials ---- - -## GcpCredentials Objects - -```python -@configspec -class GcpCredentials(CredentialsConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L15) - -#### location - -DEPRECATED! and present only for backward compatibility. 
please set bigquery location in BigQuery configuration - -#### to\_native\_credentials - -```python -def to_native_credentials() -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L30) - -Returns respective native credentials for service account or oauth2 that can be passed to google clients - -## GcpServiceAccountCredentialsWithoutDefaults Objects - -```python -@configspec -class GcpServiceAccountCredentialsWithoutDefaults(GcpCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L42) - -#### type - -noqa: A003 - -#### parse\_native\_representation - -```python -def parse_native_representation(native_value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L47) - -Accepts ServiceAccountCredentials as native value. In other case reverts to serialized services.json - -#### to\_native\_credentials - -```python -def to_native_credentials() -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L83) - -Returns google.oauth2.service_account.Credentials - -## GcpOAuthCredentialsWithoutDefaults Objects - -```python -@configspec -class GcpOAuthCredentialsWithoutDefaults(GcpCredentials, OAuth2Credentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L98) - -#### parse\_native\_representation - -```python -def parse_native_representation(native_value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L103) - -Accepts Google OAuth2 credentials as native value. 
In other case reverts to serialized oauth client secret json - -#### on\_partial - -```python -def on_partial() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L151) - -Allows for an empty refresh token if the session is interactive or tty is attached - -#### to\_native\_credentials - -```python -def to_native_credentials() -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L183) - -Returns google.oauth2.credentials.Credentials - -## GcpDefaultCredentials Objects - -```python -@configspec -class GcpDefaultCredentials(CredentialsWithDefault, GcpCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L213) - -#### parse\_native\_representation - -```python -def parse_native_representation(native_value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L217) - -Accepts google credentials as native value - -#### on\_partial - -```python -def on_partial() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/gcp_credentials.py#L248) - -Looks for default google credentials and resolves configuration if found. Otherwise continues as partial - diff --git a/docs/website/docs/api_reference/common/configuration/specs/known_sections.md b/docs/website/docs/api_reference/common/configuration/specs/known_sections.md deleted file mode 100644 index c91f652de2..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/known_sections.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -sidebar_label: known_sections -title: common.configuration.specs.known_sections ---- - -#### SOURCES - -a top section holding source and resource configs often within their own sections named after modules they are in - -#### DESTINATION - -a top section holding sections named after particular destinations with configurations and credentials. 
- -#### LOAD - -load and load storage configuration - -#### NORMALIZE - -normalize and normalize storage configuration - -#### EXTRACT - -extract stage of the pipeline - -#### PROVIDERS - -secrets and config providers - -#### DATA\_WRITER - -default section holding BufferedDataWriter settings - -#### DBT\_PACKAGE\_RUNNER - -dbt package runner configuration (DBTRunnerConfiguration) - diff --git a/docs/website/docs/api_reference/common/configuration/specs/run_configuration.md b/docs/website/docs/api_reference/common/configuration/specs/run_configuration.md deleted file mode 100644 index 4804e93da0..0000000000 --- a/docs/website/docs/api_reference/common/configuration/specs/run_configuration.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -sidebar_label: run_configuration -title: common.configuration.specs.run_configuration ---- - -## RunConfiguration Objects - -```python -@configspec -class RunConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/specs/run_configuration.py#L13) - -#### sentry\_dsn - -keep None to disable Sentry - -#### dlthub\_telemetry - -enable or disable dlthub telemetry - -#### request\_timeout - -Timeout for http requests - -#### request\_max\_attempts - -Max retry attempts for http clients - -#### request\_backoff\_factor - -Multiplier applied to exponential retry delay for http requests - -#### request\_max\_retry\_delay - -Maximum delay between http request retries - diff --git a/docs/website/docs/api_reference/common/configuration/utils.md b/docs/website/docs/api_reference/common/configuration/utils.md deleted file mode 100644 index aefacaa895..0000000000 --- a/docs/website/docs/api_reference/common/configuration/utils.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -sidebar_label: utils -title: common.configuration.utils ---- - -#### add\_config\_to\_env - -```python -def add_config_to_env( - config: BaseConfiguration, sections: Tuple[str, ...] = ()) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/utils.py#L135) - -Writes values in configuration back into environment using the naming convention of EnvironProvider. Will descend recursively if embedded BaseConfiguration instances are found - -#### add\_config\_dict\_to\_env - -```python -def add_config_dict_to_env(dict_: Mapping[str, Any], - sections: Tuple[str, ...] = (), - overwrite_keys: bool = False) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/configuration/utils.py#L142) - -Writes values in dict_ back into environment using the naming convention of EnvironProvider. Applies `sections` if specified. 
Does not overwrite existing keys by default - diff --git a/docs/website/docs/api_reference/common/destination/capabilities.md b/docs/website/docs/api_reference/common/destination/capabilities.md deleted file mode 100644 index 6ae33b541b..0000000000 --- a/docs/website/docs/api_reference/common/destination/capabilities.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -sidebar_label: capabilities -title: common.destination.capabilities ---- - -## DestinationCapabilitiesContext Objects - -```python -@configspec -class DestinationCapabilitiesContext(ContainerInjectableContext) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/capabilities.py#L25) - -Injectable destination capabilities required for many Pipeline stages ie. normalize - diff --git a/docs/website/docs/api_reference/common/destination/reference.md b/docs/website/docs/api_reference/common/destination/reference.md deleted file mode 100644 index aa76e684bd..0000000000 --- a/docs/website/docs/api_reference/common/destination/reference.md +++ /dev/null @@ -1,416 +0,0 @@ ---- -sidebar_label: reference -title: common.destination.reference ---- - -## DestinationClientConfiguration Objects - -```python -@configspec -class DestinationClientConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L43) - -#### destination\_name - -which destination to load data to - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L47) - -Returns a destination fingerprint which is a hash of selected configuration fields. ie. host in case of connection string - -#### \_\_str\_\_ - -```python -def __str__() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L51) - -Return displayable destination location - -## DestinationClientDwhConfiguration Objects - -```python -@configspec -class DestinationClientDwhConfiguration(DestinationClientConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L62) - -Configuration of a destination that supports datasets/schemas - -#### dataset\_name - -dataset name in the destination to load data to, for schemas that are not default schema, it is used as dataset prefix - -#### default\_schema\_name - -name of default schema to be used to name effective dataset to load data to - -#### replace\_strategy - -How to handle replace disposition for this destination, can be classic or staging - -#### normalize\_dataset\_name - -```python -def normalize_dataset_name(schema: Schema) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L72) - -Builds full db dataset (schema) name out of configured dataset name and schema name: {dataset_name}_{schema.name}. The resulting name is normalized. - -If default schema name is None or equals schema.name, the schema suffix is skipped. 
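
The `add_config_dict_to_env` helper documented a few entries above can be sketched as follows. The resulting environment variable name is an assumption based on the EnvironProvider convention of upper-casing keys and joining sections with double underscores.

```python
import os
from dlt.common.configuration.utils import add_config_dict_to_env

# write a plain dict back into the environment; existing keys are kept by default
add_config_dict_to_env({"log_level": "WARNING"}, sections=("runtime",))
print(os.environ.get("RUNTIME__LOG_LEVEL"))  # assumed resulting variable name
```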
- -## DestinationClientStagingConfiguration Objects - -```python -@configspec -class DestinationClientStagingConfiguration(DestinationClientDwhConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L98) - -Configuration of a staging destination, able to store files with desired `layout` at `bucket_url`. - -Also supports datasets and can act as standalone destination. - -## DestinationClientDwhWithStagingConfiguration Objects - -```python -@configspec -class DestinationClientDwhWithStagingConfiguration( - DestinationClientDwhConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L122) - -Configuration of a destination that can take data from staging destination - -#### staging\_config - -configuration of the staging, if present, injected at runtime - -## LoadJob Objects - -```python -class LoadJob() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L141) - -Represents a job that loads a single file - -Each job starts in "running" state and ends in one of terminal states: "retry", "failed" or "completed". -Each job is uniquely identified by a file name. The file is guaranteed to exist in "running" state. In terminal state, the file may not be present. -In "running" state, the loader component periodically gets the state via `status()` method. When terminal state is reached, load job is discarded and not called again. -`exception` method is called to get error information in "failed" and "retry" states. - -The `__init__` method is responsible to put the Job in "running" state. It may raise `LoadClientTerminalException` and `LoadClientTransientException` to -immediately transition job into "failed" or "retry" state respectively. - -#### \_\_init\_\_ - -```python -def __init__(file_name: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L152) - -File name is also a job id (or job id is deterministically derived) so it must be globally unique - -#### state - -```python -@abstractmethod -def state() -> TLoadJobState -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L162) - -Returns current state. Should poll external resource if necessary. 
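
A minimal sketch of the `LoadJob` lifecycle described above: the subclass below reports the file as loaded the moment the job is created. A real job would poll the destination in `state()` and may raise `LoadClientTerminalException` / `LoadClientTransientException` in `__init__` as the docs note; the class name is illustrative.

```python
from dlt.common.destination.reference import LoadJob, TLoadJobState

class NoOpLoadJob(LoadJob):
    """Illustrative job that is complete as soon as it is created."""

    def __init__(self, file_name: str) -> None:
        # the base __init__ records the file name, which also serves as the job id
        super().__init__(file_name)

    def state(self) -> TLoadJobState:
        # a real implementation would poll the external resource here
        return "completed"

    def exception(self) -> str:
        # only meaningful in "failed" or "retry" states
        raise NotImplementedError("this job never fails")
```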
- -#### file\_name - -```python -def file_name() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L166) - -A name of the job file - -#### job\_id - -```python -def job_id() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L170) - -The job id that is derived from the file name - -#### exception - -```python -@abstractmethod -def exception() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L178) - -The exception associated with failed or retry states - -## NewLoadJob Objects - -```python -class NewLoadJob(LoadJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L183) - -Adds a trait that allows to save new job file - -#### new\_file\_path - -```python -@abstractmethod -def new_file_path() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L187) - -Path to a newly created temporary job file. If empty, no followup job should be created - -## FollowupJob Objects - -```python -class FollowupJob() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L192) - -Adds a trait that allows to create a followup job - -## JobClientBase Objects - -```python -class JobClientBase(ABC) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L198) - -#### initialize\_storage - -```python -@abstractmethod -def initialize_storage(truncate_tables: Iterable[str] = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L207) - -Prepares storage to be used ie. creates database schema or file system folder. Truncates requested tables. - -#### is\_storage\_initialized - -```python -@abstractmethod -def is_storage_initialized() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L213) - -Returns if storage is ready to be read/written. - -#### drop\_storage - -```python -@abstractmethod -def drop_storage() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L218) - -Brings storage back into not initialized state. Typically data in storage is destroyed. - -#### update\_stored\_schema - -```python -def update_stored_schema( - only_tables: Iterable[str] = None, - expected_update: TSchemaTables = None) -> Optional[TSchemaTables] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L222) - -Updates storage to the current schema. - -Implementations should not assume that `expected_update` is the exact difference between destination state and the self.schema. This is only the case if -destination has single writer and no other processes modify the schema. - -**Arguments**: - -- `only_tables` _Sequence[str], optional_ - Updates only listed tables. Defaults to None. 
-- `expected_update` _TSchemaTables, optional_ - Update that is expected to be applied to the destination - -**Returns**: - -- `Optional[TSchemaTables]` - Returns an update that was applied at the destination. - -#### start\_file\_load - -```python -@abstractmethod -def start_file_load(table: TTableSchema, file_path: str, - load_id: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L238) - -Creates and starts a load job for a particular `table` with content in `file_path` - -#### restore\_file\_load - -```python -@abstractmethod -def restore_file_load(file_path: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L243) - -Finds and restores already started loading job identified by `file_path` if destination supports it. - -#### create\_table\_chain\_completed\_followup\_jobs - -```python -def create_table_chain_completed_followup_jobs( - table_chain: Sequence[TTableSchema]) -> List[NewLoadJob] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L251) - -Creates a list of followup jobs that should be executed after a table chain is completed - -#### complete\_load - -```python -@abstractmethod -def complete_load(load_id: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L256) - -Marks the load package with `load_id` as completed in the destination. Before such commit is done, the data with `load_id` is invalid. - -## WithStateSync Objects - -```python -class WithStateSync(ABC) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L291) - -#### get\_stored\_schema - -```python -@abstractmethod -def get_stored_schema() -> Optional[StorageSchemaInfo] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L294) - -Retrieves newest schema from destination storage - -#### get\_stored\_state - -```python -@abstractmethod -def get_stored_state(pipeline_name: str) -> Optional[StateInfo] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L303) - -Loads compressed state from destination storage - -## WithStagingDataset Objects - -```python -class WithStagingDataset(ABC) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L308) - -Adds capability to use staging dataset and request it from the loader - -#### get\_stage\_dispositions - -```python -@abstractmethod -def get_stage_dispositions() -> List[TWriteDisposition] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L312) - -Returns a list of write dispositions that require staging dataset - -#### with\_staging\_dataset - -```python -@abstractmethod -def with_staging_dataset() -> ContextManager["JobClientBase"] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L317) - -Executes job client methods on staging dataset - -## DestinationReference Objects - -```python -class 
DestinationReference(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L325) - -#### capabilities - -```python -def capabilities() -> DestinationCapabilitiesContext -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L329) - -Destination capabilities ie. supported loader file formats, identifier name lengths, naming conventions, escape function etc. - -#### client - -```python -def client( - schema: Schema, - initial_config: DestinationClientConfiguration = config.value -) -> "JobClientBase" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L332) - -A job client responsible for starting and resuming load jobs - -#### spec - -```python -def spec() -> Type[DestinationClientConfiguration] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/destination/reference.py#L335) - -A spec of destination configuration that also contains destination credentials - diff --git a/docs/website/docs/api_reference/common/exceptions.md b/docs/website/docs/api_reference/common/exceptions.md deleted file mode 100644 index bc44cbda52..0000000000 --- a/docs/website/docs/api_reference/common/exceptions.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -sidebar_label: exceptions -title: common.exceptions ---- - -## DltException Objects - -```python -class DltException(Exception) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L4) - -#### \_\_reduce\_\_ - -```python -def __reduce__() -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L5) - -Enables exceptions with parametrized constructor to be pickled - -## TerminalException Objects - -```python -class TerminalException(BaseException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L32) - -Marks an exception that cannot be recovered from, should be mixed in into concrete exception class - -## TransientException Objects - -```python -class TransientException(BaseException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L38) - -Marks an exception in operation that can be retried, should be mixed in into concrete exception class - -## TerminalValueError Objects - -```python -class TerminalValueError(ValueError, TerminalException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L44) - -ValueError that is unrecoverable - -## SignalReceivedException Objects - -```python -class SignalReceivedException(KeyboardInterrupt, TerminalException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L50) - -Raises when signal comes. Derives from `BaseException` to not be caught in regular exception handlers. 
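
The terminal/transient markers above are meant to be mixed into concrete exception classes, as `TerminalValueError` already demonstrates. A small sketch with hypothetical exception names:

```python
from dlt.common.exceptions import DltException, TerminalException, TransientException

class ApiQuotaExhausted(DltException, TerminalException):
    """Hypothetical error that should stop processing immediately (no retries)."""

class ApiTemporarilyUnavailable(DltException, TransientException):
    """Hypothetical error that the caller is allowed to retry."""
```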
- -## PipelineException Objects - -```python -class PipelineException(DltException) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L180) - -#### \_\_init\_\_ - -```python -def __init__(pipeline_name: str, msg: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/exceptions.py#L181) - -Base class for all pipeline exceptions. Should not be raised. - diff --git a/docs/website/docs/api_reference/common/git.md b/docs/website/docs/api_reference/common/git.md deleted file mode 100644 index b47d9b980a..0000000000 --- a/docs/website/docs/api_reference/common/git.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -sidebar_label: git -title: common.git ---- - -#### is\_clean\_and\_synced - -```python -def is_clean_and_synced(repo: Repo) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/git.py#L34) - -Checks if repo is clean and synced with origin - -#### force\_clone\_repo - -```python -def force_clone_repo(repo_url: str, - repo_storage: FileStorage, - repo_name: str, - branch: Optional[str] = None, - with_git_command: Optional[str] = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/git.py#L82) - -Deletes the working directory repo_storage.root/repo_name and clones the `repo_url` into it. Will checkout `branch` if provided - -#### get\_fresh\_repo\_files - -```python -def get_fresh_repo_files( - repo_location: str, - working_dir: str = None, - branch: Optional[str] = None, - with_git_command: Optional[str] = None) -> FileStorage -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/git.py#L101) - -Returns a file storage leading to the newest repository files. If `repo_location` is url, file will be checked out into `working_dir/repo_name` - diff --git a/docs/website/docs/api_reference/common/json/__init__.md b/docs/website/docs/api_reference/common/json/__init__.md deleted file mode 100644 index be94898a59..0000000000 --- a/docs/website/docs/api_reference/common/json/__init__.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -sidebar_label: json -title: common.json ---- - -## SupportsJson Objects - -```python -class SupportsJson(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/json/__init__.py#L20) - -Minimum adapter for different json parser implementations - -#### custom\_pua\_remove - -```python -def custom_pua_remove(obj: Any) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/json/__init__.py#L168) - -Removes the PUA data type marker and leaves the correctly serialized type representation. Unmarked values are returned as-is. 
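
For context, `dlt.common` exposes a `json` object implementing the `SupportsJson` protocol above; the example assumes that import path and shows why the adapter exists: it serializes types the stdlib encoder rejects.

```python
from datetime import datetime, timezone
from decimal import Decimal

from dlt.common import json  # assumed export of the SupportsJson implementation

# Decimal and datetime are handled by dlt's adapter, unlike the stdlib json module
payload = {"amount": Decimal("1.23"), "created_at": datetime.now(timezone.utc)}
print(json.dumps(payload))
```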
- diff --git a/docs/website/docs/api_reference/common/jsonpath.md b/docs/website/docs/api_reference/common/jsonpath.md deleted file mode 100644 index d197742adb..0000000000 --- a/docs/website/docs/api_reference/common/jsonpath.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -sidebar_label: jsonpath -title: common.jsonpath ---- - -#### TJsonPath - -Jsonpath compiled or str - -#### TAnyJsonPath - -A single or multiple jsonpaths - -#### delete\_matches - -```python -def delete_matches(paths: TAnyJsonPath, data: DictStrAny) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/jsonpath.py#L25) - -Remove all keys from `data` matching any of given json path(s). -Filtering is done in place. - -#### find\_values - -```python -def find_values(path: TJsonPath, data: DictStrAny) -> List[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/jsonpath.py#L33) - -Return a list of values found under the given json path - -#### resolve\_paths - -```python -def resolve_paths(paths: TAnyJsonPath, data: DictStrAny) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/jsonpath.py#L39) - -Return a list of paths resolved against `data`. The return value is a list of strings. - -**Example**: - - >>> resolve_paths('$.a.items[*].b', {'a': {'items': [{'b': 2}, {'b': 3}]}}) - >>> # ['a.items.[0].b', 'a.items.[1].b'] - diff --git a/docs/website/docs/api_reference/common/libs/pydantic.md b/docs/website/docs/api_reference/common/libs/pydantic.md deleted file mode 100644 index 5075790759..0000000000 --- a/docs/website/docs/api_reference/common/libs/pydantic.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -sidebar_label: pydantic -title: common.libs.pydantic ---- - -#### pydantic\_to\_table\_schema\_columns - -```python -def pydantic_to_table_schema_columns( - model: Union[BaseModel, Type[BaseModel]], - skip_complex_types: bool = False) -> TTableSchemaColumns -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/libs/pydantic.py#L14) - -Convert a pydantic model to a table schema columns dict - -**Arguments**: - -- `model` - The pydantic model to convert. Can be a class or an instance. -- `skip_complex_types` - If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from the result. 
- - -**Returns**: - -- `TTableSchemaColumns` - table schema columns dict - diff --git a/docs/website/docs/api_reference/common/normalizers/configuration.md b/docs/website/docs/api_reference/common/normalizers/configuration.md deleted file mode 100644 index dd09e66d1b..0000000000 --- a/docs/website/docs/api_reference/common/normalizers/configuration.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -sidebar_label: configuration -title: common.normalizers.configuration ---- - -## NormalizersConfiguration Objects - -```python -@configspec -class NormalizersConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/configuration.py#L12) - -#### destination\_capabilities - -injectable - diff --git a/docs/website/docs/api_reference/common/normalizers/json/__init__.md b/docs/website/docs/api_reference/common/normalizers/json/__init__.md deleted file mode 100644 index f5825bec86..0000000000 --- a/docs/website/docs/api_reference/common/normalizers/json/__init__.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -sidebar_label: json -title: common.normalizers.json ---- - -## SupportsDataItemNormalizer Objects - -```python -class SupportsDataItemNormalizer(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/json/__init__.py#L43) - -Expected of modules defining data item normalizer - -#### DataItemNormalizer - -A class with a name DataItemNormalizer deriving from normalizers.json.DataItemNormalizer - -#### wrap\_in\_dict - -```python -def wrap_in_dict(item: Any) -> DictStrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/json/__init__.py#L50) - -Wraps `item` that is not a dictionary into dictionary that can be json normalized - diff --git a/docs/website/docs/api_reference/common/normalizers/json/relational.md b/docs/website/docs/api_reference/common/normalizers/json/relational.md deleted file mode 100644 index 38f75539b3..0000000000 --- a/docs/website/docs/api_reference/common/normalizers/json/relational.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -sidebar_label: relational -title: common.normalizers.json.relational ---- - -#### EMPTY\_KEY\_IDENTIFIER - -replace empty keys with this - -## TDataItemRowChild Objects - -```python -class TDataItemRowChild(TDataItemRow) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/json/relational.py#L26) - -#### value - -for lists of simple types - diff --git a/docs/website/docs/api_reference/common/normalizers/naming/naming.md b/docs/website/docs/api_reference/common/normalizers/naming/naming.md deleted file mode 100644 index 213ead80a2..0000000000 --- a/docs/website/docs/api_reference/common/normalizers/naming/naming.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -sidebar_label: naming -title: common.normalizers.naming.naming ---- - -## NamingConvention Objects - -```python -class NamingConvention(ABC) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L9) - -#### normalize\_identifier - -```python -@abstractmethod -def normalize_identifier(identifier: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L18) - -Normalizes and shortens the identifier according to naming convention in this 
function code - -#### normalize\_table\_identifier - -```python -def normalize_table_identifier(identifier: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L27) - -Normalizes and shortens identifier that will function as a dataset, table or schema name, defaults to `normalize_identifier` - -#### make\_path - -```python -@abstractmethod -def make_path(*identifiers: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L32) - -Builds path out of identifiers. Identifiers are neither normalized nor shortened - -#### break\_path - -```python -@abstractmethod -def break_path(path: str) -> Sequence[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L37) - -Breaks path into sequence of identifiers - -#### normalize\_path - -```python -def normalize_path(path: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L41) - -Breaks path into identifiers, normalizes components, reconstitutes and shortens the path - -#### normalize\_tables\_path - -```python -def normalize_tables_path(path: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L47) - -Breaks path of table identifiers, normalizes components, reconstitutes and shortens the path - -#### shorten\_fragments - -```python -def shorten_fragments(*normalized_idents: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L53) - -Reconstitutes and shortens the path of normalized identifiers - -#### shorten\_identifier - -```python -@staticmethod -@lru_cache(maxsize=None) -def shorten_identifier(normalized_ident: str, - identifier: str, - max_length: int, - collision_prob: float = _DEFAULT_COLLISION_PROB) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L62) - -Shortens the `name` to `max_length` and adds a tag to it to make it unique. 
Tag may be placed in the middle or at the end - -## SupportsNamingConvention Objects - -```python -class SupportsNamingConvention(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/naming/naming.py#L90) - -Expected of modules defining naming convention - -#### NamingConvention - -A class with a name NamingConvention deriving from normalizers.naming.NamingConvention - diff --git a/docs/website/docs/api_reference/common/normalizers/typing.md b/docs/website/docs/api_reference/common/normalizers/typing.md deleted file mode 100644 index a0e659782b..0000000000 --- a/docs/website/docs/api_reference/common/normalizers/typing.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -sidebar_label: typing -title: common.normalizers.typing ---- - -## TJSONNormalizer Objects - -```python -class TJSONNormalizer(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/typing.py#L6) - -#### config - -config is a free form and is consumed by `module` - diff --git a/docs/website/docs/api_reference/common/normalizers/utils.md b/docs/website/docs/api_reference/common/normalizers/utils.md deleted file mode 100644 index 7f5dc5a239..0000000000 --- a/docs/website/docs/api_reference/common/normalizers/utils.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -sidebar_label: utils -title: common.normalizers.utils ---- - -#### explicit\_normalizers - -```python -@with_config(spec=NormalizersConfiguration) -def explicit_normalizers( - naming: str = dlt.config.value, - json_normalizer: TJSONNormalizer = dlt.config.value -) -> TNormalizersConfig -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/utils.py#L17) - -Gets explicitly configured normalizers - via config or destination caps. May return None as naming or normalizer - -#### import\_normalizers - -```python -@with_config -def import_normalizers( - normalizers_config: TNormalizersConfig, - destination_capabilities: DestinationCapabilitiesContext = None -) -> Tuple[TNormalizersConfig, NamingConvention, - Type[DataItemNormalizer[Any]]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/normalizers/utils.py#L26) - -Imports the normalizers specified in `normalizers_config` or taken from defaults. Returns the updated config and imported modules. - -`destination_capabilities` are used to get max length of the identifier. - diff --git a/docs/website/docs/api_reference/common/pipeline.md b/docs/website/docs/api_reference/common/pipeline.md deleted file mode 100644 index 65dfe3fedf..0000000000 --- a/docs/website/docs/api_reference/common/pipeline.md +++ /dev/null @@ -1,335 +0,0 @@ ---- -sidebar_label: pipeline -title: common.pipeline ---- - -## ExtractInfo Objects - -```python -class ExtractInfo(NamedTuple) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L32) - -A tuple holding information on extracted data items. Returned by pipeline `extract` method. - -## NormalizeInfo Objects - -```python -class NormalizeInfo(NamedTuple) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L47) - -A tuple holding information on normalized data items. Returned by pipeline `normalize` method. 
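
A sketch of the `NamingConvention` interface documented above, using the snake_case convention; the module path `dlt.common.normalizers.naming.snake_case` and the exact outputs are assumptions.

```python
from dlt.common.normalizers.naming.snake_case import NamingConvention

naming = NamingConvention(max_length=63)
print(naming.normalize_identifier("Customer Name"))  # e.g. "customer_name"
# paths are broken into components, each normalized, then re-joined and shortened
print(naming.normalize_path("Customer Name__Order Items"))
```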
- -#### asdict - -```python -def asdict() -> DictStrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L52) - -A dictionary representation of NormalizeInfo that can be loaded with `dlt` - -## LoadInfo Objects - -```python -class LoadInfo(NamedTuple) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L72) - -A tuple holding the information on recently loaded packages. Returned by pipeline `run` and `load` methods - -#### loads\_ids - -ids of the loaded packages - -#### load\_packages - -Information on loaded packages - -#### asdict - -```python -def asdict() -> DictStrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L88) - -A dictionary representation of LoadInfo that can be loaded with `dlt` - -#### has\_failed\_jobs - -```python -@property -def has_failed_jobs() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L125) - -Returns True if any of the load packages has a failed job. - -#### raise\_on\_failed\_jobs - -```python -def raise_on_failed_jobs() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L132) - -Raises `DestinationHasFailedJobs` exception if any of the load packages has a failed job. - -## TPipelineLocalState Objects - -```python -class TPipelineLocalState(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L142) - -#### first\_run - -Indicates a first run of the pipeline, where run ends with successful loading of data - -## TPipelineState Objects - -```python -class TPipelineState(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L149) - -Schema for a pipeline state that is stored within the pipeline working directory - -#### default\_schema\_name - -Name of the first schema added to the pipeline to which all the resources without schemas will be added - -#### schema\_names - -All the schemas present within the pipeline working directory - -## SupportsPipeline Objects - -```python -class SupportsPipeline(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L171) - -A protocol with core pipeline operations that lets high level abstractions ie. sources to access pipeline methods and properties - -#### pipeline\_name - -Name of the pipeline - -#### default\_schema\_name - -Name of the default schema - -#### destination - -The destination reference which is ModuleType. 
`destination.__name__` returns the name string - -#### dataset\_name - -Name of the dataset to which pipeline will be loaded to - -#### runtime\_config - -A configuration of runtime options like logging level and format and various tracing options - -#### working\_dir - -A working directory of the pipeline - -#### pipeline\_salt - -A configurable pipeline secret to be used as a salt or a seed for encryption key - -#### first\_run - -Indicates a first run of the pipeline, where run ends with successful loading of the data - -#### state - -```python -@property -def state() -> TPipelineState -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L191) - -Returns dictionary with pipeline state - -#### set\_local\_state\_val - -```python -def set_local_state_val(key: str, value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L194) - -Sets value in local state. Local state is not synchronized with destination. - -#### get\_local\_state\_val - -```python -def get_local_state_val(key: str) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L197) - -Gets value from local state. Local state is not synchronized with destination. - -## PipelineContext Objects - -```python -@configspec -class PipelineContext(ContainerInjectableContext) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L240) - -#### pipeline - -```python -def pipeline() -> SupportsPipeline -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L246) - -Creates or returns exiting pipeline - -#### \_\_init\_\_ - -```python -def __init__(deferred_pipeline: Callable[..., SupportsPipeline]) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L269) - -Initialize the context with a function returning the Pipeline object to allow creation on first use - -#### pipeline\_state - -```python -def pipeline_state( - container: Container, - initial_default: TPipelineState = None) -> Tuple[TPipelineState, bool] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L285) - -Gets value of the state from context or active pipeline, if none found returns `initial_default` - -Injected state is called "writable": it is injected by the `Pipeline` class and all the changes will be persisted. -The state coming from pipeline context or `initial_default` is called "read only" and all the changes to it will be discarded - -Returns tuple (state, writable) - -#### source\_state - -```python -def source_state() -> DictStrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L324) - -Returns a dictionary with the source-scoped state. Source-scoped state may be shared across the resources of a particular source. Please avoid using source scoped state. Check -the `resource_state` function for resource-scoped state that is visible within particular resource. Dlt state is preserved across pipeline runs and may be used to implement incremental loads. 
- -### Summary -The source state is a python dictionary-like object that is available within the `@dlt.source` and `@dlt.resource` decorated functions and may be read and written to. -The data within the state is loaded into destination together with any other extracted data and made automatically available to the source/resource extractor functions when they are run next time. -When using the state: -* The source state is scoped to a particular source and will be stored under the source name in the pipeline state -* It is possible to share state across many sources if they share a schema with the same name -* Any JSON-serializable values can be written and the read from the state. `dlt` dumps and restores instances of Python bytes, DateTime, Date and Decimal types. -* The state available in the source decorated function is read only and any changes will be discarded. -* The state available in the resource decorated function is writable and written values will be available on the next pipeline run - -#### resource\_state - -```python -def resource_state(resource_name: str = None, - source_state_: Optional[DictStrAny] = None) -> DictStrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L371) - -Returns a dictionary with the resource-scoped state. Resource-scoped state is visible only to resource requesting the access. Dlt state is preserved across pipeline runs and may be used to implement incremental loads. - -Note that this function accepts the resource name as optional argument. There are rare cases when `dlt` is not able to resolve resource name due to requesting function -working in different thread than the main. You'll need to pass the name explicitly when you request resource_state from async functions or functions decorated with @defer. - -### Summary -The resource state is a python dictionary-like object that is available within the `@dlt.resource` decorated functions and may be read and written to. -The data within the state is loaded into destination together with any other extracted data and made automatically available to the source/resource extractor functions when they are run next time. -When using the state: -* The resource state is scoped to a particular resource requesting it. -* Any JSON-serializable values can be written and the read from the state. `dlt` dumps and restores instances of Python bytes, DateTime, Date and Decimal types. -* The state available in the resource decorated function is writable and written values will be available on the next pipeline run - -### Example -The most typical use case for the state is to implement incremental load. ->>> @dlt.resource(write_disposition="append") ->>> def players_games(chess_url, players, start_month=None, end_month=None): ->>> checked_archives = dlt.current.resource_state().setdefault("archives", []) ->>> archives = players_archives(chess_url, players) ->>> for url in archives: ->>> if url in checked_archives: ->>> print(f"skipping archive {url}") ->>> continue ->>> else: ->>> print(f"getting archive {url}") ->>> checked_archives.append(url) ->>> # get the filtered archive ->>> r = requests.get(url) ->>> r.raise_for_status() ->>> yield r.json().get("games", []) - -Here we store all the urls with game archives in the state and we skip loading them on next run. The archives are immutable. The state will grow with the coming months (and more players). -Up to few thousand archives we should be good though. 
- -**Arguments**: - -- `resource_name` _str, optional_ - forces to use state for a resource with this name. Defaults to None. -- `source_state_` _Optional[DictStrAny], optional_ - Alternative source state. Defaults to None. - - -**Raises**: - -- `ResourceNameNotAvailable` - Raise if used outside of resource context or from a different thread than main - - -**Returns**: - -- `DictStrAny` - State dictionary - -#### get\_dlt\_pipelines\_dir - -```python -def get_dlt_pipelines_dir() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L444) - -Gets default directory where pipelines' data will be stored -1. in user home directory ~/.dlt/pipelines/ -2. if current user is root in /var/dlt/pipelines -3. if current user does not have a home directory in /tmp/dlt/pipelines - -#### get\_dlt\_repos\_dir - -```python -def get_dlt_repos_dir() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/pipeline.py#L453) - -Gets default directory where command repositories will be stored - diff --git a/docs/website/docs/api_reference/common/reflection/utils.md b/docs/website/docs/api_reference/common/reflection/utils.md deleted file mode 100644 index a355c407c8..0000000000 --- a/docs/website/docs/api_reference/common/reflection/utils.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -sidebar_label: utils -title: common.reflection.utils ---- - -#### get\_literal\_defaults - -```python -def get_literal_defaults(node: ast.FunctionDef) -> Dict[str, str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/reflection/utils.py#L9) - -Extract defaults from function definition node literally, as pieces of source code - -#### get\_func\_def\_node - -```python -def get_func_def_node(f: AnyFun) -> ast.FunctionDef -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/reflection/utils.py#L33) - -Finds the function definition node for function f by parsing the source code of the f's module - -#### find\_outer\_func\_def - -```python -def find_outer_func_def(node: ast.AST) -> Optional[ast.FunctionDef] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/reflection/utils.py#L49) - -Finds the outer function definition node in which the 'node' is contained. Returns None if 'node' is toplevel. - -#### set\_ast\_parents - -```python -def set_ast_parents(tree: ast.AST) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/reflection/utils.py#L60) - -Walks AST tree and sets the `parent` attr in each node to the node's parent. Toplevel nodes (parent is a `tree`) have the `parent` attr set to None. 
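
As an illustration of the walk described above, here is a generic version of the same idea using the standard library `ast` module. This is a sketch, not the dlt implementation:

```python
import ast

def set_parents(tree: ast.AST) -> None:
    # walk every node and point each child back at its parent;
    # direct children of `tree` (top-level nodes) get None, as described above
    for node in ast.walk(tree):
        for child in ast.iter_child_nodes(node):
            child.parent = None if node is tree else node  # type: ignore[attr-defined]

tree = ast.parse("def f(x=1):\n    return x")
set_parents(tree)
func_def = tree.body[0]
assert func_def.parent is None          # top-level node
assert func_def.body[0].parent is func_def  # `return x` points at its function
```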
- -#### creates\_func\_def\_name\_node - -```python -def creates_func_def_name_node(func_def: ast.FunctionDef, - source_lines: Sequence[str]) -> ast.Name -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/reflection/utils.py#L67) - -Recreate function name as a ast.Name with known source code location - -#### rewrite\_python\_script - -```python -def rewrite_python_script( - source_script_lines: List[str], - transformed_nodes: List[Tuple[ast.AST, ast.AST]]) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/reflection/utils.py#L76) - -Replaces all the nodes present in `transformed_nodes` in the `script_lines`. The `transformed_nodes` is a tuple where the first element -is must be a node with full location information created out of `script_lines` - diff --git a/docs/website/docs/api_reference/common/runners/configuration.md b/docs/website/docs/api_reference/common/runners/configuration.md deleted file mode 100644 index 18715d6f1c..0000000000 --- a/docs/website/docs/api_reference/common/runners/configuration.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -sidebar_label: configuration -title: common.runners.configuration ---- - -## PoolRunnerConfiguration Objects - -```python -@configspec -class PoolRunnerConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/configuration.py#L10) - -#### pool\_type - -type of pool to run, must be set in derived configs - -#### workers - -how many threads/processes in the pool - -#### run\_sleep - -how long to sleep between runs with workload, seconds - diff --git a/docs/website/docs/api_reference/common/runners/runnable.md b/docs/website/docs/api_reference/common/runners/runnable.md deleted file mode 100644 index c47ff547f3..0000000000 --- a/docs/website/docs/api_reference/common/runners/runnable.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -sidebar_label: runnable -title: common.runners.runnable ---- - -## Runnable Objects - -```python -class Runnable(ABC, Generic[TPool]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/runnable.py#L13) - -#### \_\_new\_\_ - -```python -def __new__(cls: Type["Runnable[TPool]"], *args: Any, - **kwargs: Any) -> "Runnable[TPool]" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/runnable.py#L22) - -Registers Runnable instance as running for a time when context is active. -Used with `~workermethod` decorator to pass a class instance to decorator function that must be static thus avoiding pickling such instance. - -**Arguments**: - -- `cls` _Type["Runnable"]_ - type of class to be instantiated - - -**Returns**: - -- `Runnable` - new class instance - -#### workermethod - -```python -def workermethod(f: TFun) -> TFun -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/runnable.py#L41) - -Decorator to be used on static method of Runnable to make it behave like instance method. -Expects that first parameter to decorated function is an instance `id` of Runnable that gets translated into Runnable instance. -Such instance is then passed as `self` to decorated function. 
- -**Arguments**: - -- `f` _TFun_ - worker function to be decorated - - -**Returns**: - -- `TFun` - wrapped worker function - diff --git a/docs/website/docs/api_reference/common/runners/stdout.md b/docs/website/docs/api_reference/common/runners/stdout.md deleted file mode 100644 index 226c86aefe..0000000000 --- a/docs/website/docs/api_reference/common/runners/stdout.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -sidebar_label: stdout -title: common.runners.stdout ---- - -#### exec\_to\_stdout - -```python -@contextmanager -def exec_to_stdout(f: AnyFun) -> Iterator[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/stdout.py#L13) - -Executes parameter-less function f and encodes the pickled return value to stdout. In case of exceptions, encodes the pickled exceptions to stderr - -#### iter\_stdout\_with\_result - -```python -def iter_stdout_with_result(venv: Venv, command: str, *script_args: - Any) -> Generator[str, None, Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/stdout.py#L60) - -Yields stdout lines coming from remote process and returns the last result decoded with decode_obj. In case of exit code != 0 if exception is decoded -it will be raised, otherwise CalledProcessError is raised - diff --git a/docs/website/docs/api_reference/common/runners/synth_pickle.md b/docs/website/docs/api_reference/common/runners/synth_pickle.md deleted file mode 100644 index d3b4391563..0000000000 --- a/docs/website/docs/api_reference/common/runners/synth_pickle.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -sidebar_label: synth_pickle -title: common.runners.synth_pickle ---- - -## SynthesizingUnpickler Objects - -```python -class SynthesizingUnpickler(pickle.Unpickler) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/synth_pickle.py#L16) - -Unpickler that synthesizes missing types instead of raising - diff --git a/docs/website/docs/api_reference/common/runners/venv.md b/docs/website/docs/api_reference/common/runners/venv.md deleted file mode 100644 index 6fdb86f83c..0000000000 --- a/docs/website/docs/api_reference/common/runners/venv.md +++ /dev/null @@ -1,120 +0,0 @@ ---- -sidebar_label: venv -title: common.runners.venv ---- - -## Venv Objects - -```python -class Venv() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L22) - -Creates and wraps the Python Virtual Environment to allow for code execution - -#### \_\_init\_\_ - -```python -def __init__(context: types.SimpleNamespace, current: bool = False) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L25) - -Please use `Venv.create`, `Venv.restore` or `Venv.restore_current` methods to create Venv instance - -#### create - -```python -@classmethod -def create(cls, venv_dir: str, dependencies: List[str] = None) -> "Venv" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L31) - -Creates a new Virtual Environment at the location specified in `venv_dir` and installs `dependencies` via pip. Deletes partially created environment on failure. 
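
A short usage sketch based on the signatures documented on this page; the environment directory and the dependency list are placeholders:

```python
from dlt.common.runners.venv import Venv

# `create` installs the packages with pip and removes a partially created
# environment on failure (see above); the path and package are placeholders
venv = Venv.create("/tmp/dlt_demo_venv", dependencies=["requests"])
try:
    # stdout of the subprocess is returned as a string
    print(venv.run_module("pip", "list"))
finally:
    venv.delete_environment()
```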
- -#### restore - -```python -@classmethod -def restore(cls, venv_dir: str, current: bool = False) -> "Venv" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L45) - -Restores Virtual Environment at `venv_dir` - -#### restore\_current - -```python -@classmethod -def restore_current(cls) -> "Venv" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L56) - -Wraps the current Python environment. - -#### delete\_environment - -```python -def delete_environment() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L75) - -Deletes the Virtual Environment. - -#### run\_command - -```python -def run_command(entry_point: str, *script_args: Any) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L88) - -Runs any `command` with specified `script_args`. Current `os.environ` and cwd is passed to executed process - -#### run\_script - -```python -def run_script(script_path: str, *script_args: Any) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L95) - -Runs a python `script` source with specified `script_args`. Current `os.environ` and cwd is passed to executed process - -#### run\_module - -```python -def run_module(module: str, *module_args: Any) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L107) - -Runs a python `module` with specified `module_args`. Current `os.environ` and cwd is passed to executed process - -#### is\_virtual\_env - -```python -@staticmethod -def is_virtual_env() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L125) - -Checks if we are running in virtual environment - -#### is\_venv\_activated - -```python -@staticmethod -def is_venv_activated() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runners/venv.py#L130) - -Checks if virtual environment is activated in the shell - diff --git a/docs/website/docs/api_reference/common/runtime/collector.md b/docs/website/docs/api_reference/common/runtime/collector.md deleted file mode 100644 index 24d120dfc0..0000000000 --- a/docs/website/docs/api_reference/common/runtime/collector.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -sidebar_label: collector -title: common.runtime.collector ---- - -## Collector Objects - -```python -class Collector(ABC) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L22) - -#### update - -```python -@abstractmethod -def update(name: str, - inc: int = 1, - total: int = None, - message: str = None, - label: str = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L27) - -Creates or updates a counter - -This function updates a counter `name` with a value `inc`. If counter does not exist, it is created with optional total value of `total`. -Depending on implementation `label` may be used to create nested counters and message to display additional information associated with a counter. 
- -**Arguments**: - -- `name` _str_ - An unique name of a counter, displayable. -- `inc` _int, optional_ - Increase amount. Defaults to 1. -- `total` _int, optional_ - Maximum value of a counter. Defaults to None which means unbound counter. -- `message` _str, optional_ - Additional message attached to a counter. Defaults to None. -- `label` _str, optional_ - Creates nested counter for counter `name`. Defaults to None. - -#### \_\_call\_\_ - -```python -def __call__(step: str) -> TCollector -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L52) - -Syntactic sugar for nicer context managers - -## NullCollector Objects - -```python -class NullCollector(Collector) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L65) - -A default counter that does not count anything. - -## DictCollector Objects - -```python -class DictCollector(Collector) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L78) - -A collector that just counts - -## LogCollector Objects - -```python -class LogCollector(Collector) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L95) - -A Collector that shows progress by writing to a Python logger or a console - -#### \_\_init\_\_ - -```python -def __init__(log_period: float = 1.0, - logger: Union[logging.Logger, TextIO] = sys.stdout, - log_level: int = logging.INFO, - dump_system_stats: bool = True) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L106) - -Collector writing to a `logger` every `log_period` seconds. The logger can be a Python logger instance, text stream, or None that will attach `dlt` logger - -**Arguments**: - -- `log_period` _float, optional_ - Time period in seconds between log updates. Defaults to 1.0. -- `logger` _logging.Logger | TextIO, optional_ - Logger or text stream to write log messages to. Defaults to stdio. -- `log_level` _str, optional_ - Log level for the logger. Defaults to INFO level -- `dump_system_stats` _bool, optional_ - Log memory and cpu usage. Defaults to True - -## TqdmCollector Objects - -```python -class TqdmCollector(Collector) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L212) - -A Collector that shows progress with `tqdm` progress bars - -#### \_\_init\_\_ - -```python -def __init__(single_bar: bool = False, **tqdm_kwargs: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L215) - -A Collector that uses tqdm to display counters as progress bars. Set `single_bar` to True to show just the main progress bar. 
Pass any config to tqdm in kwargs - -## AliveCollector Objects - -```python -class AliveCollector(Collector) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L256) - -A Collector that shows progress with `alive-progress` progress bars - -#### \_\_init\_\_ - -```python -def __init__(single_bar: bool = True, **alive_kwargs: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L259) - -Collector that uses alive_progress to display counters as progress bars. Set `single_bar` to True to show just the main progress bar. Pass any config to alive_progress in kwargs - -## EnlightenCollector Objects - -```python -class EnlightenCollector(Collector) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L303) - -A Collector that shows progress with `enlighten` progress and status bars that also allow for logging. - -#### \_\_init\_\_ - -```python -def __init__(single_bar: bool = False, **enlighten_kwargs: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/collector.py#L310) - -Collector that uses Enlighten to display counters as progress bars. Set `single_bar` to True to show just the main progress bar. Pass any config to Enlighten in kwargs - diff --git a/docs/website/docs/api_reference/common/runtime/exec_info.md b/docs/website/docs/api_reference/common/runtime/exec_info.md deleted file mode 100644 index a02ceed4ab..0000000000 --- a/docs/website/docs/api_reference/common/runtime/exec_info.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -sidebar_label: exec_info -title: common.runtime.exec_info ---- - -#### exec\_info\_names - -```python -def exec_info_names() -> List[TExecInfoNames] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L31) - -Get names of execution environments - -#### dlt\_version\_info - -```python -def dlt_version_info(pipeline_name: str) -> StrStr -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L108) - -Gets dlt version info including commit and image version available in docker - -#### kube\_pod\_info - -```python -def kube_pod_info() -> StrStr -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L117) - -Extracts information on pod name, namespace and node name if running on Kubernetes - -#### github\_info - -```python -def github_info() -> StrStr -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L122) - -Extracts github info - -#### in\_continuous\_integration - -```python -def in_continuous_integration() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L131) - -Returns `True` if currently running inside a continuous integration context. - -#### is\_docker - -```python -def is_docker() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L136) - -Guess if we are running in docker environment. 
- -https://stackoverflow.com/questions/20010199/how-to-determine-if-a-process-runs-inside-lxc-docker - -**Returns**: - - `True` if we are running inside docker, `False` otherwise. - -#### is\_aws\_lambda - -```python -def is_aws_lambda() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L159) - -Return True if the process is running in the serverless platform AWS Lambda - -#### is\_gcp\_cloud\_function - -```python -def is_gcp_cloud_function() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/exec_info.py#L164) - -Return True if the process is running in the serverless platform GCP Cloud Functions - diff --git a/docs/website/docs/api_reference/common/runtime/logger.md b/docs/website/docs/api_reference/common/runtime/logger.md deleted file mode 100644 index 87464087bb..0000000000 --- a/docs/website/docs/api_reference/common/runtime/logger.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -sidebar_label: logger -title: common.runtime.logger ---- - -#### \_\_getattr\_\_ - -```python -def __getattr__(name: str) -> LogMethod -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/logger.py#L22) - -Forwards log method calls (debug, info, error etc.) to LOGGER - -#### metrics - -```python -def metrics(name: str, extra: StrAny, stacklevel: int = 1) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/logger.py#L35) - -Forwards metrics call to LOGGER - diff --git a/docs/website/docs/api_reference/common/runtime/segment.md b/docs/website/docs/api_reference/common/runtime/segment.md deleted file mode 100644 index fee01e02ba..0000000000 --- a/docs/website/docs/api_reference/common/runtime/segment.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -sidebar_label: segment -title: common.runtime.segment ---- - -dltHub telemetry using Segment - -#### track - -```python -def track(event_category: TEventCategory, event_name: str, - properties: DictStrAny) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/segment.py#L54) - -Tracks a telemetry event. - -The segment event name will be created as "{event_category}_{event_name} - -**Arguments**: - -- `event_category` - Category of the event: pipeline or cli -- `event_name` - Name of the event. -- `properties` - Dictionary containing the event's properties. - -#### before\_send - -```python -def before_send(event: DictStrAny) -> Optional[DictStrAny] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/segment.py#L83) - -Called before sending event. 
Does nothing, patch this function in the module for custom behavior - -#### get\_anonymous\_id - -```python -def get_anonymous_id() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/segment.py#L114) - -Creates or reads a anonymous user id - diff --git a/docs/website/docs/api_reference/common/runtime/sentry.md b/docs/website/docs/api_reference/common/runtime/sentry.md deleted file mode 100644 index 3a94eca881..0000000000 --- a/docs/website/docs/api_reference/common/runtime/sentry.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -sidebar_label: sentry -title: common.runtime.sentry ---- - -#### before\_send - -```python -def before_send(event: DictStrAny, - _unused_hint: Optional[StrAny] = None) -> Optional[DictStrAny] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/sentry.py#L55) - -Called by sentry before sending event. Does nothing, patch this function in the module for custom behavior - diff --git a/docs/website/docs/api_reference/common/runtime/signals.md b/docs/website/docs/api_reference/common/runtime/signals.md deleted file mode 100644 index 18a9ed5808..0000000000 --- a/docs/website/docs/api_reference/common/runtime/signals.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -sidebar_label: signals -title: common.runtime.signals ---- - -#### sleep - -```python -def sleep(sleep_seconds: float) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/signals.py#L39) - -A signal-aware version of sleep function. Will raise SignalReceivedException if signal was received during sleep period. - -#### delayed\_signals - -```python -@contextmanager -def delayed_signals() -> Iterator[None] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/signals.py#L50) - -Will delay signalling until `raise_if_signalled` is used or signalled `sleep` - diff --git a/docs/website/docs/api_reference/common/runtime/slack.md b/docs/website/docs/api_reference/common/runtime/slack.md deleted file mode 100644 index b1c4ed2c0a..0000000000 --- a/docs/website/docs/api_reference/common/runtime/slack.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -sidebar_label: slack -title: common.runtime.slack ---- - -#### send\_slack\_message - -```python -def send_slack_message(incoming_hook: str, - message: str, - is_markdown: bool = True) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/slack.py#L5) - -Sends a `message` to Slack `incoming_hook`, by default formatted as markdown. 
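
A usage sketch; the webhook URL is a placeholder for an incoming webhook created in your Slack workspace:

```python
from dlt.common.runtime.slack import send_slack_message

# hypothetical incoming webhook URL - create one in your Slack workspace
hook = "https://hooks.slack.com/services/T000/B000/XXXX"
send_slack_message(hook, "*pipeline finished* :tada:", is_markdown=True)
```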
- diff --git a/docs/website/docs/api_reference/common/runtime/telemetry.md b/docs/website/docs/api_reference/common/runtime/telemetry.md deleted file mode 100644 index 0cca72ee38..0000000000 --- a/docs/website/docs/api_reference/common/runtime/telemetry.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -sidebar_label: telemetry -title: common.runtime.telemetry ---- - -#### with\_telemetry - -```python -def with_telemetry(category: TEventCategory, command: str, track_before: bool, - *args: str) -> Callable[[TFun], TFun] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/runtime/telemetry.py#L52) - -Adds telemetry to f: TFun and add optional f *args values to `properties` of telemetry event - diff --git a/docs/website/docs/api_reference/common/schema/schema.md b/docs/website/docs/api_reference/common/schema/schema.md deleted file mode 100644 index 126c714a90..0000000000 --- a/docs/website/docs/api_reference/common/schema/schema.md +++ /dev/null @@ -1,212 +0,0 @@ ---- -sidebar_label: schema -title: common.schema.schema ---- - -## Schema Objects - -```python -class Schema() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L20) - -#### naming - -Naming convention used by the schema to normalize identifiers - -#### data\_item\_normalizer - -Data item normalizer used by the schema to create tables - -#### version\_table\_name - -Normalized name of the version table - -#### loads\_table\_name - -Normalized name of the loads table - -#### state\_table\_name - -Normalized name of the dlt state table - -#### coerce\_row - -```python -def coerce_row(table_name: str, parent_table: str, - row: StrAny) -> Tuple[DictStrAny, TPartialTableSchema] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L154) - -Fits values of fields present in `row` into a schema of `table_name`. Will coerce values into data types and infer new tables and column schemas. - -Method expects that field names in row are already normalized. -* if table schema for `table_name` does not exist, new table is created -* if column schema for a field in `row` does not exist, it is inferred from data -* if incomplete column schema (no data type) exists, column is inferred from data and existing hints are applied -* fields with None value are removed - -Returns tuple with row with coerced values and a partial table containing just the newly added columns or None if no changes were detected - -#### bump\_version - -```python -def bump_version() -> Tuple[int, str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L209) - -Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. - -Should not be used in production code. The method ``to_dict`` will generate TStoredSchema with correct value, only once before persisting schema to storage. 
- -**Returns**: - - Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple - -#### normalize\_table\_identifiers - -```python -def normalize_table_identifiers(table: TTableSchema) -> TTableSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L253) - -Normalizes all table and column names in `table` schema according to current schema naming convention and returns -new normalized TTableSchema instance. - -Naming convention like snake_case may produce name clashes with the column names. Clashing column schemas are merged -where the column that is defined later in the dictionary overrides earlier column. - -Note that resource name is not normalized. - -#### get\_new\_table\_columns - -```python -def get_new_table_columns( - table_name: str, - exiting_columns: TTableSchemaColumns, - include_incomplete: bool = False) -> List[TColumnSchema] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L282) - -Gets new columns to be added to `exiting_columns` to bring them up to date with `table_name` schema. Optionally includes incomplete columns (without data type) - -#### get\_table\_columns - -```python -def get_table_columns(table_name: str, - include_incomplete: bool = False) -> TTableSchemaColumns -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L294) - -Gets columns of `table_name`. Optionally includes incomplete columns - -#### data\_tables - -```python -def data_tables(include_incomplete: bool = False) -> List[TTableSchema] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L301) - -Gets list of all tables, that hold the loaded data. Excludes dlt tables. Excludes incomplete tables (ie. without columns) - -#### dlt\_tables - -```python -def dlt_tables() -> List[TTableSchema] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L305) - -Gets dlt tables - -#### version - -```python -@property -def version() -> int -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L313) - -Version of the schema content that takes into account changes from the time of schema loading/creation. -The stored version is increased by one if content was modified - -**Returns**: - -- `int` - Current schema version - -#### stored\_version - -```python -@property -def stored_version() -> int -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L323) - -Version of the schema content form the time of schema loading/creation. - -**Returns**: - -- `int` - Stored schema version - -#### version\_hash - -```python -@property -def version_hash() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L332) - -Current version hash of the schema, recomputed from the actual content - -#### stored\_version\_hash - -```python -@property -def stored_version_hash() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L337) - -Version hash of the schema content form the time of schema loading/creation. 
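
A minimal sketch of the version properties above. Constructing `Schema` with just a name is an assumption that is not shown in this excerpt:

```python
from dlt.common.schema.schema import Schema

schema = Schema("demo")  # assumption: name-only constructor

# `version_hash` is recomputed from the current content, while the stored values
# only advance when `bump_version` (or, in production code, `to_dict`) runs
print(schema.version, schema.version_hash)
print(schema.stored_version, schema.stored_version_hash)
schema.bump_version()
```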
- -#### tables - -```python -@property -def tables() -> TSchemaTables -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L346) - -Dictionary of schema tables - -#### clone - -```python -def clone(update_normalizers: bool = False) -> "Schema" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L362) - -Make a deep copy of the schema, possibly updating normalizers and identifiers in the schema if `update_normalizers` is True - -#### update\_normalizers - -```python -def update_normalizers() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/schema.py#L371) - -Looks for new normalizer configuration or for destination capabilities context and updates all identifiers in the schema - diff --git a/docs/website/docs/api_reference/common/schema/typing.md b/docs/website/docs/api_reference/common/schema/typing.md deleted file mode 100644 index e90f4f315e..0000000000 --- a/docs/website/docs/api_reference/common/schema/typing.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -sidebar_label: typing -title: common.schema.typing ---- - -#### TColumnProp - -Known properties and hints of the column - -#### TColumnHint - -Known hints of a column used to declare hint regexes. - -#### TColumnNames - -A string representing a column name or a list of - -## TColumnSchemaBase Objects - -```python -class TColumnSchemaBase(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/typing.py#L37) - -TypedDict that defines basic properties of a column: name, data type and nullable - -## TColumnSchema Objects - -```python -class TColumnSchema(TColumnSchemaBase) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/typing.py#L44) - -TypedDict that defines additional column hints - -#### TTableSchemaColumns - -A mapping from column name to column schema, typically part of a table schema - -## TTableSchema Objects - -```python -class TTableSchema(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/typing.py#L74) - -TypedDict that defines properties of a table - -## TStoredSchema Objects - -```python -class TStoredSchema(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/typing.py#L100) - -TypeDict defining the schema representation in storage - diff --git a/docs/website/docs/api_reference/common/schema/utils.md b/docs/website/docs/api_reference/common/schema/utils.md deleted file mode 100644 index 25fbc46aec..0000000000 --- a/docs/website/docs/api_reference/common/schema/utils.md +++ /dev/null @@ -1,213 +0,0 @@ ---- -sidebar_label: utils -title: common.schema.utils ---- - -#### is\_valid\_schema\_name - -```python -def is_valid_schema_name(name: str) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L31) - -Schema name must be a valid python identifier and have max len of 64 - -#### normalize\_schema\_name - -```python -def normalize_schema_name(name: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L36) - -Normalizes schema name by using snake 
case naming convention. The maximum length is 64 characters - -#### apply\_defaults - -```python -def apply_defaults(stored_schema: TStoredSchema) -> TStoredSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L42) - -Applies default hint values to `stored_schema` in place - -Updates only complete column hints, incomplete columns are preserved intact - -#### remove\_defaults - -```python -def remove_defaults(stored_schema: TStoredSchema) -> TStoredSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L66) - -Removes default values from `stored_schema` in place, returns the input for chaining - -Default values are removed from table schemas and complete column schemas. Incomplete columns are preserved intact. - -#### has\_default\_column\_hint\_value - -```python -def has_default_column_hint_value(hint: str, value: Any) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L89) - -Checks if `value` is a default for `hint`. Only known column hints (COLUMN_HINTS) are checked - -#### remove\_column\_defaults - -```python -def remove_column_defaults(column_schema: TColumnSchema) -> TColumnSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L99) - -Removes default values from `column_schema` in place, returns the input for chaining - -#### add\_column\_defaults - -```python -def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L110) - -Adds default boolean hints to column - -#### compile\_simple\_regexes - -```python -def compile_simple_regexes(r: Iterable[TSimpleRegex]) -> REPattern -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L234) - -Compile multiple patterns as one - -#### is\_complete\_column - -```python -def is_complete_column(col: TColumnSchemaBase) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L362) - -Returns true if column contains enough data to be created at the destination. Must contain a name and a data type. Other hints have defaults. - -#### compare\_complete\_columns - -```python -def compare_complete_columns(a: TColumnSchema, b: TColumnSchema) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L367) - -Compares mandatory fields of complete columns - -#### merge\_columns - -```python -def merge_columns(col_a: TColumnSchema, - col_b: TColumnSchema, - merge_defaults: bool = True) -> TColumnSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L374) - -Merges `col_b` into `col_a`. if `merge_defaults` is True, only hints from `col_b` that are not default in `col_a` will be set. 
- -Modifies col_a in place and returns it - -#### diff\_tables - -```python -def diff_tables(tab_a: TTableSchema, - tab_b: TPartialTableSchema) -> TPartialTableSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L386) - -Creates a partial table that contains properties found in `tab_b` that are not present or different in `tab_a`. -The name is always present in returned partial. -It returns new columns (not present in tab_a) and merges columns from tab_b into tab_a (overriding non-default hint values). -If any columns are returned they contain full data (not diffs of columns) - -Raises SchemaException if tables cannot be merged -* when columns with the same name have different data types -* when table links to different parent tables - -#### merge\_tables - -```python -def merge_tables(table: TTableSchema, - partial_table: TPartialTableSchema) -> TPartialTableSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L452) - -Merges "partial_table" into "table". `table` is merged in place. Returns the diff partial table. - -`table` and `partial_table` names must be identical. A table diff is generated and applied to `table`: -* new columns are added, updated columns are replaced from diff -* table hints are added or replaced from diff -* nothing gets deleted - -#### get\_write\_disposition - -```python -def get_write_disposition(tables: TSchemaTables, - table_name: str) -> TWriteDisposition -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L496) - -Returns write disposition of a table if present. If not, looks up into parent table - -#### table\_schema\_has\_type - -```python -def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L510) - -Checks if `table` schema contains column with type _typ - -#### get\_top\_level\_table - -```python -def get_top_level_table(tables: TSchemaTables, - table_name: str) -> TTableSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L515) - -Finds top level (without parent) of a `table_name` following the ancestry hierarchy. - -#### get\_child\_tables - -```python -def get_child_tables(tables: TSchemaTables, - table_name: str) -> List[TTableSchema] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L524) - -Get child tables for table name and return a list of tables ordered by ancestry so the child tables are always after their parents - -#### group\_tables\_by\_resource - -```python -def group_tables_by_resource( - tables: TSchemaTables, - pattern: Optional[REPattern] = None) -> Dict[str, List[TTableSchema]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/schema/utils.py#L539) - -Create a dict of resources and their associated tables and descendant tables -If `pattern` is supplied, the result is filtered to only resource names matching the pattern. 
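
A sketch tying these helpers together. It assumes a `Schema` instance `schema` obtained elsewhere (for example from a loaded pipeline), since none is constructed on this page; `schema.tables` is the `TSchemaTables` mapping these helpers expect:

```python
from dlt.common.schema.utils import get_child_tables, group_tables_by_resource

tables = schema.tables  # assumption: `schema` comes from elsewhere

# map each resource to its tables and descendant tables
for resource, resource_tables in group_tables_by_resource(tables).items():
    print(resource, [t["name"] for t in resource_tables])

# child tables are ordered after their parents, per the guarantee above;
# "my_table" is a placeholder table name
print([t["name"] for t in get_child_tables(tables, "my_table")])
```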
- diff --git a/docs/website/docs/api_reference/common/source.md b/docs/website/docs/api_reference/common/source.md deleted file mode 100644 index 6fcaaf99ed..0000000000 --- a/docs/website/docs/api_reference/common/source.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -sidebar_label: source -title: common.source ---- - -## SourceInfo Objects - -```python -class SourceInfo(NamedTuple) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/source.py#L11) - -Runtime information on the source/resource - -#### set\_current\_pipe\_name - -```python -def set_current_pipe_name(name: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/source.py#L25) - -Set pipe name in current thread - -#### unset\_current\_pipe\_name - -```python -def unset_current_pipe_name() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/source.py#L30) - -Unset pipe name in current thread - -#### get\_current\_pipe\_name - -```python -def get_current_pipe_name() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/source.py#L35) - -Gets pipe name associated with current thread - diff --git a/docs/website/docs/api_reference/common/storages/configuration.md b/docs/website/docs/api_reference/common/storages/configuration.md deleted file mode 100644 index 0d1de313ba..0000000000 --- a/docs/website/docs/api_reference/common/storages/configuration.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -sidebar_label: configuration -title: common.storages.configuration ---- - -## SchemaStorageConfiguration Objects - -```python -@configspec -class SchemaStorageConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L15) - -#### schema\_volume\_path - -path to volume with default schemas - -#### import\_schema\_path - -the import schema from external location - -#### export\_schema\_path - -the export schema to external location - -#### external\_schema\_format - -format in which to expect external schema - -#### external\_schema\_format\_remove\_defaults - -remove default values when exporting schema - -## NormalizeStorageConfiguration Objects - -```python -@configspec -class NormalizeStorageConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L28) - -#### normalize\_volume\_path - -path to volume where normalized loader files will be stored - -## LoadStorageConfiguration Objects - -```python -@configspec -class LoadStorageConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L37) - -#### load\_volume\_path - -path to volume where files to be loaded to analytical storage are stored - -#### delete\_completed\_jobs - -if set to true the folder with completed jobs will be deleted - -## FilesystemConfiguration Objects - -```python -@configspec -class FilesystemConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L49) - -A configuration defining filesystem location and access credentials. 
- -When configuration is resolved, `bucket_url` is used to extract a protocol and request corresponding credentials class. -* s3 -* gs, gcs -* az, abfs, adl -* file, memory -* gdrive - -#### protocol - -```python -@property -def protocol() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L74) - -`bucket_url` protocol - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L93) - -Returns a fingerprint of bucket_url - -#### \_\_str\_\_ - -```python -def __str__() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/configuration.py#L99) - -Return displayable destination location - diff --git a/docs/website/docs/api_reference/common/storages/file_storage.md b/docs/website/docs/api_reference/common/storages/file_storage.md deleted file mode 100644 index 71dd9676e9..0000000000 --- a/docs/website/docs/api_reference/common/storages/file_storage.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -sidebar_label: file_storage -title: common.storages.file_storage ---- - -## FileStorage Objects - -```python -class FileStorage() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L17) - -#### list\_folder\_files - -```python -def list_folder_files(relative_path: str, to_root: bool = True) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L109) - -List all files in ``relative_path`` folder - -**Arguments**: - -- `relative_path` _str_ - A path to folder, relative to storage root -- `to_root` _bool, optional_ - If True returns paths to files in relation to root, if False, returns just file names. Defaults to True. - - -**Returns**: - -- `List[str]` - A list of file names with optional path as per ``to_root`` parameter - -#### atomic\_rename - -```python -def atomic_rename(from_relative_path: str, to_relative_path: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L147) - -Renames a path using os.rename which is atomic on POSIX, Windows and NFS v4. - -Method falls back to non-atomic method in following cases: -1. On Windows when destination file exists -2. If underlying file system does not support atomic rename -3. All buckets mapped with FUSE are not atomic - -#### rename\_tree - -```python -def rename_tree(from_relative_path: str, to_relative_path: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L161) - -Renames a tree using os.rename if possible making it atomic - -If we get 'too many open files': in that case `rename_tree_files is used - -#### rename\_tree\_files - -```python -def rename_tree_files(from_relative_path: str, to_relative_path: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L175) - -Renames files in a tree recursively using os.rename. 
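
A usage sketch of the listing and renaming methods above; the constructor arguments are an assumption (they are not documented on this page) and the folder names are placeholders:

```python
from dlt.common.storages.file_storage import FileStorage

# assumption: root path constructor with `makedirs`, not shown in this excerpt
storage = FileStorage("/tmp/demo_storage", makedirs=True)

# list file names only (relative to the folder), then move each one atomically
names = storage.list_folder_files("jobs", to_root=False)
for name in names:
    storage.atomic_rename(f"jobs/{name}", f"done/{name}")
```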
- -#### atomic\_import - -```python -def atomic_import(external_file_path: str, to_folder: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L198) - -Moves a file at `external_file_path` into the `to_folder` effectively importing file into storage - -#### open\_zipsafe\_ro - -```python -@staticmethod -def open_zipsafe_ro(path: str, mode: str = "r", **kwargs: Any) -> IO[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L259) - -Opens a file using gzip.open if it is a gzip file, otherwise uses open. - -#### is\_gzipped - -```python -@staticmethod -def is_gzipped(path: str) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/file_storage.py#L276) - -Checks if file under path is gzipped by reading a header - diff --git a/docs/website/docs/api_reference/common/storages/filesystem.md b/docs/website/docs/api_reference/common/storages/filesystem.md deleted file mode 100644 index b16644ed0f..0000000000 --- a/docs/website/docs/api_reference/common/storages/filesystem.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -sidebar_label: filesystem -title: common.storages.filesystem ---- - -## FileItem Objects - -```python -class FileItem(TypedDict) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/filesystem.py#L16) - -A DataItem representing a file - -#### filesystem - -```python -def filesystem( - protocol: str, - credentials: FileSystemCredentials = None -) -> Tuple[AbstractFileSystem, str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/filesystem.py#L41) - -Instantiates an authenticated fsspec `FileSystem` for a given `protocol` and credentials. - -Please supply credentials instance corresponding to the protocol - -#### filesystem\_from\_config - -```python -def filesystem_from_config( - config: FilesystemConfiguration) -> Tuple[AbstractFileSystem, str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/filesystem.py#L50) - -Instantiates an authenticated fsspec `FileSystem` from `config` argument. 
- -Authenticates following filesystems: -* s3 -* az, abfs -* gcs, gs - -All other filesystems are not authenticated - -Returns: (fsspec filesystem, normalized url) - diff --git a/docs/website/docs/api_reference/common/storages/load_storage.md b/docs/website/docs/api_reference/common/storages/load_storage.md deleted file mode 100644 index f57bdd56af..0000000000 --- a/docs/website/docs/api_reference/common/storages/load_storage.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -sidebar_label: load_storage -title: common.storages.load_storage ---- - -## LoadStorage Objects - -```python -class LoadStorage(DataItemStorage, VersionedStorage) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/load_storage.py#L125) - -#### NORMALIZED\_FOLDER - -folder within the volume where load packages are stored - -#### LOADED\_FOLDER - -folder to keep the loads that were completely processed - -#### SCHEMA\_UPDATES\_FILE\_NAME - -updates to the tables in schema created by normalizer - -#### APPLIED\_SCHEMA\_UPDATES\_FILE\_NAME - -updates applied to the destination - -#### SCHEMA\_FILE\_NAME - -package schema - -#### PACKAGE\_COMPLETED\_FILE\_NAME - -completed package marker file, currently only to store data with os.stat - -#### list\_failed\_jobs\_in\_completed\_package - -```python -def list_failed_jobs_in_completed_package( - load_id: str) -> Sequence[LoadJobInfo] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/load_storage.py#L246) - -List all failed jobs and associated error messages for a completed load package with `load_id` - -#### get\_load\_package\_info - -```python -def get_load_package_info(load_id: str) -> LoadPackageInfo -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/load_storage.py#L258) - -Gets information on normalized/completed package with given load_id, all jobs and their statuses. 
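For inspection, both package helpers above need only a `load_id`. A sketch assuming a `LoadStorage` instance is already available from a pipeline run (its constructor and configuration are outside this page); it prints the returned info objects rather than enumerating their fields:

```python
from dlt.common.storages.load_storage import LoadStorage


def report_package(load_storage: LoadStorage, load_id: str) -> None:
    # schema name, package state and all jobs with their statuses
    package_info = load_storage.get_load_package_info(load_id)
    print(package_info)

    # failed jobs and their error messages, for a package that already completed
    for failed_job in load_storage.list_failed_jobs_in_completed_package(load_id):
        print(failed_job)
```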
- -#### commit\_schema\_update - -```python -def commit_schema_update(load_id: str, applied_update: TSchemaTables) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/load_storage.py#L300) - -Marks schema update as processed and stores the update that was applied at the destination - -#### add\_new\_job - -```python -def add_new_job(load_id: str, - job_file_path: str, - job_state: TJobState = "new_jobs") -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/load_storage.py#L310) - -Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id` - diff --git a/docs/website/docs/api_reference/common/storages/normalize_storage.md b/docs/website/docs/api_reference/common/storages/normalize_storage.md deleted file mode 100644 index f9119d1527..0000000000 --- a/docs/website/docs/api_reference/common/storages/normalize_storage.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -sidebar_label: normalize_storage -title: common.storages.normalize_storage ---- - -## NormalizeStorage Objects - -```python -class NormalizeStorage(VersionedStorage) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/normalize_storage.py#L17) - -#### EXTRACTED\_FOLDER - -folder within the volume where extracted files to be normalized are stored - diff --git a/docs/website/docs/api_reference/common/storages/transactional_file.md b/docs/website/docs/api_reference/common/storages/transactional_file.md deleted file mode 100644 index a95a45978c..0000000000 --- a/docs/website/docs/api_reference/common/storages/transactional_file.md +++ /dev/null @@ -1,180 +0,0 @@ ---- -sidebar_label: transactional_file -title: common.storages.transactional_file ---- - -Transactional file system operations. - -The lock implementation allows for multiple readers and a single writer. -It can be used to operate on a single file atomically both locally and on -cloud storage. The lock can be used to operate on entire directories by -creating a lock file that resolves to an agreed upon path across processes. - -#### lock\_id - -```python -def lock_id(k: int = 4) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L22) - -Generate a time based random id. - -**Arguments**: - -- `k` - The length of the suffix after the timestamp. - - -**Returns**: - - A time sortable uuid. - -## Heartbeat Objects - -```python -class Heartbeat(Timer) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L35) - -A thread designed to periodically execute a fn. - -## TransactionalFile Objects - -```python -class TransactionalFile() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L45) - -A transaction handler which wraps a file path. - -#### \_\_init\_\_ - -```python -def __init__(path: str, fs: fsspec.AbstractFileSystem) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L51) - -Creates a new FileTransactionHandler. - -**Arguments**: - -- `path` - The path to lock. -- `fs` - The fsspec file system. 
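A construction sketch for the lock primitives above, using a local `fsspec` filesystem; the `/tmp` path is a placeholder, and any authenticated fsspec implementation could stand in for it:

```python
import os

import fsspec

from dlt.common.storages.transactional_file import TransactionalFile, lock_id

# time-sortable random id, e.g. to name an agreed-upon lock target for a directory
print(lock_id())

os.makedirs("/tmp/dlt_tx_demo", exist_ok=True)
fs = fsspec.filesystem("file")

# wrap a concrete path; the same code works against cloud storage given credentials
tx_file = TransactionalFile("/tmp/dlt_tx_demo/state.json", fs)
print(tx_file.read())  # read() (documented below) returns None while the file does not exist
```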
- -#### read - -```python -def read() -> t.Optional[bytes] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L116) - -Reads data from the file if it exists. - -#### write - -```python -def write(content: bytes) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L122) - -Writes data within the transaction. - -#### rollback - -```python -def rollback() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L130) - -Rolls back the transaction. - -#### acquire\_lock - -```python -def acquire_lock(blocking: bool = True, - timeout: float = -1, - jitter_mean: float = 0) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L139) - -Acquires a lock on a path. Mimics the stdlib's `threading.Lock` interface. - -Acquire a lock, blocking or non-blocking. - -When invoked with the blocking argument set to True (the default), block until -the lock is unlocked, then set it to locked and return True. - -When invoked with the blocking argument set to False, do not block. If a call -with blocking set to True would block, return False immediately; otherwise, set -the lock to locked and return True. - -When invoked with the floating-point timeout argument set to a positive value, -block for at most the number of seconds specified by timeout and as long as the -lock cannot be acquired. A timeout argument of -1 specifies an unbounded wait. -If blocking is False, timeout is ignored. The stdlib would raise a ValueError. - -If you expect a large concurrency on the lock, you can set the jitter_mean to a -value > 0. This will introduce a short random gap before locking procedure -starts. - -The return value is True if the lock is acquired successfully, False if -not (for example if the timeout expired). - -#### release\_lock - -```python -def release_lock() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L192) - -Releases a lock on a key. - -This is idempotent and safe to call multiple times. - -#### lock - -```python -@contextmanager -def lock(timeout: t.Optional[float] = None) -> t.Iterator[None] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L204) - -A context manager that acquires and releases a lock on a path. - -This is a convenience method for acquiring a lock and reading the contents of -the file. It will release the lock when the context manager exits. This is -useful for reading a file and then writing it back out as a transaction. If -the lock cannot be acquired, this will raise a RuntimeError. If the lock is -acquired, the contents of the file will be returned. If the file does not -exist, None will be returned. If an exception is raised within the context -manager, the transaction will be rolled back. - -**Arguments**: - -- `timeout` - The timeout for acquiring the lock. If None, this will use the - default timeout. A timeout of -1 will wait indefinitely. - - -**Raises**: - -- `RuntimeError` - If the lock cannot be acquired. 
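Putting those pieces together, the typical pattern is a read-modify-write section guarded by `lock()`. A sketch against the local filesystem; the path is a placeholder and the commit/rollback behavior follows the descriptions above:

```python
import os

import fsspec

from dlt.common.storages.transactional_file import TransactionalFile

os.makedirs("/tmp/dlt_tx_demo", exist_ok=True)
counter = TransactionalFile("/tmp/dlt_tx_demo/counter.txt", fsspec.filesystem("file"))

# single writer: block up to 10 seconds for the lock, then read, modify and write;
# if the body raises, the transaction is rolled back; the lock is released on exit
with counter.lock(timeout=10):
    current = counter.read()  # None if the file does not exist yet
    value = int(current) + 1 if current is not None else 1
    counter.write(str(value).encode("utf-8"))
```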
- -#### \_\_del\_\_ - -```python -def __del__() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/storages/transactional_file.py#L234) - -Stop the heartbeat thread on gc. Locks should be released explicitly. - diff --git a/docs/website/docs/api_reference/common/time.md b/docs/website/docs/api_reference/common/time.md deleted file mode 100644 index 728d289f09..0000000000 --- a/docs/website/docs/api_reference/common/time.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -sidebar_label: time -title: common.time ---- - -#### timestamp\_within - -```python -def timestamp_within(timestamp: float, min_exclusive: Optional[float], - max_inclusive: Optional[float]) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/time.py#L16) - -check if timestamp within range uniformly treating none and range inclusiveness - -#### timestamp\_before - -```python -def timestamp_before(timestamp: float, max_inclusive: Optional[float]) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/time.py#L23) - -check if timestamp is before max timestamp, inclusive - -#### ensure\_pendulum\_date - -```python -def ensure_pendulum_date(value: TAnyDateTime) -> pendulum.Date -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/time.py#L48) - -Coerce a date/time value to a `pendulum.Date` object. - -UTC is assumed if the value is not timezone aware. Other timezones are shifted to UTC - -**Arguments**: - -- `value` - The value to coerce. Can be a pendulum.DateTime, pendulum.Date, datetime, date or iso date/time str. - - -**Returns**: - - A timezone aware pendulum.Date object. - -#### ensure\_pendulum\_datetime - -```python -def ensure_pendulum_datetime(value: TAnyDateTime) -> pendulum.DateTime -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/time.py#L75) - -Coerce a date/time value to a `pendulum.DateTime` object. - -UTC is assumed if the value is not timezone aware. Other timezones are shifted to UTC - -**Arguments**: - -- `value` - The value to coerce. Can be a pendulum.DateTime, pendulum.Date, datetime, date or iso date/time str. - - -**Returns**: - - A timezone aware pendulum.DateTime object in UTC timezone. - -#### ensure\_pendulum\_time - -```python -def ensure_pendulum_time(value: Union[str, datetime.time]) -> pendulum.Time -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/time.py#L102) - -Coerce a time value to a `pendulum.Time` object. - -**Arguments**: - -- `value` - The value to coerce. Can be a `pendulum.Time` / `datetime.time` or an iso time string. 
- - -**Returns**: - - A pendulum.Time object - diff --git a/docs/website/docs/api_reference/common/typing.md b/docs/website/docs/api_reference/common/typing.md deleted file mode 100644 index 31c820038b..0000000000 --- a/docs/website/docs/api_reference/common/typing.md +++ /dev/null @@ -1,131 +0,0 @@ ---- -sidebar_label: typing -title: common.typing ---- - -#### StrAny - -immutable, covariant entity - -#### StrStr - -immutable, covariant entity - -#### StrStrStr - -immutable, covariant entity - -#### TFun - -any function - -#### TSecretValue - -type: ignore - -#### TSecretStrValue - -type: ignore - -#### TDataItem - -A single data item as extracted from data source - -#### TDataItems - -A single data item or a list as extracted from the data source - -#### TAnyDateTime - -DateTime represented as pendulum/python object, ISO string or unix timestamp - -#### ConfigValue - -value of type None indicating argument that may be injected by config provider - -## SupportsVariant Objects - -```python -@runtime_checkable -class SupportsVariant(Protocol, Generic[TVariantBase]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/typing.py#L49) - -Defines variant type protocol that should be recognized by normalizers - -Variant types behave like TVariantBase type (ie. Decimal) but also implement the protocol below that is used to extract the variant value from it. -See `Wei` type declaration which returns Decimal or str for values greater than supported by destination warehouse. - -## SupportsHumanize Objects - -```python -class SupportsHumanize(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/typing.py#L59) - -#### asdict - -```python -def asdict() -> DictStrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/typing.py#L60) - -Represents object as dict with a schema loadable by dlt - -#### asstr - -```python -def asstr(verbosity: int = 0) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/typing.py#L64) - -Represents object as human readable string - -#### extract\_inner\_type - -```python -def extract_inner_type(hint: Type[Any], - preserve_new_types: bool = False) -> Type[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/typing.py#L111) - -Gets the inner type from Literal, Optional, Final and NewType - -**Arguments**: - -- `hint` _Type[Any]_ - Type to extract -- `preserve_new_types` _bool_ - Do not extract supertype of a NewType - - -**Returns**: - -- `Type[Any]` - Inner type if hint was Literal, Optional or NewType, otherwise hint - -#### get\_generic\_type\_argument\_from\_instance - -```python -def get_generic_type_argument_from_instance( - instance: Any, sample_value: Optional[Any]) -> Type[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/typing.py#L137) - -Infers type argument of a Generic class from an `instance` of that class using optional `sample_value` of the argument type - -Inference depends on the presence of __orig_class__ attribute in instance, if not present - sample_Value will be used - -**Arguments**: - -- `instance` _Any_ - instance of Generic class -- `sample_value` _Optional[Any]_ - instance of type of generic class, optional - - -**Returns**: - -- `Type[Any]` - type argument or Any if not 
known - diff --git a/docs/website/docs/api_reference/common/utils.md b/docs/website/docs/api_reference/common/utils.md deleted file mode 100644 index c30d356560..0000000000 --- a/docs/website/docs/api_reference/common/utils.md +++ /dev/null @@ -1,246 +0,0 @@ ---- -sidebar_label: utils -title: common.utils ---- - -#### uniq\_id - -```python -def uniq_id(len_: int = 16) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L30) - -Returns a hex encoded crypto-grade string of random bytes with desired len_ - -#### uniq\_id\_base64 - -```python -def uniq_id_base64(len_: int = 16) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L35) - -Returns a base64 encoded crypto-grade string of random bytes with desired len_ - -#### digest128 - -```python -def digest128(v: str, len_: int = 15) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L40) - -Returns a base64 encoded shake128 hash of str `v` with digest of length `len_` (default: 15 bytes = 20 characters length) - -#### digest128b - -```python -def digest128b(v: bytes, len_: int = 15) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L45) - -Returns a base64 encoded shake128 hash of bytes `v` with digest of length `len_` (default: 15 bytes = 20 characters length) - -#### flatten\_list\_of\_str\_or\_dicts - -```python -def flatten_list_of_str_or_dicts(seq: Sequence[Union[StrAny, str]]) -> StrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L80) - -Transforms a list of objects or strings [{K: {...}}, L, ...] -> {K: {...}, L: None, ...} - -#### concat\_strings\_with\_limit - -```python -def concat_strings_with_limit(strings: List[str], separator: str, - limit: int) -> Iterator[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L143) - -Generator function to concatenate strings. - -The function takes a list of strings and concatenates them into a single string such that the length of each -concatenated string does not exceed a specified limit. It yields each concatenated string as it is created. -The strings are separated by a specified separator. - -**Arguments**: - -- `strings` _List[str]_ - The list of strings to be concatenated. -- `separator` _str_ - The separator to use between strings. Defaults to a single space. -- `limit` _int_ - The maximum length for each concatenated string. - - -**Yields**: - - Generator[str, None, None]: A generator that yields each concatenated string. - -#### graph\_edges\_to\_nodes - -```python -def graph_edges_to_nodes(edges: Sequence[Tuple[TAny, TAny]], - directed: bool = True) -> Dict[TAny, Set[TAny]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L178) - -Converts a directed graph represented as a sequence of edges to a graph represented as a mapping from nodes a set of connected nodes. - -Isolated nodes are represented as edges to itself. If `directed` is `False`, each edge is duplicated but going in opposite direction. 
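A few of the helpers above in one place; the example values are made up and the printed shapes are only indicative:

```python
from dlt.common.utils import digest128, graph_edges_to_nodes, uniq_id

# random identifier and a stable, short content hash
print(uniq_id(8))              # hex string built from 8 random bytes
print(digest128("customers"))  # 20-character base64 shake128 digest

# adjacency mapping built from directed edges
edges = [("a", "b"), ("b", "c"), ("a", "c")]
print(graph_edges_to_nodes(edges))

# undirected form: every edge is also added in the opposite direction,
# which is the input expected by graph_find_scc_nodes documented below
print(graph_edges_to_nodes(edges, directed=False))
```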
- -#### graph\_find\_scc\_nodes - -```python -def graph_find_scc_nodes(undag: Dict[TAny, Set[TAny]]) -> List[Set[TAny]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L197) - -Finds and returns a list of sets of nodes in strongly connected components of a `undag` which is undirected - -To obtain undirected graph from edges use `graph_edges_to_nodes` function with `directed` argument `False`. - -#### update\_dict\_with\_prune - -```python -def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L226) - -Updates values that are both in `dest` and `update` and deletes `dest` values that are None in `update` - -#### map\_nested\_in\_place - -```python -def map_nested_in_place(func: AnyFun, _complex: TAny) -> TAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L259) - -Applies `func` to all elements in `_dict` recursively, replacing elements in nested dictionaries and lists in place. - -#### is\_interactive - -```python -def is_interactive() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L284) - -Determine if the current environment is interactive. - -**Returns**: - -- `bool` - True if interactive (e.g., REPL, IPython, Jupyter Notebook), False if running as a script. - -#### custom\_environ - -```python -@contextmanager -def custom_environ(env: StrStr) -> Iterator[None] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L305) - -Temporarily set environment variables inside the context manager and -fully restore previous environment afterwards - -#### multi\_context\_manager - -```python -@contextmanager -def multi_context_manager( - managers: Sequence[ContextManager[Any]]) -> Iterator[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L360) - -A context manager holding several other context managers. Enters and exists all of them. 
Yields from the last in the list - -#### is\_inner\_callable - -```python -def is_inner_callable(f: AnyFun) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L388) - -Checks if f is defined within other function - -#### get\_module\_name - -```python -def get_module_name(m: ModuleType) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L402) - -Gets module name from module with a fallback for executing module __main__ - -#### derives\_from\_class\_of\_name - -```python -def derives_from_class_of_name(o: object, name: str) -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L411) - -Checks if object o has class of name in its derivation tree - -#### compressed\_b64encode - -```python -def compressed_b64encode(value: bytes) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L417) - -Compress and b64 encode the given bytestring - -#### compressed\_b64decode - -```python -def compressed_b64decode(value: str) -> bytes -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L422) - -Decode a bytestring encoded with `compressed_b64encode` - -#### merge\_row\_count - -```python -def merge_row_count(row_counts_1: TRowCount, row_counts_2: TRowCount) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L436) - -merges row counts_2 into row_counts_1 - -#### extend\_list\_deduplicated - -```python -def extend_list_deduplicated(original_list: List[Any], - extending_list: Iterable[Any]) -> List[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L443) - -extends the first list by the second, but does not add duplicates - -#### maybe\_context - -```python -@contextmanager -def maybe_context(manager: ContextManager[TAny]) -> Iterator[TAny] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/utils.py#L453) - -Allows context manager `manager` to be None by creating dummy context. Otherwise `manager` is used - diff --git a/docs/website/docs/api_reference/common/validation.md b/docs/website/docs/api_reference/common/validation.md deleted file mode 100644 index d68708a404..0000000000 --- a/docs/website/docs/api_reference/common/validation.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -sidebar_label: validation -title: common.validation ---- - -#### validate\_dict - -```python -def validate_dict(spec: Type[_TypedDict], - doc: StrAny, - path: str, - filter_f: TFilterFunc = None, - validator_f: TCustomValidator = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/common/validation.py#L12) - -Validate the `doc` dictionary based on the given typed dictionary specification `spec`. - -**Arguments**: - -- `spec` _Type[_TypedDict]_ - The typed dictionary that `doc` should conform to. -- `doc` _StrAny_ - The dictionary to validate. -- `path` _str_ - The string representing the location of the dictionary - in a hierarchical data structure. -- `filter_f` _TFilterFunc, optional_ - A function to filter keys in `doc`. It should - return `True` for keys to be kept. 
Defaults to a function that keeps all keys. -- `validator_f` _TCustomValidator, optional_ - A function to perform additional validation - for types not covered by this function. It should return `True` if the validation passes. - Defaults to a function that rejects all such types. - - -**Raises**: - -- `DictValidationException` - If there are missing required fields, unexpected fields, - type mismatches or unvalidated types in `doc` compared to `spec`. - - -**Returns**: - - None - diff --git a/docs/website/docs/api_reference/destinations/athena/athena.md b/docs/website/docs/api_reference/destinations/athena/athena.md deleted file mode 100644 index 1c4de72b90..0000000000 --- a/docs/website/docs/api_reference/destinations/athena/athena.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -sidebar_label: athena -title: destinations.athena.athena ---- - -## DoNothingJob Objects - -```python -class DoNothingJob(LoadJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/athena/athena.py#L96) - -The most lazy class of dlt - -## AthenaClient Objects - -```python -class AthenaClient(SqlJobClientBase) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/athena/athena.py#L251) - -#### start\_file\_load - -```python -def start_file_load(table: TTableSchema, file_path: str, - load_id: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/athena/athena.py#L322) - -Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs - diff --git a/docs/website/docs/api_reference/destinations/athena/configuration.md b/docs/website/docs/api_reference/destinations/athena/configuration.md deleted file mode 100644 index 554b505035..0000000000 --- a/docs/website/docs/api_reference/destinations/athena/configuration.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.athena.configuration ---- - -## AthenaClientConfiguration Objects - -```python -@configspec -class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/athena/configuration.py#L9) - -#### destination\_name - -type: ignore[misc] - diff --git a/docs/website/docs/api_reference/destinations/bigquery/bigquery.md b/docs/website/docs/api_reference/destinations/bigquery/bigquery.md deleted file mode 100644 index 12fd5cb088..0000000000 --- a/docs/website/docs/api_reference/destinations/bigquery/bigquery.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -sidebar_label: bigquery -title: destinations.bigquery.bigquery ---- - -## BigQueryClient Objects - -```python -class BigQueryClient(SqlJobClientWithStaging) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/bigquery/bigquery.py#L134) - -#### restore\_file\_load - -```python -def restore_file_load(file_path: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/bigquery/bigquery.py#L156) - -Returns a completed SqlLoadJob or restored BigQueryLoadJob - -See base class for details on SqlLoadJob. 
BigQueryLoadJob is restored with job id derived from `file_path` - -**Arguments**: - -- `file_path` _str_ - a path to a job file - - -**Returns**: - -- `LoadJob` - completed SqlLoadJob or restored BigQueryLoadJob - diff --git a/docs/website/docs/api_reference/destinations/bigquery/configuration.md b/docs/website/docs/api_reference/destinations/bigquery/configuration.md deleted file mode 100644 index b35cb287f0..0000000000 --- a/docs/website/docs/api_reference/destinations/bigquery/configuration.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.bigquery.configuration ---- - -## BigQueryClientConfiguration Objects - -```python -@configspec -class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration - ) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/bigquery/configuration.py#L12) - -#### http\_timeout - -connection timeout for http request to BigQuery api - -#### file\_upload\_timeout - -a timeout for file upload when loading local files - -#### retry\_deadline - -how long to retry the operation in case of error, the backoff 60s - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/bigquery/configuration.py#L31) - -Returns a fingerprint of project_id - diff --git a/docs/website/docs/api_reference/destinations/bigquery/sql_client.md b/docs/website/docs/api_reference/destinations/bigquery/sql_client.md deleted file mode 100644 index 12288b644c..0000000000 --- a/docs/website/docs/api_reference/destinations/bigquery/sql_client.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -sidebar_label: sql_client -title: destinations.bigquery.sql_client ---- - -## BigQueryDBApiCursorImpl Objects - -```python -class BigQueryDBApiCursorImpl(DBApiCursorImpl) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/bigquery/sql_client.py#L28) - -Use native BigQuery data frame support if available - -#### native\_cursor - -type: ignore - diff --git a/docs/website/docs/api_reference/destinations/duckdb/configuration.md b/docs/website/docs/api_reference/destinations/duckdb/configuration.md deleted file mode 100644 index b885a318c1..0000000000 --- a/docs/website/docs/api_reference/destinations/duckdb/configuration.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.duckdb.configuration ---- - -## DuckDbBaseCredentials Objects - -```python -@configspec -class DuckDbBaseCredentials(ConnectionStringCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/duckdb/configuration.py#L19) - -#### read\_only - -open database read/write - -## DuckDbCredentials Objects - -```python -@configspec -class DuckDbCredentials(DuckDbBaseCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/duckdb/configuration.py#L92) - -#### drivername - -type: ignore - -## DuckDbClientConfiguration Objects - -```python -@configspec -class DuckDbClientConfiguration(DestinationClientDwhWithStagingConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/duckdb/configuration.py#L178) - -#### destination\_name - -type: ignore - -#### create\_indexes - -should unique indexes be created, this slows 
loading down massively - diff --git a/docs/website/docs/api_reference/destinations/duckdb/sql_client.md b/docs/website/docs/api_reference/destinations/duckdb/sql_client.md deleted file mode 100644 index d122dbf143..0000000000 --- a/docs/website/docs/api_reference/destinations/duckdb/sql_client.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -sidebar_label: sql_client -title: destinations.duckdb.sql_client ---- - -## DuckDBDBApiCursorImpl Objects - -```python -class DuckDBDBApiCursorImpl(DBApiCursorImpl) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/duckdb/sql_client.py#L15) - -Use native BigQuery data frame support if available - -#### native\_cursor - -type: ignore - diff --git a/docs/website/docs/api_reference/destinations/dummy/dummy.md b/docs/website/docs/api_reference/destinations/dummy/dummy.md deleted file mode 100644 index 872e144f8a..0000000000 --- a/docs/website/docs/api_reference/destinations/dummy/dummy.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -sidebar_label: dummy -title: destinations.dummy.dummy ---- - -## DummyClient Objects - -```python -class DummyClient(JobClientBase) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/dummy/dummy.py#L72) - -dummy client storing jobs in memory - -#### create\_table\_chain\_completed\_followup\_jobs - -```python -def create_table_chain_completed_followup_jobs( - table_chain: Sequence[TTableSchema]) -> List[NewLoadJob] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/dummy/dummy.py#L115) - -Creates a list of followup jobs that should be executed after a table chain is completed - diff --git a/docs/website/docs/api_reference/destinations/filesystem/configuration.md b/docs/website/docs/api_reference/destinations/filesystem/configuration.md deleted file mode 100644 index 4f47f948f8..0000000000 --- a/docs/website/docs/api_reference/destinations/filesystem/configuration.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.filesystem.configuration ---- - -## FilesystemDestinationClientConfiguration Objects - -```python -@configspec -class FilesystemDestinationClientConfiguration( - FilesystemConfiguration, DestinationClientStagingConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/filesystem/configuration.py#L11) - -#### destination\_name - -type: ignore - diff --git a/docs/website/docs/api_reference/destinations/filesystem/filesystem.md b/docs/website/docs/api_reference/destinations/filesystem/filesystem.md deleted file mode 100644 index 3966580597..0000000000 --- a/docs/website/docs/api_reference/destinations/filesystem/filesystem.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -sidebar_label: filesystem -title: destinations.filesystem.filesystem ---- - -## FilesystemClient Objects - -```python -class FilesystemClient(JobClientBase) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/filesystem/filesystem.py#L71) - -filesystem client storing jobs in memory - diff --git a/docs/website/docs/api_reference/destinations/insert_job_client.md b/docs/website/docs/api_reference/destinations/insert_job_client.md deleted file mode 100644 index 5ec1f30c6d..0000000000 --- a/docs/website/docs/api_reference/destinations/insert_job_client.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -sidebar_label: 
insert_job_client -title: destinations.insert_job_client ---- - -## InsertValuesJobClient Objects - -```python -class InsertValuesJobClient(SqlJobClientWithStaging) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/insert_job_client.py#L92) - -#### restore\_file\_load - -```python -def restore_file_load(file_path: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/insert_job_client.py#L94) - -Returns a completed SqlLoadJob or InsertValuesJob - -Returns completed jobs as SqlLoadJob and InsertValuesJob executed atomically in start_file_load so any jobs that should be recreated are already completed. -Obviously the case of asking for jobs that were never created will not be handled. With correctly implemented loader that cannot happen. - -**Arguments**: - -- `file_path` _str_ - a path to a job file - - -**Returns**: - -- `LoadJob` - Always a restored job completed - diff --git a/docs/website/docs/api_reference/destinations/job_client_impl.md b/docs/website/docs/api_reference/destinations/job_client_impl.md deleted file mode 100644 index 12fcd3bc9c..0000000000 --- a/docs/website/docs/api_reference/destinations/job_client_impl.md +++ /dev/null @@ -1,96 +0,0 @@ ---- -sidebar_label: job_client_impl -title: destinations.job_client_impl ---- - -## SqlLoadJob Objects - -```python -class SqlLoadJob(LoadJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L35) - -A job executing sql statement, without followup trait - -## SqlJobClientBase Objects - -```python -class SqlJobClientBase(JobClientBase, WithStateSync) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L87) - -#### maybe\_ddl\_transaction - -```python -@contextlib.contextmanager -def maybe_ddl_transaction() -> Iterator[None] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L135) - -Begins a transaction if sql client supports it, otherwise works in auto commit - -#### create\_table\_chain\_completed\_followup\_jobs - -```python -def create_table_chain_completed_followup_jobs( - table_chain: Sequence[TTableSchema]) -> List[NewLoadJob] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L161) - -Creates a list of followup jobs for merge write disposition and staging replace strategies - -#### start\_file\_load - -```python -def start_file_load(table: TTableSchema, file_path: str, - load_id: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L173) - -Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs - -#### restore\_file\_load - -```python -def restore_file_load(file_path: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L180) - -Returns a completed SqlLoadJob or None to let derived classes to handle their specific jobs - -Returns completed jobs as SqlLoadJob is executed atomically in start_file_load so any jobs that should be recreated are already completed. 
-Obviously the case of asking for jobs that were never created will not be handled. With correctly implemented loader that cannot happen. - -**Arguments**: - -- `file_path` _str_ - a path to a job file - - -**Returns**: - -- `LoadJob` - A restored job or none - -## SqlJobClientWithStaging Objects - -```python -class SqlJobClientWithStaging(SqlJobClientBase, WithStagingDataset) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L439) - -#### get\_stage\_dispositions - -```python -def get_stage_dispositions() -> List[TWriteDisposition] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_client_impl.py#L445) - -Returns a list of dispositions that require staging tables to be populated - diff --git a/docs/website/docs/api_reference/destinations/job_impl.md b/docs/website/docs/api_reference/destinations/job_impl.md deleted file mode 100644 index cddd72b2ee..0000000000 --- a/docs/website/docs/api_reference/destinations/job_impl.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -sidebar_label: job_impl -title: destinations.job_impl ---- - -## NewLoadJobImpl Objects - -```python -class NewLoadJobImpl(EmptyLoadJobWithoutFollowup, NewLoadJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_impl.py#L30) - -#### new\_file\_path - -```python -def new_file_path() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/job_impl.py#L37) - -Path to a newly created temporary job file - diff --git a/docs/website/docs/api_reference/destinations/motherduck/configuration.md b/docs/website/docs/api_reference/destinations/motherduck/configuration.md deleted file mode 100644 index 38f515d5a3..0000000000 --- a/docs/website/docs/api_reference/destinations/motherduck/configuration.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.motherduck.configuration ---- - -## MotherDuckCredentials Objects - -```python -@configspec -class MotherDuckCredentials(DuckDbBaseCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/motherduck/configuration.py#L15) - -#### drivername - -type: ignore - -#### read\_only - -open database read/write - -## MotherDuckClientConfiguration Objects - -```python -@configspec -class MotherDuckClientConfiguration( - DestinationClientDwhWithStagingConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/motherduck/configuration.py#L42) - -#### destination\_name - -type: ignore - -#### create\_indexes - -should unique indexes be created, this slows loading down massively - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/motherduck/configuration.py#L48) - -Returns a fingerprint of user access token - diff --git a/docs/website/docs/api_reference/destinations/mssql/configuration.md b/docs/website/docs/api_reference/destinations/mssql/configuration.md deleted file mode 100644 index b01c780d12..0000000000 --- a/docs/website/docs/api_reference/destinations/mssql/configuration.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.mssql.configuration ---- - -## MsSqlCredentials 
Objects - -```python -@configspec -class MsSqlCredentials(ConnectionStringCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/mssql/configuration.py#L14) - -#### drivername - -type: ignore - -## MsSqlClientConfiguration Objects - -```python -@configspec -class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/mssql/configuration.py#L76) - -#### destination\_name - -type: ignore - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/mssql/configuration.py#L82) - -Returns a fingerprint of host part of a connection string - diff --git a/docs/website/docs/api_reference/destinations/mssql/mssql.md b/docs/website/docs/api_reference/destinations/mssql/mssql.md deleted file mode 100644 index 7fde91a1e7..0000000000 --- a/docs/website/docs/api_reference/destinations/mssql/mssql.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -sidebar_label: mssql -title: destinations.mssql.mssql ---- - -## MsSqlMergeJob Objects - -```python -class MsSqlMergeJob(SqlMergeJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/mssql/mssql.py#L68) - -#### gen\_key\_table\_clauses - -```python -@classmethod -def gen_key_table_clauses(cls, root_table_name: str, - staging_root_table_name: str, - key_clauses: Sequence[str], - for_delete: bool) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/mssql/mssql.py#L70) - -Generate sql clauses that may be used to select or delete rows in root table of destination dataset - diff --git a/docs/website/docs/api_reference/destinations/path_utils.md b/docs/website/docs/api_reference/destinations/path_utils.md deleted file mode 100644 index 35a5ff3faf..0000000000 --- a/docs/website/docs/api_reference/destinations/path_utils.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -sidebar_label: path_utils -title: destinations.path_utils ---- - -#### create\_path - -```python -def create_path(layout: str, schema_name: str, table_name: str, load_id: str, - file_id: str, ext: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/path_utils.py#L35) - -create a filepath from the layout and our default params - -#### get\_table\_prefix\_layout - -```python -def get_table_prefix_layout( - layout: str, - supported_prefix_placeholders: Sequence[ - str] = SUPPORTED_TABLE_NAME_PREFIX_PLACEHOLDERS -) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/path_utils.py#L52) - -get layout fragment that defines positions of the table, cutting other placeholders - -allowed `supported_prefix_placeholders` that may appear before table. 
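To make the two path helpers concrete, here is a sketch with a layout string modeled on the filesystem destination default; the layout value and the expected outputs are assumptions, not part of this reference:

```python
from dlt.destinations.path_utils import create_path, get_table_prefix_layout

# assumed layout, modeled on the filesystem destination default
layout = "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}"

# full file path for one job file
print(create_path(layout, schema_name="github", table_name="issues",
                  load_id="1694343523.0", file_id="a1b2c3", ext="jsonl"))
# expected: github/issues/1694343523.0.a1b2c3.jsonl

# fragment ending at the table name, useful to prefix-list all files of one table
print(get_table_prefix_layout(layout))
# expected: {schema_name}/{table_name}/
```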
- diff --git a/docs/website/docs/api_reference/destinations/postgres/configuration.md b/docs/website/docs/api_reference/destinations/postgres/configuration.md deleted file mode 100644 index 83980ff50c..0000000000 --- a/docs/website/docs/api_reference/destinations/postgres/configuration.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.postgres.configuration ---- - -## PostgresCredentials Objects - -```python -@configspec -class PostgresCredentials(ConnectionStringCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/postgres/configuration.py#L13) - -#### drivername - -type: ignore - -## PostgresClientConfiguration Objects - -```python -@configspec -class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration - ) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/postgres/configuration.py#L38) - -#### destination\_name - -type: ignore - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/postgres/configuration.py#L44) - -Returns a fingerprint of host part of a connection string - diff --git a/docs/website/docs/api_reference/destinations/redshift/configuration.md b/docs/website/docs/api_reference/destinations/redshift/configuration.md deleted file mode 100644 index 88a0a7bdc3..0000000000 --- a/docs/website/docs/api_reference/destinations/redshift/configuration.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.redshift.configuration ---- - -## RedshiftClientConfiguration Objects - -```python -@configspec -class RedshiftClientConfiguration(PostgresClientConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/redshift/configuration.py#L19) - -#### destination\_name - -type: ignore - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/redshift/configuration.py#L24) - -Returns a fingerprint of host part of a connection string - diff --git a/docs/website/docs/api_reference/destinations/redshift/redshift.md b/docs/website/docs/api_reference/destinations/redshift/redshift.md deleted file mode 100644 index cacedd777f..0000000000 --- a/docs/website/docs/api_reference/destinations/redshift/redshift.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -sidebar_label: redshift -title: destinations.redshift.redshift ---- - -## RedshiftMergeJob Objects - -```python -class RedshiftMergeJob(SqlMergeJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/redshift/redshift.py#L146) - -#### gen\_key\_table\_clauses - -```python -@classmethod -def gen_key_table_clauses(cls, root_table_name: str, - staging_root_table_name: str, - key_clauses: Sequence[str], - for_delete: bool) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/redshift/redshift.py#L149) - -Generate sql clauses that may be used to select or delete rows in root table of destination dataset - -A list of clauses may be returned for engines that do not support OR in subqueries. 
Like BigQuery - -## RedshiftClient Objects - -```python -class RedshiftClient(InsertValuesJobClient) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/redshift/redshift.py#L159) - -#### start\_file\_load - -```python -def start_file_load(table: TTableSchema, file_path: str, - load_id: str) -> LoadJob -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/redshift/redshift.py#L180) - -Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs - diff --git a/docs/website/docs/api_reference/destinations/snowflake/configuration.md b/docs/website/docs/api_reference/destinations/snowflake/configuration.md deleted file mode 100644 index 462c9e789a..0000000000 --- a/docs/website/docs/api_reference/destinations/snowflake/configuration.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.snowflake.configuration ---- - -## SnowflakeCredentials Objects - -```python -@configspec -class SnowflakeCredentials(ConnectionStringCredentials) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/snowflake/configuration.py#L37) - -#### drivername - -type: ignore[misc] - -## SnowflakeClientConfiguration Objects - -```python -@configspec -class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration - ) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/snowflake/configuration.py#L87) - -#### destination\_name - -type: ignore[misc] - -#### stage\_name - -Use an existing named stage instead of the default. Default uses the implicit table stage per table - -#### keep\_staged\_files - -Whether to keep or delete the staged files after COPY INTO succeeds - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/snowflake/configuration.py#L96) - -Returns a fingerprint of host part of a connection string - diff --git a/docs/website/docs/api_reference/destinations/snowflake/sql_client.md b/docs/website/docs/api_reference/destinations/snowflake/sql_client.md deleted file mode 100644 index bb29847b32..0000000000 --- a/docs/website/docs/api_reference/destinations/snowflake/sql_client.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -sidebar_label: sql_client -title: destinations.snowflake.sql_client ---- - -## SnowflakeCursorImpl Objects - -```python -class SnowflakeCursorImpl(DBApiCursorImpl) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/snowflake/sql_client.py#L13) - -#### native\_cursor - -type: ignore[assignment] - diff --git a/docs/website/docs/api_reference/destinations/sql_client.md b/docs/website/docs/api_reference/destinations/sql_client.md deleted file mode 100644 index de1f3710e1..0000000000 --- a/docs/website/docs/api_reference/destinations/sql_client.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -sidebar_label: sql_client -title: destinations.sql_client ---- - -## SqlClientBase Objects - -```python -class SqlClientBase(ABC, Generic[TNativeConn]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_client.py#L15) - -#### execute\_fragments - -```python -def execute_fragments(fragments: Sequence[AnyStr], 
*args: Any, - **kwargs: Any) -> Optional[Sequence[Sequence[Any]]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_client.py#L92) - -Executes several SQL fragments as efficiently as possible to prevent data copying. Default implementation just joins the strings and executes them together. - -#### with\_alternative\_dataset\_name - -```python -@contextmanager -def with_alternative_dataset_name( - dataset_name: str) -> Iterator["SqlClientBase[TNativeConn]"] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_client.py#L112) - -Sets the `dataset_name` as the default dataset during the lifetime of the context. Does not modify any search paths in the existing connection. - -## DBApiCursorImpl Objects - -```python -class DBApiCursorImpl(DBApiCursor) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_client.py#L157) - -A DBApi Cursor wrapper with dataframes reading functionality - diff --git a/docs/website/docs/api_reference/destinations/sql_jobs.md b/docs/website/docs/api_reference/destinations/sql_jobs.md deleted file mode 100644 index 7e12654ec6..0000000000 --- a/docs/website/docs/api_reference/destinations/sql_jobs.md +++ /dev/null @@ -1,99 +0,0 @@ ---- -sidebar_label: sql_jobs -title: destinations.sql_jobs ---- - -## SqlBaseJob Objects - -```python -class SqlBaseJob(NewLoadJobImpl) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L15) - -Sql base job for jobs that rely on the whole tablechain - -#### from\_table\_chain - -```python -@classmethod -def from_table_chain(cls, table_chain: Sequence[TTableSchema], - sql_client: SqlClientBase[Any]) -> NewLoadJobImpl -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L20) - -Generates a list of sql statements, that will be executed by the sql client when the job is executed in the loader. - -The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). - -## SqlStagingCopyJob Objects - -```python -class SqlStagingCopyJob(SqlBaseJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L46) - -Generates a list of sql statements that copy the data from staging dataset into destination dataset. - -## SqlMergeJob Objects - -```python -class SqlMergeJob(SqlBaseJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L62) - -Generates a list of sql statements that merge the data from staging dataset into destination dataset. - -#### generate\_sql - -```python -@classmethod -def generate_sql(cls, table_chain: Sequence[TTableSchema], - sql_client: SqlClientBase[Any]) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L67) - -Generates a list of sql statements that merge the data in staging dataset with the data in destination dataset. - -The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
-The root table is merged using primary_key and merge_key hints which can be compound and be both specified. In that case the OR clause is generated. -The child tables are merged based on propagated `root_key` which is a type of foreign key but always leading to a root table. - -First we store the root_keys of root table elements to be deleted in the temp table. Then we use the temp table to delete records from root and all child tables in the destination dataset. -At the end we copy the data from the staging dataset into destination dataset. - -#### gen\_key\_table\_clauses - -```python -@classmethod -def gen_key_table_clauses(cls, root_table_name: str, - staging_root_table_name: str, - key_clauses: Sequence[str], - for_delete: bool) -> List[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L91) - -Generate sql clauses that may be used to select or delete rows in root table of destination dataset - -A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery - -#### gen\_delete\_temp\_table\_sql - -```python -@classmethod -def gen_delete_temp_table_sql( - cls, unique_column: str, - key_table_clauses: Sequence[str]) -> Tuple[List[str], str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/sql_jobs.py#L99) - -Generate sql that creates delete temp table and inserts `unique_column` from root table for all records to delete. May return several statements. - -Returns temp table name for cases where special names are required like SQLServer. - diff --git a/docs/website/docs/api_reference/destinations/typing.md b/docs/website/docs/api_reference/destinations/typing.md deleted file mode 100644 index 0eaf0bc45a..0000000000 --- a/docs/website/docs/api_reference/destinations/typing.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -sidebar_label: typing -title: destinations.typing ---- - -## DBApiCursor Objects - -```python -class DBApiCursor(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/typing.py#L24) - -Protocol for DBAPI cursor - -#### native\_cursor - -Cursor implementation native to current destination - -#### df - -```python -def df(chunk_size: int = None, **kwargs: None) -> Optional[DataFrame] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/typing.py#L42) - -Fetches the results as data frame. For large queries the results may be chunked - -Fetches the results into a data frame. The default implementation uses helpers in `pandas.io.sql` to generate Pandas data frame. -This function will try to use native data frame generation for particular destination. For `BigQuery`: `QueryJob.to_dataframe` is used. -For `duckdb`: `DuckDBPyConnection.df' - -**Arguments**: - -- `chunk_size` _int, optional_ - Will chunk the results into several data frames. Defaults to None -- `**kwargs` _Any_ - Additional parameters which will be passed to native data frame generation function. - - -**Returns**: - -- `Optional[DataFrame]` - A data frame with query results. 
If chunk_size > 0, None will be returned if there is no more data in results - diff --git a/docs/website/docs/api_reference/destinations/weaviate/configuration.md b/docs/website/docs/api_reference/destinations/weaviate/configuration.md deleted file mode 100644 index cd27e2903b..0000000000 --- a/docs/website/docs/api_reference/destinations/weaviate/configuration.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -sidebar_label: configuration -title: destinations.weaviate.configuration ---- - -## WeaviateCredentials Objects - -```python -@configspec -class WeaviateCredentials(CredentialsConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/configuration.py#L14) - -#### \_\_str\_\_ - -```python -def __str__() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/configuration.py#L19) - -Used to display user friendly data location - -## WeaviateClientConfiguration Objects - -```python -@configspec -class WeaviateClientConfiguration(DestinationClientDwhConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/configuration.py#L26) - -#### destination\_name - -type: ignore - -#### dataset\_name - -type: ignore - -#### fingerprint - -```python -def fingerprint() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/configuration.py#L52) - -Returns a fingerprint of host part of a connection string - diff --git a/docs/website/docs/api_reference/destinations/weaviate/naming.md b/docs/website/docs/api_reference/destinations/weaviate/naming.md deleted file mode 100644 index dafd6fe693..0000000000 --- a/docs/website/docs/api_reference/destinations/weaviate/naming.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -sidebar_label: naming -title: destinations.weaviate.naming ---- - -## NamingConvention Objects - -```python -class NamingConvention(SnakeCaseNamingConvention) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/naming.py#L7) - -Normalizes identifiers according to Weaviate documentation: https://weaviate.io/developers/weaviate/config-refs/schema#class - -#### normalize\_identifier - -```python -def normalize_identifier(identifier: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/naming.py#L20) - -Normalizes Weaviate property name by removing not allowed characters, replacing them by _ and contracting multiple _ into single one -and lowercasing the first character. - -#### normalize\_table\_identifier - -```python -def normalize_table_identifier(identifier: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/naming.py#L34) - -Creates Weaviate class name. 
Runs property normalization and then creates capitalized case name by splitting on _ - -https://weaviate.io/developers/weaviate/configuration/schema-configuration#create-a-class - diff --git a/docs/website/docs/api_reference/destinations/weaviate/weaviate_adapter.md b/docs/website/docs/api_reference/destinations/weaviate/weaviate_adapter.md deleted file mode 100644 index 130a3835c6..0000000000 --- a/docs/website/docs/api_reference/destinations/weaviate/weaviate_adapter.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -sidebar_label: weaviate_adapter -title: destinations.weaviate.weaviate_adapter ---- - -#### TTokenizationSetting - -Maps column names to tokenization types supported by Weaviate - -#### weaviate\_adapter - -```python -def weaviate_adapter(data: Any, - vectorize: TColumnNames = None, - tokenization: TTokenizationSetting = None) -> DltResource -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_adapter.py#L16) - -Prepares data for the Weaviate destination by specifying which columns -should be vectorized and which tokenization method to use. - -Vectorization is done by Weaviate's vectorizer modules. The vectorizer module -can be configured in dlt configuration file under -`[destination.weaviate.vectorizer]` and `[destination.weaviate.module_config]`. -The default vectorizer module is `text2vec-openai`. See also: -https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules - -**Arguments**: - -- `data` _Any_ - The data to be transformed. It can be raw data or an instance - of DltResource. If raw data, the function wraps it into a DltResource - object. -- `vectorize` _TColumnNames, optional_ - Specifies columns that should be - vectorized. Can be a single column name as a string or a list of - column names. -- `tokenization` _TTokenizationSetting, optional_ - A dictionary mapping column - names to tokenization methods supported by Weaviate. The tokenization - methods are one of the values in `TOKENIZATION_METHODS`: - - 'word', - - 'lowercase', - - 'whitespace', - - 'field'. - - -**Returns**: - -- `DltResource` - A resource with applied Weaviate-specific hints. - - -**Raises**: - -- `ValueError` - If input for `vectorize` or `tokenization` is invalid - or neither is specified. - - -**Examples**: - - >>> data = [{"name": "Alice", "description": "Software developer"}] - >>> weaviate_adapter(data, vectorize="description", tokenization={"description": "word"}) - [DltResource with hints applied] - diff --git a/docs/website/docs/api_reference/destinations/weaviate/weaviate_client.md b/docs/website/docs/api_reference/destinations/weaviate/weaviate_client.md deleted file mode 100644 index 306cdf153f..0000000000 --- a/docs/website/docs/api_reference/destinations/weaviate/weaviate_client.md +++ /dev/null @@ -1,195 +0,0 @@ ---- -sidebar_label: weaviate_client -title: destinations.weaviate.weaviate_client ---- - -## LoadWeaviateJob Objects - -```python -class LoadWeaviateJob(LoadJob) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L139) - -#### load\_batch - -```python -@wrap_weaviate_error -def load_batch(f: IO[str]) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L170) - -Load all the lines from stream `f` in automatic Weaviate batches. -Weaviate batch supports retries so we do not need to do that. 
- -## WeaviateClient Objects - -```python -class WeaviateClient(JobClientBase, WithStateSync) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L235) - -Weaviate client implementation. - -#### make\_qualified\_class\_name - -```python -def make_qualified_class_name(table_name: str) -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L271) - -Make a full Weaviate class name from a table name by prepending -the dataset name if it exists. - -#### get\_class\_schema - -```python -def get_class_schema(table_name: str) -> Dict[str, Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L283) - -Get the Weaviate class schema for a table. - -#### create\_class - -```python -def create_class(class_schema: Dict[str, Any], - full_class_name: Optional[str] = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L289) - -Create a Weaviate class. - -**Arguments**: - -- `class_schema` - The class schema to create. -- `full_class_name` - The full name of the class to create. If not - provided, the class name will be prepended with the dataset name - if it exists. - -#### create\_class\_property - -```python -def create_class_property(class_name: str, prop_schema: Dict[str, - Any]) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L310) - -Create a Weaviate class property. - -**Arguments**: - -- `class_name` - The name of the class to create the property on. -- `prop_schema` - The property schema to create. - -#### delete\_class - -```python -def delete_class(class_name: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L323) - -Delete a Weaviate class. - -**Arguments**: - -- `class_name` - The name of the class to delete. - -#### delete\_all\_classes - -```python -def delete_all_classes() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L331) - -Delete all Weaviate classes from Weaviate instance and all data -associated with it. - -#### query\_class - -```python -def query_class(class_name: str, properties: List[str]) -> GetBuilder -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L337) - -Query a Weaviate class. - -**Arguments**: - -- `class_name` - The name of the class to query. -- `properties` - The properties to return. - - -**Returns**: - - A Weaviate query builder. - -#### create\_object - -```python -def create_object(obj: Dict[str, Any], class_name: str) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L349) - -Create a Weaviate object. - -**Arguments**: - -- `obj` - The object to create. -- `class_name` - The name of the class to create the object on. 
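
A rough sketch of how these class-level helpers might be combined, assuming `client` is an already configured `WeaviateClient` instance and that the class and property names are purely illustrative:

```python
# assumed: `client` is a configured WeaviateClient bound to a schema and dataset
qualified_name = client.make_qualified_class_name("document")
# the dataset name is prepended to the class name when one is configured
print(qualified_name)

# insert a single object into the class
client.create_object({"title": "Hello", "snippet": "a test chunk"}, "document")

# query_class returns a weaviate GetBuilder, so the usual builder chain applies
response = client.query_class("document", ["title", "snippet"]).with_limit(5).do()
```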
- -#### drop\_storage - -```python -def drop_storage() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L358) - -Drop the dataset from Weaviate instance. - -Deletes all classes in the dataset and all data associated with them. -Deletes the sentinel class as well. - -If dataset name was not provided, it deletes all the tables in the current schema - -#### get\_stored\_state - -```python -def get_stored_state(pipeline_name: str) -> Optional[StateInfo] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L481) - -Loads compressed state from destination storage - -#### get\_stored\_schema - -```python -def get_stored_schema() -> Optional[StorageSchemaInfo] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L525) - -Retrieves newest schema from destination storage - -#### make\_weaviate\_class\_schema - -```python -def make_weaviate_class_schema(table_name: str) -> Dict[str, Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/destinations/weaviate/weaviate_client.py#L576) - -Creates a Weaviate class schema from a table schema. - diff --git a/docs/website/docs/api_reference/extract/decorators.md b/docs/website/docs/api_reference/extract/decorators.md deleted file mode 100644 index 0134d7e4ad..0000000000 --- a/docs/website/docs/api_reference/extract/decorators.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -sidebar_label: decorators -title: extract.decorators ---- - -## SourceSchemaInjectableContext Objects - -```python -@configspec -class SourceSchemaInjectableContext(ContainerInjectableContext) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/decorators.py#L32) - -A context containing the source schema, present when decorated function is executed - -#### source - -```python -def source(func: Optional[AnyFun] = None, - name: str = None, - section: str = None, - max_table_nesting: int = None, - root_key: bool = False, - schema: Schema = None, - spec: Type[BaseConfiguration] = None) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/decorators.py#L72) - -A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`. - -### Summary -A `dlt source` is a logical grouping of resources that are often extracted and loaded together. A source is associated with a schema, which describes the structure of the loaded data and provides instructions how to load it. -Such schema contains table schemas that describe the structure of the data coming from the resources. - -Please refer to https://dlthub.com/docs/general-usage/source for a complete documentation. - -### Passing credentials -Another important function of the source decorator is to provide credentials and other configuration to the code that extracts data. The decorator may automatically bind the source function arguments to the secret and config values. 
->>> @dlt.source ->>> def chess(username, chess_url: str = dlt.config.value, api_secret = dlt.secrets.value, title: str = "GM"): ->>> return user_profile(username, chess_url, api_secret), user_games(username, chess_url, api_secret, with_titles=title) ->>> ->>> list(chess("magnuscarlsen")) - -Here `username` is a required, explicit python argument, `chess_url` is a required argument, that if not explicitly passed will be taken from configuration ie. `config.toml`, `api_secret` is a required argument, that if not explicitly passed will be taken from dlt secrets ie. `secrets.toml`. -See https://dlthub.com/docs/general-usage/credentials for details. - -### Args: -func: A function that returns a dlt resource or a list of those or a list of any data items that can be loaded by `dlt`. - -name (str, optional): A name of the source which is also the name of the associated schema. If not present, the function name will be used. - -section (str, optional): A name of configuration. If not present, the current python module name will be used. - -max_table_nesting (int, optional): A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. - -root_key (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. - -schema (Schema, optional): An explicit `Schema` instance to be associated with the source. If not present, `dlt` creates a new `Schema` object with provided `name`. If such `Schema` already exists in the same folder as the module containing the decorated function, such schema will be loaded from file. - -spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. - -**Returns**: - - `DltSource` instance - -#### resource - -```python -def resource(data: Optional[Any] = None, - name: str = None, - table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, - columns: TTableHintTemplate[TAnySchemaColumns] = None, - primary_key: TTableHintTemplate[TColumnNames] = None, - merge_key: TTableHintTemplate[TColumnNames] = None, - selected: bool = True, - spec: Type[BaseConfiguration] = None, - depends_on: TUnboundDltResource = None) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/decorators.py#L239) - -When used as a decorator, transforms any generator (yielding) function into a `dlt resource`. When used as a function, it transforms data in `data` argument into a `dlt resource`. - -### Summary -A `resource`is a location within a `source` that holds the data with specific structure (schema) or coming from specific origin. A resource may be a rest API endpoint, table in the database or a tab in Google Sheets. -A `dlt resource` is python representation of a `resource` that combines both data and metadata (table schema) that describes the structure and instructs the loading of the data. -A `dlt resource` is also an `Iterable` and can used like any other iterable object ie. list or tuple. - -Please refer to https://dlthub.com/docs/general-usage/resource for a complete documentation. - -### Passing credentials -If used as a decorator (`data` argument is a `Generator`), it may automatically bind the source function arguments to the secret and config values. 
->>> @dlt.resource ->>> def user_games(username, chess_url: str = dlt.config.value, api_secret = dlt.secrets.value): ->>> return requests.get("%s/games/%s" % (chess_url, username), headers={"Authorization": f"Bearer {api_secret}"}) ->>> ->>> list(user_games("magnuscarlsen")) - -Here `username` is a required, explicit python argument, `chess_url` is a required argument, that if not explicitly passed will be taken from configuration ie. `config.toml`, `api_secret` is a required argument, that if not explicitly passed will be taken from dlt secrets ie. `secrets.toml`. -See https://dlthub.com/docs/general-usage/credentials for details. -Note that if decorated function is an inner function, passing of the credentials will be disabled. - -### Args: -data (Callable | Any, optional): a function to be decorated or a data compatible with `dlt` `run`. - -name (str, optional): A name of the resource that by default also becomes the name of the table to which the data is loaded. -If not present, the name of the decorated function will be used. - -table_name (TTableHintTemplate[str], optional): An table name, if different from `name`. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -columns (Sequence[TAnySchemaColumns], optional): A list, dict or pydantic model of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -primary_key (str | Sequence[str]): A column name or a list of column names that comprise a private key. Typically used with "merge" write disposition to deduplicate loaded data. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. - -spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. - -depends_on (TUnboundDltResource, optional): Allows to pipe data from one resource to another to build multi-step pipelines. - -### Raises -ResourceNameMissing: indicates that name of the resource cannot be inferred from the `data` being passed. 
-InvalidResourceDataType: indicates that the `data` argument cannot be converted into `dlt resource` - -**Returns**: - - DltResource instance which may be loaded, iterated or combined with other resources into a pipeline. - -#### transformer - -```python -def transformer( - f: Optional[Callable[Concatenate[TDataItem, TResourceFunParams], - Any]] = None, - data_from: TUnboundDltResource = DltResource.Empty, - name: str = None, - table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, - columns: TTableHintTemplate[TAnySchemaColumns] = None, - primary_key: TTableHintTemplate[TColumnNames] = None, - merge_key: TTableHintTemplate[TColumnNames] = None, - selected: bool = True, - spec: Type[BaseConfiguration] = None -) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], - Callable[TResourceFunParams, DltResource]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/decorators.py#L409) - -A form of `dlt resource` that takes input from other resources via `data_from` argument in order to enrich or transform the data. - -The decorated function `f` must take at least one argument of type TDataItems (a single item or list of items depending on the resource `data_from`). `dlt` will pass -metadata associated with the data item if argument with name `meta` is present. Otherwise, transformer function may take more arguments and be parametrized -like the resources. - -You can bind the transformer early by specifying resource in `data_from` when the transformer is created or create dynamic bindings later with | operator -which is demonstrated in example below: - -### Example ->>> @dlt.resource ->>> def players(title, chess_url=dlt.config.value): ->>> r = requests.get(f"{chess_url}titled/{title}") ->>> yield r.json()["players"] # returns list of player names ->>> ->>> # this resource takes data from players and returns profiles ->>> @dlt.transformer(write_disposition="replace") ->>> def player_profile(player: Any) -> Iterator[TDataItems]: ->>> r = requests.get(f"{chess_url}player/{player}") ->>> r.raise_for_status() ->>> yield r.json() ->>> ->>> # pipes the data from players into player profile to produce a list of player profiles ->>> list(players("GM") | player_profile) - -### Args: -f: (Callable): a function taking minimum one argument of TDataItems type which will receive data yielded from `data_from` resource. - -data_from (Callable | Any, optional): a resource that will send data to the decorated function `f` - -name (str, optional): A name of the resource that by default also becomes the name of the table to which the data is loaded. -If not present, the name of the decorated function will be used. - -table_name (TTableHintTemplate[str], optional): An table name, if different from `name`. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. 
- -columns (Sequence[TAnySchemaColumns], optional): A list, dict or pydantic model of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -primary_key (str | Sequence[str]): A column name or a list of column names that comprise a private key. Typically used with "merge" write disposition to deduplicate loaded data. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. -This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - -selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. - -spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. - -#### get\_source\_schema - -```python -def get_source_schema() -> Schema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/decorators.py#L513) - -When executed from the function decorated with @dlt.source, returns a writable source Schema - diff --git a/docs/website/docs/api_reference/extract/incremental.md b/docs/website/docs/api_reference/extract/incremental.md deleted file mode 100644 index cd97fff39c..0000000000 --- a/docs/website/docs/api_reference/extract/incremental.md +++ /dev/null @@ -1,161 +0,0 @@ ---- -sidebar_label: incremental -title: extract.incremental ---- - -## Incremental Objects - -```python -@configspec -class Incremental(FilterItem, BaseConfiguration, Generic[TCursorValue]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L52) - -Adds incremental extraction for a resource by storing a cursor value in persistent state. - -The cursor could for example be a timestamp for when the record was created and you can use this to load only -new records created since the last run of the pipeline. - -To use this the resource function should have an argument either type annotated with `Incremental` or a default `Incremental` instance. -For example: - ->>> @dlt.resource(primary_key='id') ->>> def some_data(created_at=dlt.sources.incremental('created_at', '2023-01-01T00:00:00Z'): ->>> yield from request_data(created_after=created_at.last_value) - -When the resource has a `primary_key` specified this is used to deduplicate overlapping items with the same cursor value. - -Alternatively you can use this class as transform step and add it to any resource. For example: ->>> @dlt.resource ->>> def some_data(): ->>> last_value = dlt.sources.incremental.from_existing_state("some_data", "item.ts") ->>> ... ->>> ->>> r = some_data().add_step(dlt.sources.incremental("item.ts", initial_value=now, primary_key="delta")) ->>> info = p.run(r, destination="duckdb") - -**Arguments**: - -- `cursor_path` - The name or a JSON path to an cursor field. Uses the same names of fields as in your JSON document, before they are normalized to store in the database. 
-- `initial_value` - Optional value used for `last_value` when no state is available, e.g. on the first run of the pipeline. If not provided `last_value` will be `None` on the first run. -- `last_value_func` - Callable used to determine which cursor value to save in state. It is called with a list of the stored state value and all cursor vals from currently processing items. Default is `max` -- `primary_key` - Optional primary key used to deduplicate data. If not provided, a primary key defined by the resource will be used. Pass a tuple to define a compound key. Pass empty tuple to disable unique checks -- `end_value` - Optional value used to load a limited range of records between `initial_value` and `end_value`. - Use in conjunction with `initial_value`, e.g. load records from given month `incremental(initial_value="2022-01-01T00:00:00Z", end_value="2022-02-01T00:00:00Z")` - Note, when this is set the incremental filtering is stateless and `initial_value` always supersedes any previous incremental value in state. -- `allow_external_schedulers` - If set to True, allows dlt to look for external schedulers from which it will take "initial_value" and "end_value" resulting in loading only - specified range of data. Currently Airflow scheduler is detected: "data_interval_start" and "data_interval_end" are taken from the context and passed Incremental class. - The values passed explicitly to Incremental will be ignored. - Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded - -#### from\_existing\_state - -```python -@classmethod -def from_existing_state(cls, resource_name: str, - cursor_path: str) -> "Incremental[TCursorValue]" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L126) - -Create Incremental instance from existing state. - -#### merge - -```python -def merge(other: "Incremental[TCursorValue]") -> "Incremental[TCursorValue]" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L145) - -Create a new incremental instance which merges the two instances. -Only properties which are not `None` from `other` override the current instance properties. - -This supports use cases with partial overrides, such as: ->>> def my_resource(updated=incremental('updated', initial_value='1970-01-01')) ->>> ... ->>> ->>> my_resource(updated=incremental(initial_value='2023-01-01', end_value='2023-02-01')) - -#### get\_state - -```python -def get_state() -> IncrementalColumnState -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L200) - -Returns an Incremental state for a particular cursor column - -#### get\_incremental\_value\_type - -```python -def get_incremental_value_type() -> Type[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L304) - -Infers the type of incremental value from a class of an instance if those preserve the Generic arguments information. 
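
For example, a minimal sketch of a resource whose annotation preserves the Generic argument, so that the cursor value type can be inferred (the `updated_at` field and the sample rows are assumptions made for the example):

```python
import dlt


@dlt.resource(primary_key="id")
def issues(
    updated_at: dlt.sources.incremental[str] = dlt.sources.incremental(
        "updated_at", initial_value="2023-01-01T00:00:00Z"
    )
):
    rows = [
        {"id": 1, "updated_at": "2023-05-01T00:00:00Z"},
        {"id": 2, "updated_at": "2023-06-01T00:00:00Z"},
    ]
    # the resource uses last_value to skip rows that were already loaded;
    # the [str] parameter in the annotation is what carries the cursor value type
    yield from (r for r in rows if r["updated_at"] > (updated_at.last_value or ""))
```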
- -#### bind - -```python -def bind(pipe: SupportsPipe) -> "Incremental[TCursorValue]" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L360) - -Called by pipe just before evaluation - -## IncrementalResourceWrapper Objects - -```python -class IncrementalResourceWrapper(FilterItem) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L380) - -#### \_\_init\_\_ - -```python -def __init__( - resource_name: str, - primary_key: Optional[TTableHintTemplate[TColumnNames]] = None -) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L384) - -Creates a wrapper over a resource function that accepts Incremental instance in its argument to perform incremental loading. - -The wrapper delays instantiation of the Incremental to the moment of actual execution and is currently used by `dlt.resource` decorator. -The wrapper explicitly (via `resource_name`) parameter binds the Incremental state to a resource state. -Note that wrapper implements `FilterItem` transform interface and functions as a processing step in the before-mentioned resource pipe. - -**Arguments**: - -- `resource_name` _str_ - A name of resource to which the Incremental will be bound at execution -- `primary_key` _TTableHintTemplate[TColumnKey], optional_ - A primary key to be passed to Incremental Instance at execution. Defaults to None. - -#### wrap - -```python -def wrap(sig: inspect.Signature, func: TFun) -> TFun -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L415) - -Wrap the callable to inject an `Incremental` object configured for the resource. - -#### allow\_external\_schedulers - -```python -@property -def allow_external_schedulers() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/incremental.py#L466) - -Allows the Incremental instance to get its initial and end values from external schedulers like Airflow - diff --git a/docs/website/docs/api_reference/extract/pipe.md b/docs/website/docs/api_reference/extract/pipe.md deleted file mode 100644 index e1de6655fa..0000000000 --- a/docs/website/docs/api_reference/extract/pipe.md +++ /dev/null @@ -1,204 +0,0 @@ ---- -sidebar_label: pipe -title: extract.pipe ---- - -## ForkPipe Objects - -```python -class ForkPipe() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L71) - -#### \_\_init\_\_ - -```python -def __init__(pipe: "Pipe", step: int = -1, copy_on_fork: bool = False) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L72) - -A transformer that forks the `pipe` and sends the data items to forks added via `add_pipe` method. 
- -## Pipe Objects - -```python -class Pipe(SupportsPipe) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L97) - -#### is\_empty - -```python -@property -def is_empty() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L114) - -Checks if pipe contains any steps - -#### has\_parent - -```python -@property -def has_parent() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L119) - -Checks if pipe is connected to parent pipe from which it takes data items. Connected pipes are created from transformer resources - -#### is\_data\_bound - -```python -@property -def is_data_bound() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L124) - -Checks if pipe is bound to data and can be iterated. Pipe is bound if has a parent that is bound xor is not empty. - -#### gen - -```python -@property -def gen() -> TPipeStep -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L132) - -A data generating step - -#### find - -```python -def find(*step_type: AnyType) -> int -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L144) - -Finds a step with object of type `step_type` - -#### append\_step - -```python -def append_step(step: TPipeStep) -> "Pipe" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L166) - -Appends pipeline step. On first added step performs additional verification if step is a valid data generator - -#### insert\_step - -```python -def insert_step(step: TPipeStep, index: int) -> "Pipe" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L177) - -Inserts step at a given index in the pipeline. Allows prepending only for transformers - -#### remove\_step - -```python -def remove_step(index: int) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L193) - -Removes steps at a given index. Gen step cannot be removed - -#### replace\_gen - -```python -def replace_gen(gen: TPipeStep) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L201) - -Replaces data generating step. Assumes that you know what are you doing - -#### full\_pipe - -```python -def full_pipe() -> "Pipe" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L206) - -Creates a pipe that from the current and all the parent pipes. - -#### ensure\_gen\_bound - -```python -def ensure_gen_bound() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L226) - -Verifies that gen step is bound to data - -#### evaluate\_gen - -```python -def evaluate_gen() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L239) - -Lazily evaluate gen of the pipe when creating PipeIterator. 
Allows creating multiple use pipes from generator functions and lists - -#### bind\_gen - -```python -def bind_gen(*args: Any, **kwargs: Any) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L265) - -Finds and wraps with `args` + `kwargs` the callable generating step in the resource pipe and then replaces the pipe gen with the wrapped one - -## PipeIterator Objects - -```python -class PipeIterator(Iterator[PipeItem]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L421) - -#### clone\_pipes - -```python -@staticmethod -def clone_pipes(pipes: Sequence[Pipe]) -> List[Pipe] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L771) - -This will clone pipes and fix the parent/dependent references - -## ManagedPipeIterator Objects - -```python -class ManagedPipeIterator(PipeIterator) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L796) - -A version of the pipe iterator that gets closed automatically on an exception in _next_ - -#### set\_context - -```python -def set_context(ctx: List[ContainerInjectableContext]) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/pipe.py#L801) - -Sets list of injectable contexts that will be injected into Container for each call to __next__ - diff --git a/docs/website/docs/api_reference/extract/schema.md b/docs/website/docs/api_reference/extract/schema.md deleted file mode 100644 index a2444336d8..0000000000 --- a/docs/website/docs/api_reference/extract/schema.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -sidebar_label: schema -title: extract.schema ---- - -## DltResourceSchema Objects - -```python -class DltResourceSchema() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/schema.py#L29) - -#### table\_name - -```python -@property -def table_name() -> TTableHintTemplate[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/schema.py#L39) - -Get table name to which resource loads data. May return a callable. - -#### columns - -```python -@property -def columns() -> TTableHintTemplate[TTableSchemaColumns] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/schema.py#L60) - -Gets columns schema that can be modified in place - -#### compute\_table\_schema - -```python -def compute_table_schema(item: TDataItem = None) -> TPartialTableSchema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/schema.py#L66) - -Computes the table schema based on hints and column definitions passed during resource creation. 
`item` parameter is used to resolve table hints based on data - -#### apply\_hints - -```python -def apply_hints( - table_name: TTableHintTemplate[str] = None, - parent_table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, - columns: TTableHintTemplate[TAnySchemaColumns] = None, - primary_key: TTableHintTemplate[TColumnNames] = None, - merge_key: TTableHintTemplate[TColumnNames] = None, - incremental: Incremental[Any] = None) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/schema.py#L90) - -Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. - -This method accepts the same table hints arguments as `dlt.resource` decorator with the following additions. -Skip the argument or pass None to leave the existing hint. -Pass empty value (for particular type ie "" for a string) to remove hint - -parent_table_name (str, optional): A name of parent table if foreign relation is defined. Please note that if you use merge you must define `root_key` columns explicitly -incremental (Incremental, optional): Enables the incremental loading for a resource. - -Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using is to skip already loaded data. -In non-aware resources, `dlt` will filter out the loaded values, however the resource will yield all the values again. - diff --git a/docs/website/docs/api_reference/extract/source.md b/docs/website/docs/api_reference/extract/source.md deleted file mode 100644 index f2b9ceee76..0000000000 --- a/docs/website/docs/api_reference/extract/source.md +++ /dev/null @@ -1,487 +0,0 @@ ---- -sidebar_label: source -title: extract.source ---- - -#### with\_table\_name - -```python -def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L30) - -Marks `item` to be dispatched to table `table_name` when yielded from resource function. 
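
A minimal sketch of how `with_table_name` might be used to dispatch items from one resource to several tables (the `events` resource, the item shapes and the `duckdb` destination are assumptions made for the example):

```python
import dlt
from dlt.extract.source import with_table_name


@dlt.resource
def events():
    # each yielded item carries a marker that tells dlt which table it belongs to
    yield with_table_name({"user": "alice", "clicks": 3}, "clicks")
    yield with_table_name({"user": "alice", "page": "/home"}, "page_views")


# items land in the "clicks" and "page_views" tables instead of a single "events" table
pipeline = dlt.pipeline(pipeline_name="events_demo", destination="duckdb")
pipeline.run(events())
```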
- -## DltResource Objects - -```python -class DltResource(Iterable[TDataItem], DltResourceSchema) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L35) - -#### source\_name - -Name of the source that contains this instance of the source, set when added to DltResourcesDict - -#### name - -```python -@property -def name() -> str -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L111) - -Resource name inherited from the pipe - -#### is\_transformer - -```python -@property -def is_transformer() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L116) - -Checks if the resource is a transformer that takes data from another resource - -#### requires\_binding - -```python -@property -def requires_binding() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L121) - -Checks if resource has unbound parameters - -#### incremental - -```python -@property -def incremental() -> IncrementalResourceWrapper -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L130) - -Gets incremental transform if it is in the pipe - -#### pipe\_data\_from - -```python -def pipe_data_from(data_from: Union["DltResource", Pipe]) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L138) - -Replaces the parent in the transformer resource pipe from which the data is piped. - -#### add\_pipe - -```python -def add_pipe(data: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L147) - -Creates additional pipe for the resource from the specified data - -#### select\_tables - -```python -def select_tables(*table_names: Iterable[str]) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L152) - -For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. - -Both `with_table_name` marker and data-based (function) table name hints are supported. - -#### add\_map - -```python -def add_map(item_map: ItemTransformFunc[TDataItem], - insert_at: int = None) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L166) - -Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` - -`item_map` receives single data items, `dlt` will enumerate any lists of data items automatically - -**Arguments**: - -- `item_map` _ItemTransformFunc[TDataItem]_ - A function taking a single data item and optional meta argument. Returns transformed data item. -- `insert_at` _int, optional_ - At which step in pipe to insert the mapping. 
Defaults to None which inserts after last step - - -**Returns**: - -- `"DltResource"` - returns self - -#### add\_yield\_map - -```python -def add_yield_map(item_map: ItemTransformFunc[Iterator[TDataItem]], - insert_at: int = None) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L184) - -Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` - -`item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to -ie. pivot an item into sequence of rows. - -**Arguments**: - -- `item_map` _ItemTransformFunc[Iterator[TDataItem]]_ - A function taking a single data item and optional meta argument. Yields 0 or more data items. -- `insert_at` _int, optional_ - At which step in pipe to insert the generator. Defaults to None which inserts after last step - - -**Returns**: - -- `"DltResource"` - returns self - -#### add\_filter - -```python -def add_filter(item_filter: ItemTransformFunc[bool], - insert_at: int = None) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L203) - -Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` - -`item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically - -**Arguments**: - -- `item_filter` _ItemTransformFunc[bool]_ - A function taking a single data item and optional meta argument. Returns bool. If True, item is kept -- `insert_at` _int, optional_ - At which step in pipe to insert the filter. Defaults to None which inserts after last step - -**Returns**: - -- `"DltResource"` - returns self - -#### add\_limit - -```python -def add_limit(max_items: int) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L220) - -Adds a limit `max_items` to the resource pipe - -This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is -a no-op for transformers. Those should be limited by their input data. - -**Arguments**: - -- `max_items` _int_ - The maximum number of items to yield - -**Returns**: - -- `"DltResource"` - returns self - -#### bind - -```python -def bind(*args: Any, **kwargs: Any) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L276) - -Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators. - -#### state - -```python -@property -def state() -> StrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L301) - -Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available - -#### clone - -```python -def clone(clone_pipe: bool = True, keep_pipe_id: bool = True) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L306) - -Creates a deep copy of a current resource, optionally cloning also pipe. Note that name of a containing source will not be cloned. 
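
A minimal sketch of how the `add_map`, `add_filter` and `add_limit` steps described above can be chained on a resource (the `numbers` resource and the lambdas are assumptions made for the example):

```python
import dlt


@dlt.resource
def numbers():
    yield from range(100)


# each call adds a step to the resource pipe and returns the same resource,
# so the steps can be chained; add_limit stops the generator after 10 items
sample = (
    numbers()
    .add_map(lambda item: {"value": item, "square": item * item})
    .add_filter(lambda item: item["value"] % 2 == 0)
    .add_limit(10)
)

for row in sample:
    print(row)
```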
- -#### \_\_call\_\_ - -```python -def __call__(*args: Any, **kwargs: Any) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L314) - -Binds the parametrized resources to passed arguments. Creates and returns a bound resource. Generators and iterators are not evaluated. - -#### \_\_or\_\_ - -```python -def __or__(transform: Union["DltResource", AnyFun]) -> "DltResource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L321) - -Allows to pipe data from across resources and transform functions with | operator - -#### \_\_iter\_\_ - -```python -def __iter__() -> Iterator[TDataItem] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L335) - -Opens iterator that yields the data items from the resources in the same order as in Pipeline class. - -A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. - -## DltResourceDict Objects - -```python -class DltResourceDict(Dict[str, DltResource]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L445) - -#### selected - -```python -@property -def selected() -> Dict[str, DltResource] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L454) - -Returns a subset of all resources that will be extracted and loaded to the destination. - -#### extracted - -```python -@property -def extracted() -> Dict[str, DltResource] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L459) - -Returns a dictionary of all resources that will be extracted. That includes selected resources and all their parents. -For parents that are not added explicitly to the source, a mock resource object is created that holds the parent pipe and derives the table -schema from the child resource - -#### selected\_dag - -```python -@property -def selected_dag() -> List[Tuple[str, str]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L484) - -Returns a list of edges of directed acyclic graph of pipes and their parents in selected resources - -## DltSource Objects - -```python -class DltSource(Iterable[TDataItem]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L553) - -Groups several `dlt resources` under a single schema and allows to perform operations on them. - -### Summary -The instance of this class is created whenever you call the `dlt.source` decorated function. It automates several functions for you: -* You can pass this instance to `dlt` `run` method in order to load all data present in the `dlt resources`. -* You can select and deselect resources that you want to load via `with_resources` method -* You can access the resources (which are `DltResource` instances) as source attributes -* It implements `Iterable` interface so you can get all the data from the resources yourself and without dlt pipeline present. -* You can get the `schema` for the source and all the resources within it. -* You can use a `run` method to load the data with a default instance of dlt pipeline. 
-* You can get source read only state for the currently active Pipeline instance - -#### from\_data - -```python -@classmethod -def from_data(cls, name: str, section: str, schema: Schema, - data: Any) -> "DltSource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L583) - -Converts any `data` supported by `dlt` `run` method into `dlt source` with a name `section`.`name` and `schema` schema. - -#### max\_table\_nesting - -```python -@property -def max_table_nesting() -> int -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L600) - -A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. - -#### exhausted - -```python -@property -def exhausted() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L609) - -check all selected pipes wether one of them has started. if so, the source is exhausted. - -#### root\_key - -```python -@property -def root_key() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L619) - -Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge - -#### resources - -```python -@property -def resources() -> DltResourceDict -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L640) - -A dictionary of all resources present in the source, where the key is a resource name. - -#### selected\_resources - -```python -@property -def selected_resources() -> Dict[str, DltResource] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L645) - -A dictionary of all the resources that are selected to be loaded. - -#### discover\_schema - -```python -def discover_schema(item: TDataItem = None) -> Schema -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L657) - -Computes table schemas for all selected resources in the source and merges them with a copy of current source schema. If `item` is provided, -dynamic tables will be evaluated, otherwise those tables will be ignored. - -#### with\_resources - -```python -def with_resources(*resource_names: str) -> "DltSource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L670) - -A convenience method to select one of more resources to be loaded. Returns a clone of the original source with the specified resources selected. - -#### decompose - -```python -def decompose(strategy: TDecompositionStrategy) -> List["DltSource"] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L676) - -Decomposes source into a list of sources with a given strategy. 
- -"none" will return source as is -"scc" will decompose the dag of selected pipes and their parent into strongly connected components - -#### add\_limit - -```python -def add_limit(max_items: int) -> "DltSource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L693) - -Adds a limit `max_items` yielded from all selected resources in the source that are not transformers. - -This is useful for testing, debugging and generating sample datasets for experimentation. You can easily get your test dataset in a few minutes, when otherwise -you'd need to wait hours for the full loading to complete. - -**Arguments**: - -- `max_items` _int_ - The maximum number of items to yield - -**Returns**: - -- `"DltSource"` - returns self - -#### run - -```python -@property -def run() -> SupportsPipelineRun -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L709) - -A convenience method that will call `run` run on the currently active `dlt` pipeline. If pipeline instance is not found, one with default settings will be created. - -#### state - -```python -@property -def state() -> StrAny -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L715) - -Gets source-scoped state from the active pipeline. PipelineStateNotAvailable is raised if no pipeline is active - -#### clone - -```python -def clone() -> "DltSource" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L720) - -Creates a deep copy of the source where copies of schema, resources and pipes are created - -#### \_\_iter\_\_ - -```python -def __iter__() -> Iterator[TDataItem] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/source.py#L725) - -Opens iterator that yields the data items from all the resources within the source in the same order as in Pipeline class. - -A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. - -A source config section is injected to allow secrets/config injection as during regular extraction. - diff --git a/docs/website/docs/api_reference/extract/typing.md b/docs/website/docs/api_reference/extract/typing.md deleted file mode 100644 index e77bc9b0d4..0000000000 --- a/docs/website/docs/api_reference/extract/typing.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -sidebar_label: typing -title: extract.typing ---- - -## SupportsPipe Objects - -```python -class SupportsPipe(Protocol) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/typing.py#L38) - -A protocol with the core Pipe properties and operations - -#### name - -Pipe name which is inherited by a resource - -## ItemTransform Objects - -```python -class ItemTransform(ABC, Generic[TAny]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/typing.py#L48) - -#### \_\_call\_\_ - -```python -@abstractmethod -def __call__(item: TDataItems, meta: Any = None) -> Optional[TDataItems] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/typing.py#L65) - -Transforms `item` (a list of TDataItem or a single TDataItem) and returns or yields TDataItems. 
Returns None to consume item (filter out) - diff --git a/docs/website/docs/api_reference/extract/utils.md b/docs/website/docs/api_reference/extract/utils.md deleted file mode 100644 index b757b27ad1..0000000000 --- a/docs/website/docs/api_reference/extract/utils.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -sidebar_label: utils -title: extract.utils ---- - -#### resolve\_column\_value - -```python -def resolve_column_value(column_hint: TTableHintTemplate[TColumnNames], - item: TDataItem) -> Union[Any, List[Any]] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/utils.py#L15) - -Extract values from the data item given a column hint. -Returns either a single value or list of values when hint is a composite. - -#### ensure\_table\_schema\_columns - -```python -def ensure_table_schema_columns( - columns: TAnySchemaColumns) -> TTableSchemaColumns -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/utils.py#L25) - -Convert supported column schema types to a column dict which -can be used in resource schema. - -**Arguments**: - -- `columns` - A dict of column schemas, a list of column schemas, or a pydantic model - -#### ensure\_table\_schema\_columns\_hint - -```python -def ensure_table_schema_columns_hint( - columns: TTableHintTemplate[TAnySchemaColumns] -) -> TTableHintTemplate[TTableSchemaColumns] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/extract/utils.py#L48) - -Convert column schema hint to a hint returning `TTableSchemaColumns`. -A callable hint is wrapped in another function which converts the original result. - diff --git a/docs/website/docs/api_reference/helpers/airflow_helper.md b/docs/website/docs/api_reference/helpers/airflow_helper.md deleted file mode 100644 index 7f14b2ff4b..0000000000 --- a/docs/website/docs/api_reference/helpers/airflow_helper.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -sidebar_label: airflow_helper -title: helpers.airflow_helper ---- - -## PipelineTasksGroup Objects - -```python -class PipelineTasksGroup(TaskGroup) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/airflow_helper.py#L39) - -Represents a DLT Airflow pipeline task group. - -#### \_\_init\_\_ - -```python -def __init__(pipeline_name: str, - use_data_folder: bool = False, - local_data_folder: str = None, - use_task_logger: bool = True, - log_progress_period: float = 30.0, - buffer_max_items: int = 1000, - retry_policy: Retrying = DEFAULT_RETRY_NO_RETRY, - retry_pipeline_steps: Sequence[TPipelineStep] = ("load", ), - fail_task_if_any_job_failed: bool = True, - abort_task_if_any_job_failed: bool = False, - wipe_local_data: bool = True, - save_load_info: bool = False, - save_trace_info: bool = False, - **kwargs: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/airflow_helper.py#L44) - -Creates a task group to which you can add pipeline runs - -The run environment is prepared as follows -- the .dlt folder (the project folder) is searched under `dags` as configured by Airflow -- the data folder where pipelines are stored is always unique - -The `data_folder` is available in certain Airflow deployments. In case of Composer, it is a location on the gcs bucket. `use_data_folder` is disabled and should be -enabled only when needed. 
The operations on the bucket are non-atomic and much slower than on local storage, so they should be avoided.
-
-`fail_task_if_any_job_failed` will raise an exception if any of the loading jobs failed permanently and thus fail the current Airflow task.
-This happens **after all dlt loading jobs have executed**. See more here: https://dlthub.com/docs/running-in-production/running#failed-jobs
-
-`abort_task_if_any_job_failed` will abort the other dlt loading jobs and fail the Airflow task if any of the jobs failed. This may put your warehouse in an
-inconsistent state, so the option is disabled by default.
-
-The load info and trace info can be optionally saved to the destination. See https://dlthub.com/docs/running-in-production/running#inspect-and-save-the-load-info-and-trace
-
-**Arguments**:
-
-- `pipeline_name` _str_ - Name of the task group
-- `use_data_folder` _bool, optional_ - If a well-defined 'data' folder is present, it will be used. Currently only the data folder on Composer is supported. Defaults to False.
-- `local_data_folder` _str, optional_ - Path to a local folder on the worker machine where data will be stored. Used if `use_data_folder` is False or there is no well-defined data folder. Defaults to gettempdir.
-- `use_task_logger` _bool, optional_ - Will redirect the dlt logger into the task logger. Defaults to True.
-- `log_progress_period` _float, optional_ - If progress is not configured for a pipeline, the `log` progress is used with a given period. Set 0 to disable. Defaults to 30.0.
-- `buffer_max_items` _int, optional_ - Maximum number of buffered items. Use 0 to keep the dlt built-in limit. Defaults to 1000.
-- `retry_policy` __type_, optional_ - Tenacity retry policy. Defaults to no retry.
-- `retry_pipeline_steps` _Sequence[TPipelineStep], optional_ - Which pipeline steps are eligible for retry. Defaults to ("load", ).
-- `fail_task_if_any_job_failed` _bool, optional_ - Will fail a task if any of the dlt load jobs failed. Defaults to True.
-- `wipe_local_data` _bool, optional_ - Will wipe all the data created by the pipeline, also in case of exception. Defaults to True.
-- `save_load_info` _bool, optional_ - Will save extensive load info to the destination. Defaults to False.
-- `save_trace_info` _bool, optional_ - Will save trace info to the destination. Defaults to False.
-
-#### add\_run
-
-```python
-@with_telemetry("helper", "airflow_add_run", False, "decompose")
-def add_run(pipeline: Pipeline,
-            data: Any,
-            *,
-            decompose: Literal["none", "serialize"] = "none",
-            table_name: str = None,
-            write_disposition: TWriteDisposition = None,
-            **kwargs: Any) -> List[PythonOperator]
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/airflow_helper.py#L124)
-
-Creates a task or a group of tasks to run `data` with `pipeline`
-
-Creates an Airflow task that extracts, normalizes and loads `data` with the passed pipeline instance `pipeline`. If `data` is a source
-and `decompose` is `serialize`, it will decompose the source into disjoint connected components (isolated groups of resources) and execute them
-one after another as separate Airflow tasks. The decomposition makes sure that each resource or transformer is extracted only once. It preserves
-the order of resources declared in the source when creating the graph of tasks.
-
-The `kwargs` are passed as arguments to all Airflow task instances created.
- -**Arguments**: - -- `pipeline` _Pipeline_ - An instance of pipeline used to run the source -- `data` _Any_ - Any data supported by `run` method of the pipeline -- `decompose` _Literal["none", "serialize"], optional_ - A source decomposition strategy into Airflow tasks. Defaults to "none". -- `table_name` - (str): The name of the table to which the data should be loaded within the `dataset` -- `write_disposition` _TWriteDisposition, optional_ - Same as in `run` command. Defaults to None. - - -**Returns**: - -- `Any` - Airflow tasks created in order of creation - -#### add\_fun - -```python -def add_fun(f: Callable[..., Any], **kwargs: Any) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/airflow_helper.py#L257) - -Will execute a function `f` inside an Airflow task. It is up to the function to create pipeline and source(s) - diff --git a/docs/website/docs/api_reference/helpers/dbt/configuration.md b/docs/website/docs/api_reference/helpers/dbt/configuration.md deleted file mode 100644 index b325d3e909..0000000000 --- a/docs/website/docs/api_reference/helpers/dbt/configuration.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -sidebar_label: configuration -title: helpers.dbt.configuration ---- - -## DBTRunnerConfiguration Objects - -```python -@configspec -class DBTRunnerConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/dbt/configuration.py#L10) - -#### package\_repository\_ssh\_key - -the default is empty value which will disable custom SSH KEY - diff --git a/docs/website/docs/api_reference/helpers/dbt/runner.md b/docs/website/docs/api_reference/helpers/dbt/runner.md deleted file mode 100644 index 961e34817b..0000000000 --- a/docs/website/docs/api_reference/helpers/dbt/runner.md +++ /dev/null @@ -1,125 +0,0 @@ ---- -sidebar_label: runner -title: helpers.dbt.runner ---- - -## DBTPackageRunner Objects - -```python -class DBTPackageRunner() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/dbt/runner.py#L25) - -A Python wrapper over a dbt package - -The created wrapper minimizes the required effort to run `dbt` packages on datasets created with `dlt`. It clones the package repo and keeps it up to data, -shares the `dlt` destination credentials with `dbt` and allows the isolated execution with `venv` parameter. -The wrapper creates a `dbt` profile from a passed `dlt` credentials and executes the transformations in `source_dataset_name` schema. Additional configuration is -passed via DBTRunnerConfiguration instance - -#### ensure\_newest\_package - -```python -def ensure_newest_package() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/dbt/runner.py#L87) - -Clones or brings the dbt package at `package_location` up to date. - -#### run - -```python -def run(cmd_params: Sequence[str] = ("--fail-fast", ), - additional_vars: StrAny = None, - destination_dataset_name: str = None) -> Sequence[DBTNodeResult] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/dbt/runner.py#L137) - -Runs `dbt` package - -Executes `dbt run` on previously cloned package. - -**Arguments**: - -- `run_params` _Sequence[str], optional_ - Additional parameters to `run` command ie. `full-refresh`. Defaults to ("--fail-fast", ). 
-- `additional_vars` _StrAny, optional_ - Additional jinja variables to be passed to the package. Defaults to None. -- `destination_dataset_name` _str, optional_ - Overwrites the dbt schema where transformed models will be created. Useful for testing or creating several copies of transformed data . Defaults to None. - - -**Returns**: - -- `Sequence[DBTNodeResult]` - A list of processed model with names, statuses, execution messages and execution times - - Exceptions: -- `DBTProcessingError` - `run` command failed. Contains a list of models with their execution statuses and error messages - -#### test - -```python -def test(cmd_params: Sequence[str] = None, - additional_vars: StrAny = None, - destination_dataset_name: str = None) -> Sequence[DBTNodeResult] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/dbt/runner.py#L159) - -Tests `dbt` package - -Executes `dbt test` on previously cloned package. - -**Arguments**: - -- `run_params` _Sequence[str], optional_ - Additional parameters to `test` command ie. test selectors`. -- `additional_vars` _StrAny, optional_ - Additional jinja variables to be passed to the package. Defaults to None. -- `destination_dataset_name` _str, optional_ - Overwrites the dbt schema where transformed models will be created. Useful for testing or creating several copies of transformed data . Defaults to None. - - -**Returns**: - -- `Sequence[DBTNodeResult]` - A list of executed tests with names, statuses, execution messages and execution times - - Exceptions: -- `DBTProcessingError` - `test` command failed. Contains a list of models with their execution statuses and error messages - -#### run\_all - -```python -def run_all(run_params: Sequence[str] = ("--fail-fast", ), - additional_vars: StrAny = None, - source_tests_selector: str = None, - destination_dataset_name: str = None) -> Sequence[DBTNodeResult] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/dbt/runner.py#L212) - -Prepares and runs a dbt package. - -This method executes typical `dbt` workflow with following steps: -1. First it clones the package or brings it up to date with the origin. If package location is a local path, it stays intact -2. It installs the dependencies (`dbt deps`) -3. It runs seed (`dbt seed`) -4. It runs optional tests on the sources -5. It runs the package (`dbt run`) -6. If the `dbt` fails with "incremental model out of sync", it will retry with full-refresh on (only when `auto_full_refresh_when_out_of_sync` is set). -See https://docs.getdbt.com/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change - -**Arguments**: - -- `run_params` _Sequence[str], optional_ - Additional parameters to `run` command ie. `full-refresh`. Defaults to ("--fail-fast", ). -- `additional_vars` _StrAny, optional_ - Additional jinja variables to be passed to the package. Defaults to None. -- `source_tests_selector` _str, optional_ - A source tests selector ie. will execute all tests from `sources` model. Defaults to None. -- `destination_dataset_name` _str, optional_ - Overwrites the dbt schema where transformed models will be created. Useful for testing or creating several copies of transformed data . Defaults to None. - - -**Returns**: - -- `Sequence[DBTNodeResult]` - A list of processed model with names, statuses, execution messages and execution times - - Exceptions: -- `DBTProcessingError` - any of the dbt commands failed. 
Contains a list of models with their execution statuses and error messages -- `PrerequisitesException` - the source tests failed -- `IncrementalSchemaOutOfSyncError` - `run` failed due to schema being out of sync. the DBTProcessingError with failed model is in `args[0]` - diff --git a/docs/website/docs/api_reference/helpers/pandas_helper.md b/docs/website/docs/api_reference/helpers/pandas_helper.md deleted file mode 100644 index be08318ecf..0000000000 --- a/docs/website/docs/api_reference/helpers/pandas_helper.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -sidebar_label: pandas_helper -title: helpers.pandas_helper ---- - -#### query\_results\_to\_df - -```python -@deprecated( - reason="Use `df` method on cursor returned from client.execute_query") -def query_results_to_df(client: SqlClientBase[Any], - query: str, - index_col: Any = None, - coerce_float: bool = True, - parse_dates: Any = None, - dtype: Any = None) -> pd.DataFrame -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/pandas_helper.py#L16) - -A helper function that executes a query in the destination and returns the result as Pandas `DataFrame` - -This method reuses `read_sql` method of `Pandas` with the sql client obtained from `Pipeline.sql_client` method. - -Parameters ----------- -client (SqlClientBase[Any]): Sql Client instance -query (str): Query to be executed -index_col str or list of str, optional, default: None - Column(s) to set as index(MultiIndex). -coerce_float (bool, optional): default: True - Attempts to convert values of non-string, non-numeric objects (like - decimal.Decimal) to floating point. Useful for SQL result sets. -parse_dates : list or dict, default: None - - List of column names to parse as dates. - - Dict of ``{column_name: format string}`` where format string is - strftime compatible in case of parsing string times, or is one of - (D, s, ns, ms, us) in case of parsing integer timestamps. - - Dict of ``{column_name: arg dict}``, where the arg dict corresponds - to the keyword arguments of :func:`pandas.to_datetime` - Especially useful with databases without native Datetime support, - such as SQLite. -dtype : Type name or dict of columns - Data type for data or columns. E.g. np.float64 or - {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’}. - -Returns -------- -DataFrame with the query results - diff --git a/docs/website/docs/api_reference/helpers/streamlit_helper.md b/docs/website/docs/api_reference/helpers/streamlit_helper.md deleted file mode 100644 index 387f8b99ce..0000000000 --- a/docs/website/docs/api_reference/helpers/streamlit_helper.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -sidebar_label: streamlit_helper -title: helpers.streamlit_helper ---- - -#### write\_load\_status\_page - -```python -def write_load_status_page(pipeline: Pipeline) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/streamlit_helper.py#L96) - -Display pipeline loading information. Will be moved to dlt package once tested - -#### write\_data\_explorer\_page - -```python -def write_data_explorer_page(pipeline: Pipeline, - schema_name: str = None, - show_dlt_tables: bool = False, - example_query: str = "", - show_charts: bool = True) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/helpers/streamlit_helper.py#L205) - -Writes Streamlit app page with a schema and live data preview. - -### Args: -pipeline (Pipeline): Pipeline instance to use. 
-schema_name (str, optional): Name of the schema to display. If None, default schema is used. -show_dlt_tables (bool, optional): Should show DLT internal tables. Defaults to False. -example_query (str, optional): Example query to be displayed in the SQL Query box. -show_charts (bool, optional): Should automatically show charts for the queries from SQL Query box. Defaults to True. - -**Raises**: - -- `MissingDependencyException` - Raised when a particular python dependency is not installed - diff --git a/docs/website/docs/api_reference/load/configuration.md b/docs/website/docs/api_reference/load/configuration.md deleted file mode 100644 index b2d486e91a..0000000000 --- a/docs/website/docs/api_reference/load/configuration.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -sidebar_label: configuration -title: load.configuration ---- - -## LoaderConfiguration Objects - -```python -@configspec -class LoaderConfiguration(PoolRunnerConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/load/configuration.py#L9) - -#### workers - -how many parallel loads can be executed - -#### pool\_type - -mostly i/o (upload) so may be thread pool - -#### raise\_on\_failed\_jobs - -when True, raises on terminally failed jobs immediately - -#### raise\_on\_max\_retries - -When gt 0 will raise when job reaches raise_on_max_retries - diff --git a/docs/website/docs/api_reference/load/load.md b/docs/website/docs/api_reference/load/load.md deleted file mode 100644 index ce60b6e2a7..0000000000 --- a/docs/website/docs/api_reference/load/load.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -sidebar_label: load -title: load.load ---- - -## Load Objects - -```python -class Load(Runnable[ThreadPool]) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/load/load.py#L32) - -#### maybe\_with\_staging\_dataset - -```python -@contextlib.contextmanager -def maybe_with_staging_dataset(job_client: JobClientBase, - table: TTableSchema) -> Iterator[None] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/load/load.py#L95) - -Executes job client methods in context of staging dataset if `table` has `write_disposition` that requires it - -#### get\_completed\_table\_chain - -```python -def get_completed_table_chain( - load_id: str, - schema: Schema, - top_merged_table: TTableSchema, - being_completed_job_id: str = None) -> List[TTableSchema] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/load/load.py#L184) - -Gets a table chain starting from the `top_merged_table` containing only tables with completed/failed jobs. 
None is returned if there's any job that is not completed - -Optionally `being_completed_job_id` can be passed that is considered to be completed before job itself moves in storage - -#### get\_table\_chain\_tables\_for\_write\_disposition - -```python -def get_table_chain_tables_for_write_disposition( - load_id: str, schema: Schema, - dispositions: List[TWriteDisposition]) -> Set[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/load/load.py#L281) - -Get all jobs for tables with given write disposition and resolve the table chain - diff --git a/docs/website/docs/api_reference/normalize/configuration.md b/docs/website/docs/api_reference/normalize/configuration.md deleted file mode 100644 index 5f5b8bc42f..0000000000 --- a/docs/website/docs/api_reference/normalize/configuration.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -sidebar_label: configuration -title: normalize.configuration ---- - -## NormalizeConfiguration Objects - -```python -@configspec -class NormalizeConfiguration(PoolRunnerConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/normalize/configuration.py#L10) - -#### destination\_capabilities - -injectable - diff --git a/docs/website/docs/api_reference/normalize/normalize.md b/docs/website/docs/api_reference/normalize/normalize.md deleted file mode 100644 index cbd0899183..0000000000 --- a/docs/website/docs/api_reference/normalize/normalize.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -sidebar_label: normalize -title: normalize.normalize ---- - -#### TMapFuncType - -input parameters: (schema name, load_id, list of files to process) - diff --git a/docs/website/docs/api_reference/pipeline/__init__.md b/docs/website/docs/api_reference/pipeline/__init__.md deleted file mode 100644 index 47bb14c76d..0000000000 --- a/docs/website/docs/api_reference/pipeline/__init__.md +++ /dev/null @@ -1,164 +0,0 @@ ---- -sidebar_label: pipeline -title: pipeline ---- - -#### pipeline - -```python -@overload -def pipeline(pipeline_name: str = None, - pipelines_dir: str = None, - pipeline_salt: TSecretValue = None, - destination: TDestinationReferenceArg = None, - staging: TDestinationReferenceArg = None, - dataset_name: str = None, - import_schema_path: str = None, - export_schema_path: str = None, - full_refresh: bool = False, - credentials: Any = None, - progress: TCollectorArg = _NULL_COLLECTOR) -> Pipeline -``` - -Creates a new instance of `dlt` pipeline, which moves the data from the source i.e. a REST API to a destination i.e. database or a data lake. - -Summary: -The `pipeline` functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data. -The created `Pipeline` object lets you load the data from any source with `run` method or to have more granular control over the loading process with `extract`, `normalize` and `load` methods. - -Please refer to the following doc pages: -- Write your first pipeline walkthrough: https://dlthub.com/docs/walkthroughs/create-a-pipeline -- Pipeline architecture and data loading steps: https://dlthub.com/docs/reference -- List of supported destinations: https://dlthub.com/docs/dlt-ecosystem/destinations - -**Arguments**: - -- `pipeline_name` _str, optional_ - A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs. 
-  Defaults to the file name of a pipeline script with `dlt_` prefix added.
-
-- `pipelines_dir` _str, optional_ - A working directory in which pipeline state and temporary files will be stored. Defaults to user home directory: `~/dlt/pipelines/`.
-
-- `pipeline_salt` _TSecretValue, optional_ - A random value used for deterministic hashing during data anonymization. Defaults to a value derived from the pipeline name.
-  The default value should not be used for any cryptographic purposes.
-
-- `destination` _str | DestinationReference, optional_ - A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`.
-  May also be provided to `run` method of the `pipeline`.
-
-- `staging` _str | DestinationReference, optional_ - A name of the destination where dlt will stage the data before final loading, or a destination module imported from `dlt.destination`.
-  May also be provided to `run` method of the `pipeline`.
-
-- `dataset_name` _str, optional_ - A name of the dataset to which the data will be loaded. A dataset is a logical group of tables i.e. `schema` in relational databases or folder grouping many files.
-  May also be provided later to the `run` or `load` methods of the `Pipeline`. If not provided at all, then defaults to the `pipeline_name`.
-
-- `import_schema_path` _str, optional_ - A path from which the schema `yaml` file will be imported on each pipeline run. Defaults to None which disables importing.
-
-- `export_schema_path` _str, optional_ - A path where the schema `yaml` file will be exported after every schema change. Defaults to None which disables exporting.
-
-- `full_refresh` _bool, optional_ - When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset.
-  The datasets are identified by `dataset_name_` + datetime suffix. Use this setting whenever you experiment with your data to be sure you start fresh on each run. Defaults to False.
-
-- `credentials` _Any, optional_ - Credentials for the `destination` i.e. database connection string or a dictionary with Google cloud credentials.
-  In most cases should be set to None, which lets `dlt` use `secrets.toml` or environment variables to infer the right credentials values.
-
-- `progress` _str, Collector_ - A progress monitor that shows progress bars, console or log messages with current information on sources, resources, data items etc. processed in
-  the `extract`, `normalize` and `load` stages. Pass a string with a collector name or configure your own by choosing from `dlt.progress` module.
-  We support most of the progress libraries: try passing `tqdm`, `enlighten` or `alive_progress` or `log` to write to console/log.
-
-
-**Returns**:
-
-- `Pipeline` - An instance of `Pipeline` class. Please check the documentation of `run` method for information on what to do with it.
-
-#### pipeline
-
-```python
-@overload
-def pipeline() -> Pipeline
-```
-
-When called without any arguments, returns the recently created `Pipeline` instance.
-If not found, it creates a new instance with all the pipeline options set to defaults.
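To make the `pipeline()` factory arguments above concrete, here is a minimal usage sketch. The pipeline name, the `duckdb` destination, the `players` table and the sample records are illustrative assumptions, not values taken from this reference.

```python
import dlt

# create a named pipeline that loads into a local duckdb database
pipeline = dlt.pipeline(
    pipeline_name="chess_pipeline",  # used to restore state and schemas on later runs
    destination="duckdb",            # any supported destination name may be used here
    dataset_name="chess_data",       # logical group of tables at the destination
)

# run() extracts, normalizes and loads the iterable into the `players` table
load_info = pipeline.run(
    [{"id": 1, "name": "magnus"}, {"id": 2, "name": "hikaru"}],
    table_name="players",
    write_disposition="append",
)
print(load_info)
```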
- -#### attach - -```python -@with_config(spec=PipelineConfiguration, auto_pipeline_section=True) -def attach(pipeline_name: str = None, - pipelines_dir: str = None, - pipeline_salt: TSecretValue = None, - full_refresh: bool = False, - credentials: Any = None, - progress: TCollectorArg = _NULL_COLLECTOR, - **kwargs: Any) -> Pipeline -``` - -Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in the default directory. Requires that valid pipeline state exists in the working folder. - -#### run - -```python -def run(data: Any, - *, - destination: TDestinationReferenceArg = None, - staging: TDestinationReferenceArg = None, - dataset_name: str = None, - credentials: Any = None, - table_name: str = None, - write_disposition: TWriteDisposition = None, - columns: Sequence[TColumnSchema] = None, - schema: Schema = None) -> LoadInfo -``` - -Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. - -Summary: -This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (i.e. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. - -The data may be supplied in several forms: -- a `list` or `Iterable` of any JSON-serializable objects i.e. `dlt.run([1, 2, 3], table_name="numbers")` -- any `Iterator` or a function that yield (`Generator`) i.e. `dlt.run(range(1, 10), table_name="range")` -- a function or a list of functions decorated with @dlt.resource i.e. `dlt.run([chess_players(title="GM"), chess_games()])` -- a function or a list of functions decorated with @dlt.source. - -Please note that `dlt` deals with `bytes`, `datetime`, `decimal` and `uuid` objects, so you are free to load binary data or documents containing dates. - -Execution: -The `run` method will first use `sync_destination` method to synchronize pipeline state and schemas with the destination. You can disable this behavior with `restore_from_destination` configuration option. -Next, it will make sure that data from the previous is fully processed. If not, `run` method normalizes and loads pending data items. -Only then the new data from `data` argument is extracted, normalized and loaded. - -**Arguments**: - -- `data` _Any_ - Data to be loaded to destination. - -- `destination` _str | DestinationReference, optional_ - A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`. - If not provided, the value passed to `dlt.pipeline` will be used. - -- `dataset_name` _str, optional_ - A name of the dataset to which the data will be loaded. A dataset is a logical group of tables i.e. `schema` in relational databases or folder grouping many files. - If not provided, the value passed to `dlt.pipeline` will be used. If not provided at all, then default to the `pipeline_name` - -- `credentials` _Any, optional_ - Credentials for the `destination` i.e. database connection string or a dictionary with Google cloud credentials. - In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer the right credentials values. - -- `table_name` _str, optional_ - The name of the table to which the data should be loaded within the `dataset`. This argument is required for a `data` that is a list/Iterable or Iterator without `__name__` attribute. 
- The behavior of this argument depends on the type of the `data`: - * generator functions: the function name is used as table name, `table_name` overrides this default - * `@dlt.resource`: resource contains the full table schema, and that includes the table name. `table_name` will override this property. Use with care! - * `@dlt.source`: source contains several resources each with a table schema. `table_name` will override all table names within the source and load the data into a single table. - -- `write_disposition` _Literal["skip", "append", "replace", "merge"], optional_ - Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". - Please note that in case of `dlt.resource` the table schema value will be overwritten and in case of `dlt.source`, the values in all resources will be overwritten. - -- `columns` _Sequence[TColumnSchema], optional_ - A list of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. - -- `schema` _Schema, optional_ - An explicit `Schema` object in which all table schemas will be grouped. By default, `dlt` takes the schema from the source (if passed in `data` argument) or creates a default one itself. - - -**Raises**: - - PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. - -**Returns**: - -- `LoadInfo` - Information on loaded data including the list of package ids and failed job statuses. Please note that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. - diff --git a/docs/website/docs/api_reference/pipeline/configuration.md b/docs/website/docs/api_reference/pipeline/configuration.md deleted file mode 100644 index bc5d0a80a9..0000000000 --- a/docs/website/docs/api_reference/pipeline/configuration.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -sidebar_label: configuration -title: pipeline.configuration ---- - -## PipelineConfiguration Objects - -```python -@configspec -class PipelineConfiguration(BaseConfiguration) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/configuration.py#L11) - -#### restore\_from\_destination - -Enables the `run` method of the `Pipeline` object to restore the pipeline state and schemas from the destination - -#### enable\_runtime\_trace - -Enables the tracing. Tracing saves the execution trace locally and is required by `dlt deploy`. - -#### use\_single\_dataset - -Stores all schemas in single dataset. When False, each schema will get a separate dataset with `{dataset_name}_{schema_name} - -#### full\_refresh - -When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset. 
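As a usage illustration of the `full_refresh` option documented above, the sketch below creates a throwaway pipeline for experiments; the destination and names are assumptions.

```python
import dlt

# with full_refresh=True every run writes to a fresh, datetime-suffixed dataset,
# so experimental loads never overwrite previously loaded data
pipeline = dlt.pipeline(
    pipeline_name="experiments",
    destination="duckdb",    # assumed destination
    dataset_name="scratch",
    full_refresh=True,
)
pipeline.run([{"value": 1}, {"value": 2}], table_name="samples")
```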
- diff --git a/docs/website/docs/api_reference/pipeline/current.md b/docs/website/docs/api_reference/pipeline/current.md deleted file mode 100644 index 6474b72511..0000000000 --- a/docs/website/docs/api_reference/pipeline/current.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -sidebar_label: current -title: pipeline.current ---- - -Easy access to active pipelines, state, sources and schemas - -#### pipeline - -Alias for dlt.pipeline - -#### state - -Alias for dlt.state - diff --git a/docs/website/docs/api_reference/pipeline/dbt.md b/docs/website/docs/api_reference/pipeline/dbt.md deleted file mode 100644 index 663a5a9c80..0000000000 --- a/docs/website/docs/api_reference/pipeline/dbt.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -sidebar_label: dbt -title: pipeline.dbt ---- - -#### get\_venv - -```python -def get_venv(pipeline: Pipeline, - venv_path: str = "dbt", - dbt_version: str = _DEFAULT_DBT_VERSION) -> Venv -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/dbt.py#L13) - -Creates or restores a virtual environment in which the `dbt` packages are executed. - -The recommended way to execute dbt package is to use a separate virtual environment where only the dbt-core -and required destination dependencies are installed. This avoid dependency clashes with the user-installed libraries. -This method will create such environment at the location specified in `venv_path` and automatically install required dependencies -as required by `pipeline`. - -**Arguments**: - -- `pipeline` _Pipeline_ - A pipeline for which the required dbt dependencies are inferred -- `venv_path` _str, optional_ - A path where virtual environment is created or restored from. - If relative path is provided, the environment will be created within pipeline's working directory. Defaults to "dbt". -- `dbt_version` _str, optional_ - Version of dbt to be used. Exact version (ie. "1.2.4") or pip requirements string (ie. ">=1.1<1.5" may be provided). - - -**Returns**: - -- `Venv` - A Virtual Environment with dbt dependencies installed - -#### package - -```python -def package(pipeline: Pipeline, - package_location: str, - package_repository_branch: str = None, - package_repository_ssh_key: TSecretValue = TSecretValue(""), - auto_full_refresh_when_out_of_sync: bool = None, - venv: Venv = None) -> DBTPackageRunner -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/dbt.py#L44) - -Creates a Python wrapper over `dbt` package present at specified location, that allows to control it (ie. run and test) from Python code. - -The created wrapper minimizes the required effort to run `dbt` packages on datasets created with `dlt`. It clones the package repo and keeps it up to data, -shares the `dlt` destination credentials with `dbt` and allows the isolated execution with `venv` parameter. -The wrapper creates a `dbt` profile from `dlt` pipeline configuration. Specifically: -1. destination is used to infer correct dbt profile -2. destinations credentials are passed to dbt via environment variables -3. dataset_name is used to configure the dbt database schema - -**Arguments**: - -- `pipeline` _Pipeline_ - A pipeline containing destination, credentials and dataset_name used to configure the dbt package. -- `package_location` _str_ - A git repository url to be cloned or a local path where dbt package is present -- `package_repository_branch` _str, optional_ - A branch name, tag name or commit-id to check out. Defaults to None. 
-- `package_repository_ssh_key` _TSecretValue, optional_ - SSH key to be used to clone private repositories. Defaults to TSecretValue(""). -- `auto_full_refresh_when_out_of_sync` _bool, optional_ - If set to True (default), the wrapper will automatically fall back to full-refresh mode when schema is out of sync -- `See` - https://docs.getdbt.com/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change -- `venv` _Venv, optional_ - A virtual environment with required dbt dependencies. Defaults to None which will execute dbt package in current environment. - - -**Returns**: - -- `DBTPackageRunner` - A configured and authenticated Python `dbt` wrapper - diff --git a/docs/website/docs/api_reference/pipeline/helpers.md b/docs/website/docs/api_reference/pipeline/helpers.md deleted file mode 100644 index 988177e52e..0000000000 --- a/docs/website/docs/api_reference/pipeline/helpers.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -sidebar_label: helpers -title: pipeline.helpers ---- - -#### retry\_load - -```python -def retry_load(retry_on_pipeline_steps: Sequence[TPipelineStep] = ( - "load", )) -> Callable[[BaseException], bool] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/helpers.py#L19) - -A retry strategy for Tenacity that, with default setting, will repeat `load` step for all exceptions that are not terminal - -Use this condition with tenacity `retry_if_exception`. Terminal exceptions are exceptions that will not go away when operations is repeated. -Examples: missing configuration values, Authentication Errors, terminally failed jobs exceptions etc. - ->>> data = source(...) ->>> for attempt in Retrying(stop=stop_after_attempt(3), retry=retry_if_exception(retry_load(())), reraise=True): ->>> with attempt: ->>> p.run(data) - -**Arguments**: - -- `retry_on_pipeline_steps` _Tuple[TPipelineStep, ...], optional_ - which pipeline steps are allowed to be repeated. Default: "load" - diff --git a/docs/website/docs/api_reference/pipeline/pipeline.md b/docs/website/docs/api_reference/pipeline/pipeline.md deleted file mode 100644 index bf54e7a056..0000000000 --- a/docs/website/docs/api_reference/pipeline/pipeline.md +++ /dev/null @@ -1,418 +0,0 @@ ---- -sidebar_label: pipeline -title: pipeline.pipeline ---- - -## Pipeline Objects - -```python -class Pipeline(SupportsPipeline) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L149) - -#### pipeline\_name - -Name of the pipeline - -#### first\_run - -Indicates a first run of the pipeline, where run ends with successful loading of the data - -#### pipelines\_dir - -A directory where the pipelines' working directories are created - -#### working\_dir - -A working directory of the pipeline - -#### staging - -The destination reference which is ModuleType. 
`destination.__name__` returns the name string - -#### dataset\_name - -Name of the dataset to which pipeline will be loaded to - -#### is\_active - -Tells if instance is currently active and available via dlt.pipeline() - -#### \_\_init\_\_ - -```python -def __init__(pipeline_name: str, pipelines_dir: str, - pipeline_salt: TSecretValue, destination: DestinationReference, - staging: DestinationReference, dataset_name: str, - credentials: Any, import_schema_path: str, - export_schema_path: str, full_refresh: bool, progress: _Collector, - must_attach_to_local_pipeline: bool, - config: PipelineConfiguration, runtime: RunConfiguration) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L180) - -Initializes the Pipeline class which implements `dlt` pipeline. Please use `pipeline` function in `dlt` module to create a new Pipeline instance. - -#### drop - -```python -def drop() -> "Pipeline" -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L232) - -Deletes local pipeline state, schemas and any working files - -#### extract - -```python -@with_runtime_trace -@with_schemas_sync -@with_state_sync(may_extract_state=True) -@with_config_section((known_sections.EXTRACT, )) -def extract(data: Any, - *, - table_name: str = None, - parent_table_name: str = None, - write_disposition: TWriteDisposition = None, - columns: TAnySchemaColumns = None, - primary_key: TColumnNames = None, - schema: Schema = None, - max_parallel_items: int = None, - workers: int = None) -> ExtractInfo -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L258) - -Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. See `run` method for the arguments' description. - -#### normalize - -```python -@with_runtime_trace -@with_schemas_sync -@with_config_section((known_sections.NORMALIZE, )) -def normalize(workers: int = 1, - loader_file_format: TLoaderFileFormat = None) -> NormalizeInfo -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L297) - -Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. Requires `destination` to be known. 
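The granular steps documented here (`extract`, `normalize`, `load`) can also be run separately instead of calling `run`. A minimal sketch, with assumed pipeline, dataset and table names:

```python
import dlt

pipeline = dlt.pipeline(pipeline_name="granular", destination="duckdb", dataset_name="demo")

# extract only needs the data; destination credentials are not required at this point
pipeline.extract([{"id": 1}, {"id": 2}], table_name="items")

# normalize infers the schema and builds load packages from the extracted files
pipeline.normalize()

# load pushes the prepared packages into the configured destination
load_info = pipeline.load()
print(load_info)
```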
- -#### load - -```python -@with_runtime_trace -@with_schemas_sync -@with_state_sync() -@with_config_section((known_sections.LOAD, )) -def load(destination: TDestinationReferenceArg = None, - dataset_name: str = None, - credentials: Any = None, - *, - workers: int = 20, - raise_on_failed_jobs: bool = False) -> LoadInfo -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L331) - -Loads the packages prepared by `normalize` method into the `dataset_name` at `destination`, using provided `credentials` - -#### run - -```python -@with_runtime_trace -@with_config_section(("run", )) -def run(data: Any = None, - *, - destination: TDestinationReferenceArg = None, - staging: TDestinationReferenceArg = None, - dataset_name: str = None, - credentials: Any = None, - table_name: str = None, - write_disposition: TWriteDisposition = None, - columns: TAnySchemaColumns = None, - primary_key: TColumnNames = None, - schema: Schema = None, - loader_file_format: TLoaderFileFormat = None) -> LoadInfo -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L379) - -Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. - -### Summary -This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (ie. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. - -The data may be supplied in several forms: -* a `list` or `Iterable` of any JSON-serializable objects ie. `dlt.run([1, 2, 3], table_name="numbers")` -* any `Iterator` or a function that yield (`Generator`) ie. `dlt.run(range(1, 10), table_name="range")` -* a function or a list of functions decorated with @dlt.resource ie. `dlt.run([chess_players(title="GM"), chess_games()])` -* a function or a list of functions decorated with @dlt.source. - -Please note that `dlt` deals with `bytes`, `datetime`, `decimal` and `uuid` objects so you are free to load documents containing ie. binary data or dates. - -### Execution -The `run` method will first use `sync_destination` method to synchronize pipeline state and schemas with the destination. You can disable this behavior with `restore_from_destination` configuration option. -Next it will make sure that data from the previous is fully processed. If not, `run` method normalizes, loads pending data items and **exits** -If there was no pending data, new data from `data` argument is extracted, normalized and loaded. - -### Args: -data (Any): Data to be loaded to destination - -destination (str | DestinationReference, optional): A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`. -If not provided, the value passed to `dlt.pipeline` will be used. - -dataset_name (str, optional):A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. `schema` in relational databases or folder grouping many files. -If not provided, the value passed to `dlt.pipeline` will be used. If not provided at all then defaults to the `pipeline_name` - - -credentials (Any, optional): Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. -In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. 
-
-table_name (str, optional): The name of the table to which the data should be loaded within the `dataset`. This argument is required for a `data` that is a list/Iterable or Iterator without `__name__` attribute.
-The behavior of this argument depends on the type of the `data`:
-* generator functions: the function name is used as table name, `table_name` overrides this default
-* `@dlt.resource`: resource contains the full table schema and that includes the table name. `table_name` will override this property. Use with care!
-* `@dlt.source`: source contains several resources each with a table schema. `table_name` will override all table names within the source and load the data into a single table.
-
-write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append".
-Please note that in case of `dlt.resource` the table schema value will be overwritten and in case of `dlt.source`, the values in all resources will be overwritten.
-
-columns (Sequence[TColumnSchema], optional): A list of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema.
-
-primary_key (str | Sequence[str]): A column name or a list of column names that comprise a primary key. Typically used with "merge" write disposition to deduplicate loaded data.
-
-schema (Schema, optional): An explicit `Schema` object in which all table schemas will be grouped. By default `dlt` takes the schema from the source (if passed in `data` argument) or creates a default one itself.
-
-loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional): The file format the loader will use to create the load package. Not all file formats are compatible with all destinations. Defaults to the preferred file format of the selected destination.
-
-### Raises:
-PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps.
-### Returns:
-LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please note that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo.
-
-#### sync\_destination
-
-```python
-@with_schemas_sync
-def sync_destination(destination: TDestinationReferenceArg = None,
-                     staging: TDestinationReferenceArg = None,
-                     dataset_name: str = None) -> None
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L475)
-
-Synchronizes pipeline state with the `destination`'s state kept in `dataset_name`
-
-### Summary
-Attempts to restore pipeline state and schemas from the destination. Requires the state that is present at the destination to have a higher version number than the state kept locally in the working directory.
-In such a situation the local state, schemas and intermediate files with the data will be deleted and replaced with the state and schema present in the destination.
-
-A special case where the pipeline state exists locally but the dataset does not exist at the destination will wipe out the local state.
-
-Note: this method is executed by the `run` method before any operation on data.
Use `restore_from_destination` configuration option to disable that behavior. - -#### activate - -```python -def activate() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L560) - -Activates the pipeline - -The active pipeline is used as a context for several `dlt` components. It provides state to sources and resources evaluated outside of -`pipeline.run` and `pipeline.extract` method. For example, if the source you use is accessing state in `dlt.source` decorated function, the state is provided -by active pipeline. - -The name of active pipeline is used when resolving secrets and config values as the optional most outer section during value lookup. For example if pipeline -with name `chess_pipeline` is active and `dlt` looks for `BigQuery` configuration, it will look in `chess_pipeline.destination.bigquery.credentials` first and then in -`destination.bigquery.credentials`. - -Active pipeline also provides the current DestinationCapabilitiesContext to other components ie. Schema instances. Among others, it sets the naming convention -and maximum identifier length. - -Only one pipeline is active at a given time. - -Pipeline created or attached with `dlt.pipeline`/'dlt.attach` is automatically activated. `run`, `load` and `extract` methods also activate pipeline. - -#### deactivate - -```python -def deactivate() -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L580) - -Deactivates the pipeline - -Pipeline must be active in order to use this method. Please refer to `activate()` method for the explanation of active pipeline concept. - -#### has\_data - -```python -@property -def has_data() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L590) - -Tells if the pipeline contains any data: schemas, extracted files, load packages or loaded packages in the destination - -#### has\_pending\_data - -```python -@property -def has_pending_data() -> bool -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L595) - -Tells if the pipeline contains any extracted files or pending load packages - -#### state - -```python -@property -def state() -> TPipelineState -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L608) - -Returns a dictionary with the pipeline state - -#### last\_trace - -```python -@property -def last_trace() -> PipelineTrace -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L613) - -Returns or loads last trace generated by pipeline. The trace is loaded from standard location. - -#### list\_extracted\_resources - -```python -def list_extracted_resources() -> Sequence[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L619) - -Returns a list of all the files with extracted resources that will be normalized. - -#### list\_normalized\_load\_packages - -```python -def list_normalized_load_packages() -> Sequence[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L623) - -Returns a list of all load packages ids that are or will be loaded. 
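A short sketch of the inspection properties and listing methods above, assuming a pipeline working folder already exists under the default location:

```python
import dlt

# attach to an existing working folder without running anything
pipeline = dlt.attach(pipeline_name="chess_pipeline")

print(pipeline.has_data)          # any schemas, extracted files or load packages?
print(pipeline.has_pending_data)  # extracted or normalized data not yet loaded?
print(pipeline.last_trace)        # trace of the most recent run, read from disk
print(pipeline.list_extracted_resources())
print(pipeline.list_normalized_load_packages())
```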
- -#### list\_completed\_load\_packages - -```python -def list_completed_load_packages() -> Sequence[str] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L627) - -Returns a list of all load package ids that are completely loaded - -#### get\_load\_package\_info - -```python -def get_load_package_info(load_id: str) -> LoadPackageInfo -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L631) - -Returns information on normalized/completed package with given load_id, all jobs and their statuses. - -#### list\_failed\_jobs\_in\_package - -```python -def list_failed_jobs_in_package(load_id: str) -> Sequence[LoadJobInfo] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L635) - -List all failed jobs and associated error messages for a specified `load_id` - -#### sync\_schema - -```python -@with_schemas_sync -def sync_schema(schema_name: str = None, - credentials: Any = None) -> TSchemaTables -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L640) - -Synchronizes the schema `schema_name` with the destination. If no name is provided, the default schema will be synchronized. - -#### set\_local\_state\_val - -```python -def set_local_state_val(key: str, value: Any) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L651) - -Sets value in local state. Local state is not synchronized with destination. - -#### get\_local\_state\_val - -```python -def get_local_state_val(key: str) -> Any -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L662) - -Gets value from local state. Local state is not synchronized with destination. - -#### sql\_client - -```python -def sql_client(schema_name: str = None, - credentials: Any = None) -> SqlClientBase[Any] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L671) - -Returns a sql client configured to query/change the destination and dataset that were used to load the data. -Use the client with `with` statement to manage opening and closing connection to the destination: ->>> with pipeline.sql_client() as client: ->>> with client.execute_query( ->>> "SELECT id, name, email FROM customers WHERE id = %s", 10 ->>> ) as cursor: ->>> print(cursor.fetchall()) - -The client is authenticated and defaults all queries to dataset_name used by the pipeline. You can provide alternative -`schema_name` which will be used to normalize dataset name and alternative `credentials`. - -#### destination\_client - -```python -def destination_client(schema_name: str = None, - credentials: Any = None) -> JobClientBase -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/pipeline.py#L693) - -Get the destination job client for the configured destination -Use the client with `with` statement to manage opening and closing connection to the destination: ->>> with pipeline.destination_client() as client: ->>> client.drop_storage() # removes storage which typically wipes all data in it - -The client is authenticated. 
You can provide an alternative `schema_name`, which will be used to normalize the dataset name, and alternative `credentials`.
-If no schema name is provided and no default schema is present in the pipeline, an ad hoc schema will be created and discarded after use.
-
diff --git a/docs/website/docs/api_reference/pipeline/trace.md b/docs/website/docs/api_reference/pipeline/trace.md
deleted file mode 100644
index 6758d7f487..0000000000
--- a/docs/website/docs/api_reference/pipeline/trace.md
+++ /dev/null
@@ -1,102 +0,0 @@
----
-sidebar_label: trace
-title: pipeline.trace
----
-
-## SerializableResolvedValueTrace Objects
-
-```python
-class SerializableResolvedValueTrace(NamedTuple)
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L26)
-
-Information on resolved secret and config values
-
-#### asdict
-
-```python
-def asdict() -> StrAny
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L36)
-
-A dictionary representation that is safe to load.
-
-## \_PipelineStepTrace Objects
-
-```python
-@dataclasses.dataclass(init=True)
-class _PipelineStepTrace()
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L48)
-
-#### step\_info
-
-A step outcome info, i.e. LoadInfo
-
-#### step\_exception
-
-For failing steps, contains the exception string
-
-## PipelineStepTrace Objects
-
-```python
-class PipelineStepTrace(_PipelineStepTrace)
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L80)
-
-Trace of a particular pipeline step; contains timing information and the step outcome info, or the exception if the step failed. Provides a custom asdict().
-
-#### asdict
-
-```python
-def asdict() -> DictStrAny
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L82)
-
-A dictionary representation of PipelineStepTrace that can be loaded with `dlt`
-
-## PipelineTrace Objects
-
-```python
-@dataclasses.dataclass(init=True)
-class PipelineTrace()
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L92)
-
-Pipeline runtime trace containing data on "extract", "normalize" and "load" steps and resolved config and secret values.
-
-#### steps
-
-A list of steps in the trace
-
-#### resolved\_config\_values
-
-A list of resolved config values
-
-#### merge\_traces
-
-```python
-def merge_traces(last_trace: PipelineTrace,
-                 new_trace: PipelineTrace) -> PipelineTrace
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L226)
-
-Merges `new_trace` into `last_trace` by combining steps and timestamps. `new_trace` replaces the `last_trace` if it has more than 1 step.
-
-#### describe\_extract\_data
-
-```python
-def describe_extract_data(data: Any) -> List[ExtractDataInfo]
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/trace.py#L255)
-
-Extracts source and resource names from data passed to extract
-
diff --git a/docs/website/docs/api_reference/pipeline/track.md b/docs/website/docs/api_reference/pipeline/track.md
deleted file mode 100644
index 12b7ee1089..0000000000
--- a/docs/website/docs/api_reference/pipeline/track.md
+++ /dev/null
@@ -1,18 +0,0 @@
----
-sidebar_label: track
-title: pipeline.track
----
-
-Implements SupportsTracking
-
-#### slack\_notify\_load\_success
-
-```python
-def slack_notify_load_success(incoming_hook: str, load_info: LoadInfo,
-                              trace: PipelineTrace) -> int
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/pipeline/track.py#L32)
-
-Sends a Markdown-formatted success message and returns the HTTP status code from the Slack incoming hook
-
diff --git a/docs/website/docs/api_reference/reflection/script_inspector.md b/docs/website/docs/api_reference/reflection/script_inspector.md
deleted file mode 100644
index 65f8c8371c..0000000000
--- a/docs/website/docs/api_reference/reflection/script_inspector.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-sidebar_label: script_inspector
-title: reflection.script_inspector
----
-
-## DummyModule Objects
-
-```python
-class DummyModule(ModuleType)
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/reflection/script_inspector.py#L22)
-
-A dummy module from which you can import anything
-
-#### load\_script\_module
-
-```python
-def load_script_module(module_path: str,
-                       script_relative_path: str,
-                       ignore_missing_imports: bool = False) -> ModuleType
-```
-
-[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/reflection/script_inspector.py#L81)
-
-Loads a module in `script_relative_path` by splitting it into a script module (file part) and package (folders). `module_path` is added to `sys.path`.
-Optionally, missing imports will be ignored by importing a dummy module instead.
- diff --git a/docs/website/docs/api_reference/sidebar.json b/docs/website/docs/api_reference/sidebar.json deleted file mode 100644 index 0d2d2e5805..0000000000 --- a/docs/website/docs/api_reference/sidebar.json +++ /dev/null @@ -1,362 +0,0 @@ -{ - "items": [ - { - "items": [ - "api_reference/__init__/__init__" - ], - "label": "__init__", - "type": "category" - }, - { - "items": [ - "api_reference/cli/echo", - "api_reference/cli/pipeline_files", - "api_reference/cli/requirements" - ], - "label": "cli", - "type": "category" - }, - { - "items": [ - { - "items": [ - { - "items": [ - "api_reference/common/configuration/providers/toml" - ], - "label": "common.configuration.providers", - "type": "category" - }, - { - "items": [ - "api_reference/common/configuration/specs/api_credentials", - "api_reference/common/configuration/specs/aws_credentials", - "api_reference/common/configuration/specs/azure_credentials", - "api_reference/common/configuration/specs/base_configuration", - "api_reference/common/configuration/specs/config_providers_context", - "api_reference/common/configuration/specs/config_section_context", - "api_reference/common/configuration/specs/gcp_credentials", - "api_reference/common/configuration/specs/known_sections", - "api_reference/common/configuration/specs/run_configuration" - ], - "label": "common.configuration.specs", - "type": "category" - }, - "api_reference/common/configuration/accessors", - "api_reference/common/configuration/container", - "api_reference/common/configuration/exceptions", - "api_reference/common/configuration/inject", - "api_reference/common/configuration/paths", - "api_reference/common/configuration/resolve", - "api_reference/common/configuration/utils" - ], - "label": "common.configuration", - "type": "category" - }, - { - "items": [ - "api_reference/common/destination/capabilities", - "api_reference/common/destination/reference" - ], - "label": "common.destination", - "type": "category" - }, - { - "items": [ - "api_reference/common/json/__init__" - ], - "label": "common.json", - "type": "category" - }, - { - "items": [ - "api_reference/common/libs/pydantic" - ], - "label": "common.libs", - "type": "category" - }, - { - "items": [ - { - "items": [ - "api_reference/common/normalizers/json/__init__", - "api_reference/common/normalizers/json/relational" - ], - "label": "common.normalizers.json", - "type": "category" - }, - { - "items": [ - "api_reference/common/normalizers/naming/naming" - ], - "label": "common.normalizers.naming", - "type": "category" - }, - "api_reference/common/normalizers/configuration", - "api_reference/common/normalizers/typing", - "api_reference/common/normalizers/utils" - ], - "label": "common.normalizers", - "type": "category" - }, - { - "items": [ - "api_reference/common/reflection/utils" - ], - "label": "common.reflection", - "type": "category" - }, - { - "items": [ - "api_reference/common/runners/configuration", - "api_reference/common/runners/runnable", - "api_reference/common/runners/stdout", - "api_reference/common/runners/synth_pickle", - "api_reference/common/runners/venv" - ], - "label": "common.runners", - "type": "category" - }, - { - "items": [ - "api_reference/common/runtime/collector", - "api_reference/common/runtime/exec_info", - "api_reference/common/runtime/logger", - "api_reference/common/runtime/segment", - "api_reference/common/runtime/sentry", - "api_reference/common/runtime/signals", - "api_reference/common/runtime/slack", - "api_reference/common/runtime/telemetry" - ], - "label": "common.runtime", - 
"type": "category" - }, - { - "items": [ - "api_reference/common/schema/schema", - "api_reference/common/schema/typing", - "api_reference/common/schema/utils" - ], - "label": "common.schema", - "type": "category" - }, - { - "items": [ - "api_reference/common/storages/configuration", - "api_reference/common/storages/file_storage", - "api_reference/common/storages/filesystem", - "api_reference/common/storages/load_storage", - "api_reference/common/storages/normalize_storage", - "api_reference/common/storages/transactional_file" - ], - "label": "common.storages", - "type": "category" - }, - "api_reference/common/exceptions", - "api_reference/common/git", - "api_reference/common/jsonpath", - "api_reference/common/pipeline", - "api_reference/common/source", - "api_reference/common/time", - "api_reference/common/typing", - "api_reference/common/utils", - "api_reference/common/validation" - ], - "label": "common", - "type": "category" - }, - { - "items": [ - { - "items": [ - "api_reference/destinations/athena/athena", - "api_reference/destinations/athena/configuration" - ], - "label": "destinations.athena", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/bigquery/bigquery", - "api_reference/destinations/bigquery/configuration", - "api_reference/destinations/bigquery/sql_client" - ], - "label": "destinations.bigquery", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/duckdb/configuration", - "api_reference/destinations/duckdb/sql_client" - ], - "label": "destinations.duckdb", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/dummy/dummy" - ], - "label": "destinations.dummy", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/filesystem/configuration", - "api_reference/destinations/filesystem/filesystem" - ], - "label": "destinations.filesystem", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/motherduck/configuration" - ], - "label": "destinations.motherduck", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/mssql/configuration", - "api_reference/destinations/mssql/mssql" - ], - "label": "destinations.mssql", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/postgres/configuration" - ], - "label": "destinations.postgres", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/redshift/configuration", - "api_reference/destinations/redshift/redshift" - ], - "label": "destinations.redshift", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/snowflake/configuration", - "api_reference/destinations/snowflake/sql_client" - ], - "label": "destinations.snowflake", - "type": "category" - }, - { - "items": [ - "api_reference/destinations/weaviate/configuration", - "api_reference/destinations/weaviate/naming", - "api_reference/destinations/weaviate/weaviate_adapter", - "api_reference/destinations/weaviate/weaviate_client" - ], - "label": "destinations.weaviate", - "type": "category" - }, - "api_reference/destinations/insert_job_client", - "api_reference/destinations/job_client_impl", - "api_reference/destinations/job_impl", - "api_reference/destinations/path_utils", - "api_reference/destinations/sql_client", - "api_reference/destinations/sql_jobs", - "api_reference/destinations/typing" - ], - "label": "destinations", - "type": "category" - }, - { - "items": [ - "api_reference/extract/decorators", - "api_reference/extract/incremental", - "api_reference/extract/pipe", - "api_reference/extract/schema", - 
"api_reference/extract/source", - "api_reference/extract/typing", - "api_reference/extract/utils" - ], - "label": "extract", - "type": "category" - }, - { - "items": [ - { - "items": [ - "api_reference/helpers/dbt/configuration", - "api_reference/helpers/dbt/runner" - ], - "label": "helpers.dbt", - "type": "category" - }, - "api_reference/helpers/airflow_helper", - "api_reference/helpers/pandas_helper", - "api_reference/helpers/streamlit_helper" - ], - "label": "helpers", - "type": "category" - }, - { - "items": [ - "api_reference/load/configuration", - "api_reference/load/load" - ], - "label": "load", - "type": "category" - }, - { - "items": [ - "api_reference/normalize/configuration", - "api_reference/normalize/normalize" - ], - "label": "normalize", - "type": "category" - }, - { - "items": [ - "api_reference/pipeline/__init__", - "api_reference/pipeline/configuration", - "api_reference/pipeline/current", - "api_reference/pipeline/dbt", - "api_reference/pipeline/helpers", - "api_reference/pipeline/pipeline", - "api_reference/pipeline/trace", - "api_reference/pipeline/track" - ], - "label": "pipeline", - "type": "category" - }, - { - "items": [ - "api_reference/reflection/script_inspector" - ], - "label": "reflection", - "type": "category" - }, - { - "items": [ - { - "items": [ - { - "items": [ - "api_reference/sources/helpers/requests/__init__", - "api_reference/sources/helpers/requests/retry", - "api_reference/sources/helpers/requests/session" - ], - "label": "sources.helpers.requests", - "type": "category" - }, - "api_reference/sources/helpers/transform" - ], - "label": "sources.helpers", - "type": "category" - } - ], - "label": "sources", - "type": "category" - }, - "api_reference/version" - ], - "label": "dlt", - "type": "category" -} \ No newline at end of file diff --git a/docs/website/docs/api_reference/sources/helpers/requests/__init__.md b/docs/website/docs/api_reference/sources/helpers/requests/__init__.md deleted file mode 100644 index 95a8b06e4c..0000000000 --- a/docs/website/docs/api_reference/sources/helpers/requests/__init__.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -sidebar_label: requests -title: sources.helpers.requests ---- - -#### init - -```python -def init(config: RunConfiguration) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/requests/__init__.py#L26) - -Initialize the default requests client from config - diff --git a/docs/website/docs/api_reference/sources/helpers/requests/retry.md b/docs/website/docs/api_reference/sources/helpers/requests/retry.md deleted file mode 100644 index 75112b6a11..0000000000 --- a/docs/website/docs/api_reference/sources/helpers/requests/retry.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -sidebar_label: retry -title: sources.helpers.requests.retry ---- - -## retry\_if\_status Objects - -```python -class retry_if_status(retry_base) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/requests/retry.py#L36) - -Retry for given response status codes - -## Client Objects - -```python -class Client() -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/requests/retry.py#L116) - -Wrapper for `requests` to create a `Session` with configurable retry functionality. - -### Summary -Create a `requests.Session` which automatically retries requests in case of error. 
-By default retries are triggered for `5xx` and `429` status codes and when the server is unreachable or drops connection. - -### Custom retry condition -You can provide one or more custom predicates for specific retry condition. The predicate is called after every request with the resulting response and/or exception. -For example, this will trigger a retry when the response text is `error`: - ->>> from typing import Optional ->>> from requests import Response ->>> ->>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool: ->>> if response is None: ->>> return False ->>> return response.text == 'error' - -The retry is triggered when either any of the predicates or the default conditions based on status code/exception are `True`. - -### Args: -request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds. -max_connections: Max connections per host in the HTTPAdapter pool -raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`) -session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default. -status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status. -exceptions: Retry on exception of given type(s). Default `(requests.Timeout, requests.ConnectionError)`. Pass an empty list to disable retry on exceptions. -request_max_attempts: Max number of retry attempts before giving up -retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried -request_backoff_factor: Multiplier used for exponential delay between retries -request_max_retry_delay: Maximum delay when using exponential backoff -respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay -session_attrs: Extra attributes that will be set on the session instance, e.g. `{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes) - -#### update\_from\_config - -```python -def update_from_config(config: RunConfiguration) -> None -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/requests/retry.py#L203) - -Update session/retry settings from RunConfiguration - diff --git a/docs/website/docs/api_reference/sources/helpers/requests/session.md b/docs/website/docs/api_reference/sources/helpers/requests/session.md deleted file mode 100644 index 767ad955b8..0000000000 --- a/docs/website/docs/api_reference/sources/helpers/requests/session.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -sidebar_label: session -title: sources.helpers.requests.session ---- - -## Session Objects - -```python -class Session(BaseSession) -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/requests/session.py#L21) - -Requests session which by default adds a timeout to all requests and calls `raise_for_status()` on response - -### Args - timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds. - May be a single value or a tuple for separate (connect, read) timeout. 
- raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`) - diff --git a/docs/website/docs/api_reference/sources/helpers/transform.md b/docs/website/docs/api_reference/sources/helpers/transform.md deleted file mode 100644 index d6d4f4363b..0000000000 --- a/docs/website/docs/api_reference/sources/helpers/transform.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -sidebar_label: transform -title: sources.helpers.transform ---- - -#### take\_first - -```python -def take_first(max_items: int) -> ItemTransformFunctionNoMeta[bool] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/transform.py#L5) - -A filter that takes only first `max_items` from a resource - -#### skip\_first - -```python -def skip_first(max_items: int) -> ItemTransformFunctionNoMeta[bool] -``` - -[[view_source]](https://github.com/dlt-hub/dlt/blob/30d0f64fb2cdbacc2e88fdb304371650f417e1f0/dlt/sources/helpers/transform.py#L15) - -A filter that skips first `max_items` from a resource - diff --git a/docs/website/docs/api_reference/version.md b/docs/website/docs/api_reference/version.md deleted file mode 100644 index 0d6adb3ace..0000000000 --- a/docs/website/docs/api_reference/version.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -sidebar_label: version -title: version ---- - -#### get\_installed\_requirement\_string - -```python -def get_installed_requirement_string(package: str = DLT_PKG_NAME) -> str -``` - -Gets the requirement string of currently installed dlt version - diff --git a/docs/pydoc-markdown.yml b/docs/website/pydoc-markdown.yml similarity index 76% rename from docs/pydoc-markdown.yml rename to docs/website/pydoc-markdown.yml index 60c6af0509..570f00bf57 100644 --- a/docs/pydoc-markdown.yml +++ b/docs/website/pydoc-markdown.yml @@ -1,18 +1,19 @@ loaders: - type: python - search_path: [../dlt] + search_path: [../../dlt] processors: - type: filter skip_empty_modules: true - - type: smart + - type: docs.website.pydoc_markdown_dlt.DltProcessor - type: crossref renderer: type: docusaurus - docs_base_path: website/docs + docs_base_path: docs relative_output_path: api_reference relative_sidebar_path: sidebar.json sidebar_top_level_label: dlt markdown: + use_fixed_header_levels: false escape_html_in_docstring: false classdef_with_decorators: true signature_with_decorators: true diff --git a/docs/website/pydoc_markdown_dlt.py b/docs/website/pydoc_markdown_dlt.py new file mode 100644 index 0000000000..ff970ef3a2 --- /dev/null +++ b/docs/website/pydoc_markdown_dlt.py @@ -0,0 +1,25 @@ +from pydoc_markdown.contrib.processors.smart import SmartProcessor +import re +from functools import partial + +sub = partial(re.sub, flags=re.M) + + +class DltProcessor(SmartProcessor): + def _process(self, node): + if not getattr(node, "docstring", None): + return + + # join long lines ending in escape (\) + c = sub(r"\\\n\s*", "", node.docstring.content) + # remove markdown headers + c = sub(r"^#### (.*?)$", r"\1", c) + # convert REPL code blocks to code + c = sub(r"^(\s*>>>|\.\.\.)(.*?)$", r"```\n\1\2\n```", c) + c = sub(r"^(\s*>>>|\.\.\.)(.*?)\n```\n```\n(\s*>>>|\.\.\.)", r"\1\2\n\3", c) + c = sub(r"^(\s*>>>|\.\.\.)(.*?)\n```\n```\n(\s*>>>|\.\.\.)", r"\1\2\n\3", c) + c = sub(r"^(\s*```)(\n\s*>>>) ", r"\1py\2", c) + c = sub(r"(\n\s*)(>>> ?)", r"\1", c) + node.docstring.content = c + + return super()._process(node) \ No newline at end of file
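
As a quick illustration of the new processor (a standalone sketch for reviewers, not part of the patch): the snippet below applies the same substitution chain as `DltProcessor._process` to a sample docstring, so the effect on REPL-style `>>>` examples is easy to see. The regular expressions are copied verbatim from `pydoc_markdown_dlt.py`; the sample docstring and the driver code around it are invented purely for illustration.

```python
import re
from functools import partial

# same helper as in pydoc_markdown_dlt.py: multi-line regex substitution
sub = partial(re.sub, flags=re.M)

# invented sample docstring containing a REPL-style example
docstring = (
    "Returns a sql client for the destination.\n"
    ">>> with pipeline.sql_client() as client:\n"
    ">>>     print(client.execute_sql(\"SELECT 1\"))\n"
    "\n"
    "The client is authenticated.\n"
)

c = sub(r"\\\n\s*", "", docstring)                        # join lines ending in a backslash
c = sub(r"^#### (.*?)$", r"\1", c)                        # strip markdown headers
c = sub(r"^(\s*>>>|\.\.\.)(.*?)$", r"```\n\1\2\n```", c)  # wrap every REPL line in a fence
c = sub(r"^(\s*>>>|\.\.\.)(.*?)\n```\n```\n(\s*>>>|\.\.\.)", r"\1\2\n\3", c)  # merge adjacent fences
c = sub(r"^(\s*>>>|\.\.\.)(.*?)\n```\n```\n(\s*>>>|\.\.\.)", r"\1\2\n\3", c)
c = sub(r"^(\s*```)(\n\s*>>>) ", r"\1py\2", c)            # mark the opening fence as python
c = sub(r"(\n\s*)(>>> ?)", r"\1", c)                      # drop the >>> prompts

print(c)
# Returns a sql client for the destination.
# ```py
# with pipeline.sql_client() as client:
#     print(client.execute_sql("SELECT 1"))
# ```
#
# The client is authenticated.
```

The `>>>` block comes out as a fenced `py` code block while the surrounding prose is left untouched, which is what the rendered API reference pages will contain.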