From df576c3d34dc325717fe703eadeef5d9039c2912 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Fri, 3 Nov 2023 20:47:58 +0100 Subject: [PATCH 1/3] documents create_runner --- dlt/helpers/dbt/runner.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/dlt/helpers/dbt/runner.py b/dlt/helpers/dbt/runner.py index 2e857b2256..381260536c 100644 --- a/dlt/helpers/dbt/runner.py +++ b/dlt/helpers/dbt/runner.py @@ -271,5 +271,31 @@ def create_runner( package_profile_name: str = None, auto_full_refresh_when_out_of_sync: bool = None, config: DBTRunnerConfiguration = None - ) -> DBTPackageRunner: - return DBTPackageRunner(venv, credentials, working_dir, credentials.dataset_name, config) +) -> DBTPackageRunner: + """Creates a Python wrapper over `dbt` package present at specified location, that allows you to control it (i.e. run and test) from Python code. + + The created wrapper minimizes the required effort to run `dbt` packages. It clones the package repo and keeps it up to date, + optionally shares the `dlt` destination credentials with `dbt` and allows the isolated execution with `venv` parameter. + + Note that you can pass config and secrets in DBTRunnerConfiguration as configuration in section "dbt_package_runner" + + Args: + venv (Venv): A virtual environment with required dbt dependencies. Pass None to use current environment. + credentials (DestinationClientDwhConfiguration): Any configuration deriving from DestinationClientDwhConfiguration i.e. ConnectionStringCredentials + working_dir (str): A working dir to which the package will be cloned + package_location (str): A git repository url to be cloned or a local path where dbt package is present + package_repository_branch (str, optional): A branch name, tag name or commit-id to check out. Defaults to None. + package_repository_ssh_key (TSecretValue, optional): SSH key to be used to clone private repositories. Defaults to TSecretValue(""). 
+ package_profiles_dir (str, optional): Path to the folder where "profiles.yml" resides + package_profile_name (str, optional): Name of the profile in "profiles.yml" + auto_full_refresh_when_out_of_sync (bool, optional): If set to True (default), the wrapper will automatically fall back to full-refresh mode when schema is out of sync + See: https://docs.getdbt.com/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change. Defaults to None. + config (DBTRunnerConfiguration, optional): Explicit additional configuration for the runner. + + Returns: + DBTPackageRunner: A Python `dbt` wrapper + """ + dataset_name = credentials.dataset_name if credentials else "" + if venv is None: + venv = Venv.restore_current() + return DBTPackageRunner(venv, credentials, working_dir, dataset_name, config) From 410cf6b6d68e13fd522249de8d2be0c6b4c8d409 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Fri, 3 Nov 2023 20:49:01 +0100 Subject: [PATCH 2/3] documents dbt standalone runner --- docs/examples/chess_production/chess.py | 3 +- docs/website/.gitignore | 1 + .../transformations/dbt/__init__.py | 0 .../transformations/dbt/dbt-snippets.py | 20 +++++++++ .../dlt-ecosystem/transformations/dbt/dbt.md | 44 +++++++++++++++++++ .../transformations/dbt/profiles.yml | 14 ++++++ .../docs/examples/chess_production/index.md | 1 - 7 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 docs/website/docs/dlt-ecosystem/transformations/dbt/__init__.py create mode 100644 docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py create mode 100644 docs/website/docs/dlt-ecosystem/transformations/dbt/profiles.yml diff --git a/docs/examples/chess_production/chess.py b/docs/examples/chess_production/chess.py index 0ff5ce7c7f..79b573fe43 100644 --- a/docs/examples/chess_production/chess.py +++ b/docs/examples/chess_production/chess.py @@ -3,7 +3,6 @@ import dlt from dlt.common import sleep -from dlt.common.runtime.slack import send_slack_message 
from dlt.common.typing import StrAny, TDataItems from dlt.sources.helpers.requests import client @@ -161,4 +160,4 @@ def load_data_with_retry(pipeline, data): ) # get data for a few famous players data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS) - load_data_with_retry(pipeline, data) + load_data_with_retry(pipeline, data) \ No newline at end of file diff --git a/docs/website/.gitignore b/docs/website/.gitignore index fe0be8784d..3e6e15d0c4 100644 --- a/docs/website/.gitignore +++ b/docs/website/.gitignore @@ -8,6 +8,7 @@ .docusaurus .cache-loader docs/api_reference +jaffle_shop # Misc .DS_Store diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/__init__.py b/docs/website/docs/dlt-ecosystem/transformations/dbt/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py new file mode 100644 index 0000000000..beb1c862cc --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py @@ -0,0 +1,20 @@ +def run_dbt_standalone_snippet() -> None: + # @@@DLT_SNIPPET_START run_dbt_standalone + import os + + from dlt.helpers.dbt import create_runner + + runner = create_runner( + None, # use current virtual env to run dbt + None, # we do not need dataset name and we do not pass any credentials in environment to dlt + working_dir=".", # the package below will be cloned to current dir + package_location="https://github.com/dbt-labs/jaffle_shop.git", + package_profiles_dir=os.path.abspath("."), # profiles.yml must be placed in this dir + package_profile_name="duckdb_dlt_dbt_test" # name of the profile + ) + + models = runner.run_all() + # @@@DLT_SNIPPET_END run_dbt_standalone + + for m in models: + print(f"Model {m.model_name} materialized in {m.time} with status {m.status} and message {m.message}") diff --git 
a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index c2ebd9bc06..af2d5df469 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -55,6 +55,8 @@ pipeline = dlt.pipeline( ) # make or restore venv for dbt, using latest dbt version +# NOTE: if you have dbt installed in your current environment, just skip this line +# and the `venv` argument to dlt.dbt.package() venv = dlt.dbt.get_venv(pipeline) # get runner, optionally pass the venv @@ -78,6 +80,48 @@ for m in models: ) ``` +## How to run dbt runner without pipeline +You can use the dbt runner without a dlt pipeline. The example below will clone and run **jaffle shop** using a dbt profile that you supply. +It assumes that dbt is installed in the current Python environment and that `profiles.yml` is in the same folder as the Python script. + +```py +import os + +from dlt.helpers.dbt import create_runner + +runner = create_runner( + None, # use current virtual env to run dbt + None, # we do not need dataset name and we do not pass any credentials in environment to dlt + working_dir=".", # the package below will be cloned to current dir + package_location="https://github.com/dbt-labs/jaffle_shop.git", + package_profiles_dir=os.path.abspath("."), # profiles.yml must be placed in this dir + package_profile_name="duckdb_dlt_dbt_test" # name of the profile +) + +models = runner.run_all() +``` + + +Here's an example **duckdb** profile: +```yaml +config: + # do not track usage, do not create .user.yml + send_anonymous_usage_stats: False + +duckdb_dlt_dbt_test: + target: analytics + outputs: + analytics: + type: duckdb + # schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" + path: "duckdb_dlt_dbt_test.duckdb" + extensions: + - httpfs + - parquet +``` +You can run the example with dbt debug log: `RUNTIME__LOG_LEVEL=DEBUG python dbt_standalone.py` + + ## Other 
transforming tools If you want to transform the data before loading, you can use Python. If you want to transform the diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/profiles.yml b/docs/website/docs/dlt-ecosystem/transformations/dbt/profiles.yml new file mode 100644 index 0000000000..99d66c958f --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/profiles.yml @@ -0,0 +1,14 @@ +config: + # do not track usage, do not create .user.yml + send_anonymous_usage_stats: False + +duckdb_dlt_dbt_test: + target: analytics + outputs: + analytics: + type: duckdb + # schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" + path: "duckdb_dlt_dbt_test.duckdb" + extensions: + - httpfs + - parquet \ No newline at end of file diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md index f821600c67..c8278f8676 100644 --- a/docs/website/docs/examples/chess_production/index.md +++ b/docs/website/docs/examples/chess_production/index.md @@ -32,7 +32,6 @@ from typing import Any, Iterator import dlt from dlt.common import sleep -from dlt.common.runtime.slack import send_slack_message from dlt.common.typing import StrAny, TDataItems from dlt.sources.helpers.requests import client From 4cb8d3842d612e9e91123e711e4101adf692345c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Wed, 8 Nov 2023 13:22:54 +0100 Subject: [PATCH 3/3] adds dbt deps to docs --- .../dlt-ecosystem/destinations/snowflake.md | 13 ++++++- poetry.lock | 34 +++++++++---------- pyproject.toml | 2 ++ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 16d579ee29..5efc31dde8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -66,7 +66,7 @@ Now you can use the user named `LOADER` to access 
database `DLT_DATA` and log in You can also decrease the suspend time for your warehouse to 1 minute (**Admin**/**Warehouses** in Snowflake UI) ### Authentication types -Snowflake destination accepts two authentication type +Snowflake destination accepts three authentication types - password authentication - [key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) @@ -95,6 +95,17 @@ If you pass a passphrase in the connection string, please url encode it. destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?private_key=&private_key_passphrase=" ``` +In **external authentication** you can use an OAuth provider like Okta or an external browser to authenticate. You pass your authenticator and refresh token as below: +```toml +[destination.snowflake.credentials] +database = "dlt_data" +username = "loader" +authenticator="..." +token="..." +``` +or in the connection string as query parameters. +Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. + ## Write disposition All write dispositions are supported diff --git a/poetry.lock b/poetry.lock index 15c257b607..159b81f9fa 100644 --- a/poetry.lock +++ b/poetry.lock @@ -30,7 +30,7 @@ name = "agate" version = "1.6.3" description = "A data analysis library that is optimized for humans instead of machines." category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -1428,7 +1428,7 @@ name = "dbt-core" version = "1.5.6" description = "With dbt, data analysts and engineers can build analytics the way engineers build applications." 
category = "main" -optional = true +optional = false python-versions = ">=3.7.2" [package.dependencies] @@ -1460,7 +1460,7 @@ name = "dbt-duckdb" version = "1.5.2" description = "The duckdb adapter plugin for dbt (data build tool)" category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -1475,7 +1475,7 @@ name = "dbt-extractor" version = "0.4.1" description = "A tool to analyze and extract information from Jinja used in dbt projects." category = "main" -optional = true +optional = false python-versions = ">=3.6.1" [[package]] @@ -1661,7 +1661,7 @@ name = "duckdb" version = "0.9.1" description = "DuckDB embedded database" category = "main" -optional = true +optional = false python-versions = ">=3.7.0" [[package]] @@ -2012,7 +2012,7 @@ name = "future" version = "0.18.3" description = "Clean single-source support for Python 3 and 2" category = "main" -optional = true +optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" [[package]] @@ -2406,7 +2406,7 @@ name = "hologram" version = "0.0.16" description = "JSON schema generation from dataclasses" category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -2533,7 +2533,7 @@ name = "isodate" version = "0.6.1" description = "An ISO 8601 date/time/duration parser and formatter" category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -2686,7 +2686,7 @@ name = "leather" version = "0.3.4" description = "Python charting for 80% of humans." 
category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -2748,7 +2748,7 @@ name = "logbook" version = "1.5.3" description = "A logging replacement for Python" category = "main" -optional = true +optional = false python-versions = "*" [package.extras] @@ -2900,7 +2900,7 @@ name = "mashumaro" version = "3.6" description = "Fast serialization library on top of dataclasses" category = "main" -optional = true +optional = false python-versions = ">=3.7" [package.dependencies] @@ -2950,7 +2950,7 @@ name = "minimal-snowplow-tracker" version = "0.0.2" description = "A minimal snowplow event tracker for Python. Add analytics to your Python and Django apps, webapps and games" category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -3001,7 +3001,7 @@ name = "msgpack" version = "1.0.5" description = "MessagePack serializer" category = "main" -optional = true +optional = false python-versions = "*" [[package]] @@ -3099,7 +3099,7 @@ name = "networkx" version = "2.8.8" description = "Python package for creating and manipulating graphs and networks" category = "main" -optional = true +optional = false python-versions = ">=3.8" [package.extras] @@ -3324,7 +3324,7 @@ name = "parsedatetime" version = "2.4" description = "Parse human-readable date/time text." 
category = "main" -optional = true +optional = false python-versions = "*" [package.dependencies] @@ -3875,7 +3875,7 @@ name = "pytimeparse" version = "1.1.8" description = "Time expression parser" category = "main" -optional = true +optional = false python-versions = "*" [[package]] @@ -4861,7 +4861,7 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<4.0" -content-hash = "7d5b9bfb96bfd08e2b6843df885a3ff605abe603250db78e35350e18bc933a64" +content-hash = "8afa9a4b0b11ff48506bd5424b6e64a181ff52955dcce41e1bc43c63738c4062" [metadata.files] about-time = [ diff --git a/pyproject.toml b/pyproject.toml index 3795f0096b..5c159fd8d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,6 +160,8 @@ pymysql = "^1.1.0" pypdf2 = "^3.0.1" pydoc-markdown = "^4.8.2" connectorx="0.3.1" +dbt-core=">=1.2.0" +dbt-duckdb=">=1.2.0" [build-system] requires = ["poetry-core>=1.0.8"]