From 58f8ad104924da01004388f54a1aee808ed2cd7a Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 4 Oct 2023 11:05:18 +0200 Subject: [PATCH] examples for docs (#616) * remove snipsync (#613) * add custom snippets element * remove snipsync * migrate performance snippets * add missing init files * refine snippet watcher * docs examples * fixes chess example * fixes DLT -> dlt * more work on transformers example * make header smaller * example for zendesk incremental loading * move incremental loading example to right location * added text and output example to incremental zendesk * allow secrets files in examples * add zendesk credentials * correct text and code snippets for zendesk example * add main clause * add config example * pytest marker to skip tests running in PRs from forks on github * removes more typings and adds comments to zendesk example * shortens example titles --------- Co-authored-by: Marcin Rudolf Co-authored-by: AstrakhantsevaAA --- .github/workflows/test_doc_snippets.yml | 3 + .gitignore | 1 + Makefile | 2 + dlt/common/schema/schema.py | 2 +- dlt/common/storages/exceptions.py | 2 +- dlt/helpers/airflow_helper.py | 2 +- dlt/helpers/streamlit_helper.py | 2 +- docs/examples/.dlt/secrets.toml | 26 +++ .../{ => archive}/.dlt/example.secrets.toml | 0 docs/examples/{ => archive}/README.md | 0 docs/examples/{ => archive}/__init__.py | 0 docs/examples/{ => archive}/_helpers.py | 0 .../credentials/.dlt/config.toml | 0 .../{ => archive}/credentials/__init__.py | 0 .../{ => archive}/credentials/explicit.py | 0 .../examples/{ => archive}/data/channels.json | 0 .../{ => archive}/data/demo_example.json | 0 .../examples/{ => archive}/data/messages.json | 0 .../2888158124550630_tracker.jsonl | 0 ...-0c26-4f0b-927b-14d48db43c28_tracker.jsonl | 0 .../data/singer_taps/csv_catalog.json | 0 .../data/singer_taps/model_annotations.csv | 0 .../data/singer_taps/tap_google_sheet.jsonl | 0 .../data/singer_taps/tap_hubspot.jsonl | 0 docs/examples/{ => archive}/dbt_run_jaffle.py | 0 .../{ => archive}/discord_iterator.py | 0 .../schemas/dlt_quickstart.schema.yaml | 0 .../{ => archive}/google_drive_csv.py | 0 docs/examples/{ => archive}/google_sheets.py | 0 docs/examples/{ => archive}/quickstart.py | 0 docs/examples/{ => archive}/rasa_example.py | 6 +- docs/examples/{ => archive}/read_table.py | 0 .../{ => archive}/restore_pipeline.py | 0 .../{ => archive}/schemas/__init__.py | 0 .../{ => archive}/schemas/discord.schema.yml | 0 .../schemas/dlt_quickstart.schema.yaml | 0 .../{ => archive}/schemas/hubspot.schema.yaml | 0 .../schemas/inferred_demo.schema.yml | 0 .../{ => archive}/singer_tap_example.py | 0 .../{ => archive}/singer_tap_jsonl_example.py | 0 .../{ => archive}/sources/__init__.py | 0 .../{ => archive}/sources/google_sheets.py | 0 docs/examples/{ => archive}/sources/jsonl.py | 0 .../{ => archive}/sources/rasa/__init__.py | 0 .../{ => archive}/sources/rasa/rasa.py | 0 .../sources/rasa/rasa.schema.yaml | 0 .../{ => archive}/sources/singer_tap.py | 2 +- .../{ => archive}/sources/sql_query.py | 0 docs/examples/{ => archive}/sources/stdout.py | 0 .../{ => archive}/sync_schema_example.py | 0 .../incremental_loading/.dlt/config.toml | 0 .../incremental_loading/.dlt/secrets.toml | 4 + docs/examples/incremental_loading/__init__.py | 0 docs/examples/incremental_loading/zendesk.py | 126 +++++++++++++++ docs/examples/transformers/.dlt/config.toml | 16 ++ docs/examples/transformers/__init__.py | 0 docs/examples/transformers/pokemon.py | 61 ++++++++ .../transformers_and_parallelism/__init__.py | 0 
docs/technical/create_pipeline.md | 4 +- .../docs/dlt-ecosystem/destinations/athena.md | 2 +- .../dlt-ecosystem/destinations/bigquery.md | 2 +- .../dlt-ecosystem/destinations/redshift.md | 2 +- .../dlt-ecosystem/destinations/snowflake.md | 2 +- docs/website/docs/dlt-ecosystem/staging.md | 2 +- .../dlt-ecosystem/verified-sources/github.md | 2 +- .../verified-sources/google_sheets.md | 2 +- .../dlt-ecosystem/verified-sources/stripe.md | 2 +- .../verified-sources/workable.md | 2 +- .../website/docs/examples/_examples-header.md | 21 +++ .../examples/incremental_loading/__init__.py | 0 .../incremental_loading/code/.dlt/config.toml | 2 + .../code/.dlt/secrets.toml | 6 + .../incremental_loading/code/__init__.py | 0 .../code/zendesk-snippets.py | 148 ++++++++++++++++++ .../examples/incremental_loading/index.md | 126 +++++++++++++++ .../docs/examples/transformers/__init__.py | 0 .../transformers/code/.dlt/config.toml | 18 +++ .../examples/transformers/code/__init__.py | 0 .../transformers/code/pokemon-snippets.py | 73 +++++++++ .../docs/examples/transformers/index.md | 116 ++++++++++++++ docs/website/docs/general-usage/resource.md | 4 +- docs/website/docs/general-usage/schema.md | 2 +- docs/website/docs/general-usage/state.md | 2 +- docs/website/docs/getting-started.md | 6 +- docs/website/docs/reference/performance.md | 1 - docs/website/docusaurus.config.js | 6 - docs/website/sidebars.js | 15 ++ docs/website/src/css/custom.css | 62 ++++---- docs/website/src/theme/DLTExampleHeader.tsx | 15 ++ docs/website/src/theme/MDXComponents.js | 6 +- ...-active-1.svg => Howdltworks-Active-1.svg} | 0 docs/website/tools/update_snippets.js | 92 +++++++++-- mypy.ini | 4 +- .../configuration/test_configuration.py | 2 +- tests/utils.py | 12 ++ 95 files changed, 935 insertions(+), 81 deletions(-) create mode 100644 docs/examples/.dlt/secrets.toml rename docs/examples/{ => archive}/.dlt/example.secrets.toml (100%) rename docs/examples/{ => archive}/README.md (100%) rename docs/examples/{ => archive}/__init__.py (100%) rename docs/examples/{ => archive}/_helpers.py (100%) rename docs/examples/{ => archive}/credentials/.dlt/config.toml (100%) rename docs/examples/{ => archive}/credentials/__init__.py (100%) rename docs/examples/{ => archive}/credentials/explicit.py (100%) rename docs/examples/{ => archive}/data/channels.json (100%) rename docs/examples/{ => archive}/data/demo_example.json (100%) rename docs/examples/{ => archive}/data/messages.json (100%) rename docs/examples/{ => archive}/data/rasa_trackers/2888158124550630_tracker.jsonl (100%) rename docs/examples/{ => archive}/data/rasa_trackers/8629c904-0c26-4f0b-927b-14d48db43c28_tracker.jsonl (100%) rename docs/examples/{ => archive}/data/singer_taps/csv_catalog.json (100%) rename docs/examples/{ => archive}/data/singer_taps/model_annotations.csv (100%) rename docs/examples/{ => archive}/data/singer_taps/tap_google_sheet.jsonl (100%) rename docs/examples/{ => archive}/data/singer_taps/tap_hubspot.jsonl (100%) rename docs/examples/{ => archive}/dbt_run_jaffle.py (100%) rename docs/examples/{ => archive}/discord_iterator.py (100%) rename docs/examples/{ => archive}/examples/schemas/dlt_quickstart.schema.yaml (100%) rename docs/examples/{ => archive}/google_drive_csv.py (100%) rename docs/examples/{ => archive}/google_sheets.py (100%) rename docs/examples/{ => archive}/quickstart.py (100%) rename docs/examples/{ => archive}/rasa_example.py (88%) rename docs/examples/{ => archive}/read_table.py (100%) rename docs/examples/{ => archive}/restore_pipeline.py (100%) rename 
docs/examples/{ => archive}/schemas/__init__.py (100%) rename docs/examples/{ => archive}/schemas/discord.schema.yml (100%) rename docs/examples/{ => archive}/schemas/dlt_quickstart.schema.yaml (100%) rename docs/examples/{ => archive}/schemas/hubspot.schema.yaml (100%) rename docs/examples/{ => archive}/schemas/inferred_demo.schema.yml (100%) rename docs/examples/{ => archive}/singer_tap_example.py (100%) rename docs/examples/{ => archive}/singer_tap_jsonl_example.py (100%) rename docs/examples/{ => archive}/sources/__init__.py (100%) rename docs/examples/{ => archive}/sources/google_sheets.py (100%) rename docs/examples/{ => archive}/sources/jsonl.py (100%) rename docs/examples/{ => archive}/sources/rasa/__init__.py (100%) rename docs/examples/{ => archive}/sources/rasa/rasa.py (100%) rename docs/examples/{ => archive}/sources/rasa/rasa.schema.yaml (100%) rename docs/examples/{ => archive}/sources/singer_tap.py (97%) rename docs/examples/{ => archive}/sources/sql_query.py (100%) rename docs/examples/{ => archive}/sources/stdout.py (100%) rename docs/examples/{ => archive}/sync_schema_example.py (100%) create mode 100644 docs/examples/incremental_loading/.dlt/config.toml create mode 100644 docs/examples/incremental_loading/.dlt/secrets.toml create mode 100644 docs/examples/incremental_loading/__init__.py create mode 100644 docs/examples/incremental_loading/zendesk.py create mode 100644 docs/examples/transformers/.dlt/config.toml create mode 100644 docs/examples/transformers/__init__.py create mode 100644 docs/examples/transformers/pokemon.py create mode 100644 docs/examples/transformers_and_parallelism/__init__.py create mode 100644 docs/website/docs/examples/_examples-header.md create mode 100644 docs/website/docs/examples/incremental_loading/__init__.py create mode 100644 docs/website/docs/examples/incremental_loading/code/.dlt/config.toml create mode 100644 docs/website/docs/examples/incremental_loading/code/.dlt/secrets.toml create mode 100644 docs/website/docs/examples/incremental_loading/code/__init__.py create mode 100644 docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py create mode 100644 docs/website/docs/examples/incremental_loading/index.md create mode 100644 docs/website/docs/examples/transformers/__init__.py create mode 100644 docs/website/docs/examples/transformers/code/.dlt/config.toml create mode 100644 docs/website/docs/examples/transformers/code/__init__.py create mode 100644 docs/website/docs/examples/transformers/code/pokemon-snippets.py create mode 100644 docs/website/docs/examples/transformers/index.md create mode 100644 docs/website/src/theme/DLTExampleHeader.tsx rename docs/website/static/img/{Howdltworks-active-1.svg => Howdltworks-Active-1.svg} (100%) diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index d5091e0fdb..b2a2f241db 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -18,6 +18,9 @@ env: DESTINATION__WEAVIATE__VECTORIZER: text2vec-contextionary DESTINATION__WEAVIATE__MODULE_CONFIG: "{\"text2vec-contextionary\": {\"vectorizeClassName\": false, \"vectorizePropertyName\": true}}" + # zendesk vars for example + SOURCES__ZENDESK__CREDENTIALS: ${{ secrets.ZENDESK__CREDENTIALS }} + jobs: run_lint: diff --git a/.gitignore b/.gitignore index 4a52ad15e8..3604d9b1bf 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ experiments/* # !experiments/pipeline/ # !experiments/pipeline/* secrets.toml +!docs/**/secrets.toml *.session.sql *.duckdb 
*.wal diff --git a/Makefile b/Makefile index 8a7f14a5db..bd522c9ba3 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,8 @@ help: @echo " tests all components unsing local destinations: duckdb and postgres" @echo " test-common" @echo " tests common components" + @echo " test-and-lint-snippets" + @echo " tests and lints snippets and examples in docs" @echo " build-library" @echo " makes dev and then builds dlt package for distribution" @echo " publish-library" diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 2889d776c5..df1f2dc491 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -420,7 +420,7 @@ def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: raise CannotCoerceColumnException(table_name, col_name, py_type, table_columns[col_name]["data_type"], v) # otherwise we must create variant extension to the table # pass final=True so no more auto-variants can be created recursively - # TODO: generate callback so DLT user can decide what to do + # TODO: generate callback so dlt user can decide what to do variant_col_name = self.naming.shorten_fragments(col_name, VARIANT_FIELD_FORMAT % py_type) return self._coerce_non_null_value(table_columns, table_name, variant_col_name, v, is_variant=True) diff --git a/dlt/common/storages/exceptions.py b/dlt/common/storages/exceptions.py index cab149c22c..3203191cd8 100644 --- a/dlt/common/storages/exceptions.py +++ b/dlt/common/storages/exceptions.py @@ -51,7 +51,7 @@ class SchemaStorageException(StorageException): class InStorageSchemaModified(SchemaStorageException): def __init__(self, schema_name: str, storage_path: str) -> None: - msg = f"Schema {schema_name} in {storage_path} was externally modified. This is not allowed as that would prevent correct version tracking. Use import/export capabilities of DLT to provide external changes." + msg = f"Schema {schema_name} in {storage_path} was externally modified. This is not allowed as that would prevent correct version tracking. Use import/export capabilities of dlt to provide external changes." super().__init__(msg) diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 3cdda43f8b..2a9c76cc76 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -38,7 +38,7 @@ class PipelineTasksGroup(TaskGroup): """ - Represents a DLT Airflow pipeline task group. + Represents a dlt Airflow pipeline task group. """ def __init__( diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py index 52584996cf..a8881563fb 100644 --- a/dlt/helpers/streamlit_helper.py +++ b/dlt/helpers/streamlit_helper.py @@ -208,7 +208,7 @@ def write_data_explorer_page(pipeline: Pipeline, schema_name: str = None, show_d #### Args: pipeline (Pipeline): Pipeline instance to use. schema_name (str, optional): Name of the schema to display. If None, default schema is used. - show_dlt_tables (bool, optional): Should show DLT internal tables. Defaults to False. + show_dlt_tables (bool, optional): Should show dlt internal tables. Defaults to False. example_query (str, optional): Example query to be displayed in the SQL Query box. show_charts (bool, optional): Should automatically show charts for the queries from SQL Query box. Defaults to True. 
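The CI workflow change earlier in this patch injects the Zendesk credentials for the new example through the `SOURCES__ZENDESK__CREDENTIALS` environment variable, which resolves to the same `[sources.zendesk.credentials]` section that the example keeps in `.dlt/secrets.toml`. Below is a minimal sketch of that resolution using per-field environment variables; the variable names follow dlt's double-underscore convention and the values are placeholders, not real credentials.

```py
import os

import dlt

# placeholder values - locally these would normally live in .dlt/secrets.toml
os.environ["SOURCES__ZENDESK__CREDENTIALS__EMAIL"] = "user@example.com"
os.environ["SOURCES__ZENDESK__CREDENTIALS__PASSWORD"] = "<api password>"
os.environ["SOURCES__ZENDESK__CREDENTIALS__SUBDOMAIN"] = "d3v-dlthub"

# dlt.secrets reads from the same providers that fill a @dlt.source argument
# declared as `credentials: Dict[str, str] = dlt.secrets.value`
print(dlt.secrets["sources.zendesk.credentials.email"])
```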
diff --git a/docs/examples/.dlt/secrets.toml b/docs/examples/.dlt/secrets.toml new file mode 100644 index 0000000000..6593439cc0 --- /dev/null +++ b/docs/examples/.dlt/secrets.toml @@ -0,0 +1,26 @@ +# here is a file with the secrets for all the example pipelines in `examples` folder + +[sources] +# redshift password for query tables example +query_table.credentials.password="8P5gyDPNo9zo582rQG6a" +query_sql.credentials.password="8P5gyDPNo9zo582rQG6a" + +# google sheets example +[sources.google_spreadsheet.credentials] +project_id="chat-analytics-317513" +client_email="loader@chat-analytics-317513.iam.gserviceaccount.com" +private_key="-----BEGIN PRIVATE KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n7inCg8CdRKEMZ/q7Rv5uUiTyUMjQLNXySOPRSSJBSXBPpLJPbcmfxCYgOPWadA3F\noa54WJFR3Uxd+SjAC848dGz5+JEL5u2rHcjzL1IbjDd5oH9rap/QxYm/R9Q5eSdC\nlGiiFh4zH+U9nhWWUovn+ofixbQkhrMFOfgHt+Jvdh/2m7Sdz47itjWFC258R1Ki\nH9vPVtHxw0LrcUZN7HACV3NICRUkvX8U2+JC25esmPxc/qqmxoFlQ7ono/NeMjQa\nq2TyTyNSh4pDtf30PAW4cU2AUbtaSPhIdRloDuwzW9d88VUUbVelqHHeJRgGjkSX\nQz2MCBuFAgMBAAECgf8zlepWYEgm8xtdim2ydB3mdhR/zoZmZM/Q1NthJ8u/IbdO\nb2HPEXxGbDKIIJzhAA17Un98leBLwYKuLZhOpdB+igyJlTG8XlCRF88XiUosJWR3\niHmiuMkndHA7WyTXDc0n3GpUFYWkGGng2cKLx7V/OFmpMwhC9LEKMNOrBKnf9U6Z\n/9nanIerFZU4m5mWbNW/ZRc+qvd+1zGw/JYM6ntdkKLo/TwNOmOS5FS01yLvx7Xw\nm12f9I3VceGXWyrYEh+UCWk0gsEb8xnGGZKy3op5I6trsXzH8I3HCXvapkeWSaFe\n/gmT3CHZIK9hang6f4yMG+niuNtZE2/METgvcjkCgYEAwTg1SZAYSaL6LoFV92Kq\nyHV0DP8KivDIKrByOym9oikPK/2ZMNi9NivVmSALuR54wj7pFxFmyEj6UTklSeLb\nRvOjcPnZEMbFspRHIzkySfsnfScoHZXOeakjOub1K5FehYsLXQIfe7iwRg/mcd/2\noFVyJrm2aNXcvNuug4scEE0CgYEAuuaRmGY5mKv+viuZ/zzOky7IjDnp4w2BMJt0\noMXznKuLHJpexnQ9A/ZHxpAp6Bi6Glk0XLi2uaI+ggXlEUfNa3DHMQu7xg1RaCqN\n957WGRO0ETtIWdp1BHhWPtT5kdOrjSZy9vRSZ0vh2WnZe5SgKRVCqQsV7ExcEltz\nUc9WlBkCgYA9MaQOzEgk6iz6FZQ4aVNVcX1zsEKShneerYtAGZQpi392mzatNbeX\nNILNoEyWMIRmYK5J1AUNYa+FkeexYtu3uOoGmdqZaZqrWDK/gRngPF7hUElwNUXT\nWjICMatsRPn+qW7L4iQ+dtu9FMQTRK9DUEx6305aHYFvftPibWhR8QKBgQCAd3GG\nNmXKihaMsr2kUjCPvG1+7WPVfHfbaE9PHyFnBAaXv4f7kvRJn+QQGRGlBjINYFl8\njj6S9HFQwCqGqTsKabeQ/8auyIK3PeDdXqE9FW0FFyGRGXarfueRQqTU1pCpcc89\n7gwiEmeIIJiruCoqcwGh3gvQo1/6AkAO8JxLKQKBgF0T8P0hRctXFejcFf/4EikS\n2+WA/gNSQITC1m+8nWNnU+bDmRax+pIkzlvjkG5kyNfWvB7i2A5Y5OnCo92y5aDQ\nzbGHLwZj0HXqLFXhbAv/0xZPXlZ71NWpi2BpCJRnzU65ftsjePfydfvN6g4mPQ28\nkHQsYKUZk5HPC8FlPvQe\n-----END PRIVATE KEY-----\n" + +[destination] +# all postgres destinations for all examples +postgres.credentials = "postgres://loader:loader@localhost:5432/dlt_data" +# all redshift destinations for all examples +redshift.credentials = "postgres://loader:8P5gyDPNo9zo582rQG6a@chat-analytics.czwteevq7bpe.eu-central-1.redshift.amazonaws.com:5439/chat_analytics_rasa" + +# all the bigquery destinations +[destination.bigquery.credentials] +project_id="chat-analytics-317513" +client_email="loader@chat-analytics-317513.iam.gserviceaccount.com" +private_key="-----BEGIN PRIVATE 
KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n7inCg8CdRKEMZ/q7Rv5uUiTyUMjQLNXySOPRSSJBSXBPpLJPbcmfxCYgOPWadA3F\noa54WJFR3Uxd+SjAC848dGz5+JEL5u2rHcjzL1IbjDd5oH9rap/QxYm/R9Q5eSdC\nlGiiFh4zH+U9nhWWUovn+ofixbQkhrMFOfgHt+Jvdh/2m7Sdz47itjWFC258R1Ki\nH9vPVtHxw0LrcUZN7HACV3NICRUkvX8U2+JC25esmPxc/qqmxoFlQ7ono/NeMjQa\nq2TyTyNSh4pDtf30PAW4cU2AUbtaSPhIdRloDuwzW9d88VUUbVelqHHeJRgGjkSX\nQz2MCBuFAgMBAAECgf8zlepWYEgm8xtdim2ydB3mdhR/zoZmZM/Q1NthJ8u/IbdO\nb2HPEXxGbDKIIJzhAA17Un98leBLwYKuLZhOpdB+igyJlTG8XlCRF88XiUosJWR3\niHmiuMkndHA7WyTXDc0n3GpUFYWkGGng2cKLx7V/OFmpMwhC9LEKMNOrBKnf9U6Z\n/9nanIerFZU4m5mWbNW/ZRc+qvd+1zGw/JYM6ntdkKLo/TwNOmOS5FS01yLvx7Xw\nm12f9I3VceGXWyrYEh+UCWk0gsEb8xnGGZKy3op5I6trsXzH8I3HCXvapkeWSaFe\n/gmT3CHZIK9hang6f4yMG+niuNtZE2/METgvcjkCgYEAwTg1SZAYSaL6LoFV92Kq\nyHV0DP8KivDIKrByOym9oikPK/2ZMNi9NivVmSALuR54wj7pFxFmyEj6UTklSeLb\nRvOjcPnZEMbFspRHIzkySfsnfScoHZXOeakjOub1K5FehYsLXQIfe7iwRg/mcd/2\noFVyJrm2aNXcvNuug4scEE0CgYEAuuaRmGY5mKv+viuZ/zzOky7IjDnp4w2BMJt0\noMXznKuLHJpexnQ9A/ZHxpAp6Bi6Glk0XLi2uaI+ggXlEUfNa3DHMQu7xg1RaCqN\n957WGRO0ETtIWdp1BHhWPtT5kdOrjSZy9vRSZ0vh2WnZe5SgKRVCqQsV7ExcEltz\nUc9WlBkCgYA9MaQOzEgk6iz6FZQ4aVNVcX1zsEKShneerYtAGZQpi392mzatNbeX\nNILNoEyWMIRmYK5J1AUNYa+FkeexYtu3uOoGmdqZaZqrWDK/gRngPF7hUElwNUXT\nWjICMatsRPn+qW7L4iQ+dtu9FMQTRK9DUEx6305aHYFvftPibWhR8QKBgQCAd3GG\nNmXKihaMsr2kUjCPvG1+7WPVfHfbaE9PHyFnBAaXv4f7kvRJn+QQGRGlBjINYFl8\njj6S9HFQwCqGqTsKabeQ/8auyIK3PeDdXqE9FW0FFyGRGXarfueRQqTU1pCpcc89\n7gwiEmeIIJiruCoqcwGh3gvQo1/6AkAO8JxLKQKBgF0T8P0hRctXFejcFf/4EikS\n2+WA/gNSQITC1m+8nWNnU+bDmRax+pIkzlvjkG5kyNfWvB7i2A5Y5OnCo92y5aDQ\nzbGHLwZj0HXqLFXhbAv/0xZPXlZ71NWpi2BpCJRnzU65ftsjePfydfvN6g4mPQ28\nkHQsYKUZk5HPC8FlPvQe\n-----END PRIVATE KEY-----\n" + + diff --git a/docs/examples/.dlt/example.secrets.toml b/docs/examples/archive/.dlt/example.secrets.toml similarity index 100% rename from docs/examples/.dlt/example.secrets.toml rename to docs/examples/archive/.dlt/example.secrets.toml diff --git a/docs/examples/README.md b/docs/examples/archive/README.md similarity index 100% rename from docs/examples/README.md rename to docs/examples/archive/README.md diff --git a/docs/examples/__init__.py b/docs/examples/archive/__init__.py similarity index 100% rename from docs/examples/__init__.py rename to docs/examples/archive/__init__.py diff --git a/docs/examples/_helpers.py b/docs/examples/archive/_helpers.py similarity index 100% rename from docs/examples/_helpers.py rename to docs/examples/archive/_helpers.py diff --git a/docs/examples/credentials/.dlt/config.toml b/docs/examples/archive/credentials/.dlt/config.toml similarity index 100% rename from docs/examples/credentials/.dlt/config.toml rename to docs/examples/archive/credentials/.dlt/config.toml diff --git a/docs/examples/credentials/__init__.py b/docs/examples/archive/credentials/__init__.py similarity index 100% rename from docs/examples/credentials/__init__.py rename to docs/examples/archive/credentials/__init__.py diff --git a/docs/examples/credentials/explicit.py b/docs/examples/archive/credentials/explicit.py similarity index 100% rename from docs/examples/credentials/explicit.py rename to docs/examples/archive/credentials/explicit.py diff --git a/docs/examples/data/channels.json b/docs/examples/archive/data/channels.json similarity index 100% rename from docs/examples/data/channels.json rename to docs/examples/archive/data/channels.json diff --git a/docs/examples/data/demo_example.json b/docs/examples/archive/data/demo_example.json similarity index 100% rename from docs/examples/data/demo_example.json 
rename to docs/examples/archive/data/demo_example.json diff --git a/docs/examples/data/messages.json b/docs/examples/archive/data/messages.json similarity index 100% rename from docs/examples/data/messages.json rename to docs/examples/archive/data/messages.json diff --git a/docs/examples/data/rasa_trackers/2888158124550630_tracker.jsonl b/docs/examples/archive/data/rasa_trackers/2888158124550630_tracker.jsonl similarity index 100% rename from docs/examples/data/rasa_trackers/2888158124550630_tracker.jsonl rename to docs/examples/archive/data/rasa_trackers/2888158124550630_tracker.jsonl diff --git a/docs/examples/data/rasa_trackers/8629c904-0c26-4f0b-927b-14d48db43c28_tracker.jsonl b/docs/examples/archive/data/rasa_trackers/8629c904-0c26-4f0b-927b-14d48db43c28_tracker.jsonl similarity index 100% rename from docs/examples/data/rasa_trackers/8629c904-0c26-4f0b-927b-14d48db43c28_tracker.jsonl rename to docs/examples/archive/data/rasa_trackers/8629c904-0c26-4f0b-927b-14d48db43c28_tracker.jsonl diff --git a/docs/examples/data/singer_taps/csv_catalog.json b/docs/examples/archive/data/singer_taps/csv_catalog.json similarity index 100% rename from docs/examples/data/singer_taps/csv_catalog.json rename to docs/examples/archive/data/singer_taps/csv_catalog.json diff --git a/docs/examples/data/singer_taps/model_annotations.csv b/docs/examples/archive/data/singer_taps/model_annotations.csv similarity index 100% rename from docs/examples/data/singer_taps/model_annotations.csv rename to docs/examples/archive/data/singer_taps/model_annotations.csv diff --git a/docs/examples/data/singer_taps/tap_google_sheet.jsonl b/docs/examples/archive/data/singer_taps/tap_google_sheet.jsonl similarity index 100% rename from docs/examples/data/singer_taps/tap_google_sheet.jsonl rename to docs/examples/archive/data/singer_taps/tap_google_sheet.jsonl diff --git a/docs/examples/data/singer_taps/tap_hubspot.jsonl b/docs/examples/archive/data/singer_taps/tap_hubspot.jsonl similarity index 100% rename from docs/examples/data/singer_taps/tap_hubspot.jsonl rename to docs/examples/archive/data/singer_taps/tap_hubspot.jsonl diff --git a/docs/examples/dbt_run_jaffle.py b/docs/examples/archive/dbt_run_jaffle.py similarity index 100% rename from docs/examples/dbt_run_jaffle.py rename to docs/examples/archive/dbt_run_jaffle.py diff --git a/docs/examples/discord_iterator.py b/docs/examples/archive/discord_iterator.py similarity index 100% rename from docs/examples/discord_iterator.py rename to docs/examples/archive/discord_iterator.py diff --git a/docs/examples/examples/schemas/dlt_quickstart.schema.yaml b/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml similarity index 100% rename from docs/examples/examples/schemas/dlt_quickstart.schema.yaml rename to docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml diff --git a/docs/examples/google_drive_csv.py b/docs/examples/archive/google_drive_csv.py similarity index 100% rename from docs/examples/google_drive_csv.py rename to docs/examples/archive/google_drive_csv.py diff --git a/docs/examples/google_sheets.py b/docs/examples/archive/google_sheets.py similarity index 100% rename from docs/examples/google_sheets.py rename to docs/examples/archive/google_sheets.py diff --git a/docs/examples/quickstart.py b/docs/examples/archive/quickstart.py similarity index 100% rename from docs/examples/quickstart.py rename to docs/examples/archive/quickstart.py diff --git a/docs/examples/rasa_example.py b/docs/examples/archive/rasa_example.py similarity index 88% rename from 
docs/examples/rasa_example.py rename to docs/examples/archive/rasa_example.py index 3dbd61c692..d438ce5e8b 100644 --- a/docs/examples/rasa_example.py +++ b/docs/examples/archive/rasa_example.py @@ -3,10 +3,10 @@ import dlt from dlt.destinations import bigquery, postgres -from docs.examples.sources.jsonl import jsonl_files -from docs.examples.sources.rasa import rasa +from .sources.jsonl import jsonl_files +from .sources.rasa import rasa -from docs.examples._helpers import pub_bigquery_credentials +from ._helpers import pub_bigquery_credentials # let's load to bigquery, here we provide the credentials for our public project # credentials = pub_bigquery_credentials diff --git a/docs/examples/read_table.py b/docs/examples/archive/read_table.py similarity index 100% rename from docs/examples/read_table.py rename to docs/examples/archive/read_table.py diff --git a/docs/examples/restore_pipeline.py b/docs/examples/archive/restore_pipeline.py similarity index 100% rename from docs/examples/restore_pipeline.py rename to docs/examples/archive/restore_pipeline.py diff --git a/docs/examples/schemas/__init__.py b/docs/examples/archive/schemas/__init__.py similarity index 100% rename from docs/examples/schemas/__init__.py rename to docs/examples/archive/schemas/__init__.py diff --git a/docs/examples/schemas/discord.schema.yml b/docs/examples/archive/schemas/discord.schema.yml similarity index 100% rename from docs/examples/schemas/discord.schema.yml rename to docs/examples/archive/schemas/discord.schema.yml diff --git a/docs/examples/schemas/dlt_quickstart.schema.yaml b/docs/examples/archive/schemas/dlt_quickstart.schema.yaml similarity index 100% rename from docs/examples/schemas/dlt_quickstart.schema.yaml rename to docs/examples/archive/schemas/dlt_quickstart.schema.yaml diff --git a/docs/examples/schemas/hubspot.schema.yaml b/docs/examples/archive/schemas/hubspot.schema.yaml similarity index 100% rename from docs/examples/schemas/hubspot.schema.yaml rename to docs/examples/archive/schemas/hubspot.schema.yaml diff --git a/docs/examples/schemas/inferred_demo.schema.yml b/docs/examples/archive/schemas/inferred_demo.schema.yml similarity index 100% rename from docs/examples/schemas/inferred_demo.schema.yml rename to docs/examples/archive/schemas/inferred_demo.schema.yml diff --git a/docs/examples/singer_tap_example.py b/docs/examples/archive/singer_tap_example.py similarity index 100% rename from docs/examples/singer_tap_example.py rename to docs/examples/archive/singer_tap_example.py diff --git a/docs/examples/singer_tap_jsonl_example.py b/docs/examples/archive/singer_tap_jsonl_example.py similarity index 100% rename from docs/examples/singer_tap_jsonl_example.py rename to docs/examples/archive/singer_tap_jsonl_example.py diff --git a/docs/examples/sources/__init__.py b/docs/examples/archive/sources/__init__.py similarity index 100% rename from docs/examples/sources/__init__.py rename to docs/examples/archive/sources/__init__.py diff --git a/docs/examples/sources/google_sheets.py b/docs/examples/archive/sources/google_sheets.py similarity index 100% rename from docs/examples/sources/google_sheets.py rename to docs/examples/archive/sources/google_sheets.py diff --git a/docs/examples/sources/jsonl.py b/docs/examples/archive/sources/jsonl.py similarity index 100% rename from docs/examples/sources/jsonl.py rename to docs/examples/archive/sources/jsonl.py diff --git a/docs/examples/sources/rasa/__init__.py b/docs/examples/archive/sources/rasa/__init__.py similarity index 100% rename from 
docs/examples/sources/rasa/__init__.py rename to docs/examples/archive/sources/rasa/__init__.py diff --git a/docs/examples/sources/rasa/rasa.py b/docs/examples/archive/sources/rasa/rasa.py similarity index 100% rename from docs/examples/sources/rasa/rasa.py rename to docs/examples/archive/sources/rasa/rasa.py diff --git a/docs/examples/sources/rasa/rasa.schema.yaml b/docs/examples/archive/sources/rasa/rasa.schema.yaml similarity index 100% rename from docs/examples/sources/rasa/rasa.schema.yaml rename to docs/examples/archive/sources/rasa/rasa.schema.yaml diff --git a/docs/examples/sources/singer_tap.py b/docs/examples/archive/sources/singer_tap.py similarity index 97% rename from docs/examples/sources/singer_tap.py rename to docs/examples/archive/sources/singer_tap.py index 65c9b76e0b..41db2c09f5 100644 --- a/docs/examples/sources/singer_tap.py +++ b/docs/examples/archive/sources/singer_tap.py @@ -95,6 +95,6 @@ def singer_messages() -> Iterator[TDataItem]: os.path.abspath(catalog_file_path), *state_params ) - yield from get_source_from_stream(pipe_iterator, state) # type: ignore + yield from get_source_from_stream(pipe_iterator, state) return singer_messages diff --git a/docs/examples/sources/sql_query.py b/docs/examples/archive/sources/sql_query.py similarity index 100% rename from docs/examples/sources/sql_query.py rename to docs/examples/archive/sources/sql_query.py diff --git a/docs/examples/sources/stdout.py b/docs/examples/archive/sources/stdout.py similarity index 100% rename from docs/examples/sources/stdout.py rename to docs/examples/archive/sources/stdout.py diff --git a/docs/examples/sync_schema_example.py b/docs/examples/archive/sync_schema_example.py similarity index 100% rename from docs/examples/sync_schema_example.py rename to docs/examples/archive/sync_schema_example.py diff --git a/docs/examples/incremental_loading/.dlt/config.toml b/docs/examples/incremental_loading/.dlt/config.toml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/incremental_loading/.dlt/secrets.toml b/docs/examples/incremental_loading/.dlt/secrets.toml new file mode 100644 index 0000000000..4dec919c06 --- /dev/null +++ b/docs/examples/incremental_loading/.dlt/secrets.toml @@ -0,0 +1,4 @@ +[sources.zendesk.credentials] +password = "" +subdomain = "" +email = "" \ No newline at end of file diff --git a/docs/examples/incremental_loading/__init__.py b/docs/examples/incremental_loading/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/zendesk.py new file mode 100644 index 0000000000..6370f29811 --- /dev/null +++ b/docs/examples/incremental_loading/zendesk.py @@ -0,0 +1,126 @@ +from typing import Iterator, Optional, Dict, Any, Tuple + +import dlt +from dlt.common import pendulum +from dlt.common.time import ensure_pendulum_datetime +from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime +from dlt.extract.source import DltResource +from dlt.sources.helpers.requests import client + + +@dlt.source(max_table_nesting=2) +def zendesk_support( + credentials: Dict[str, str]=dlt.secrets.value, + start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008 + end_date: Optional[TAnyDateTime] = None, +): + """ + Retrieves data from Zendesk Support for tickets events. 
+ + Args: + credentials: Zendesk credentials (default: dlt.secrets.value) + start_date: Start date for data extraction (default: 2000-01-01) + end_date: End date for data extraction (default: None). + If end time is not provided, the incremental loading will be + enabled, and after the initial run, only new data will be retrieved. + + Returns: + DltResource. + """ + # Convert start_date and end_date to Pendulum datetime objects + start_date_obj = ensure_pendulum_datetime(start_date) + end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None + + # Convert Pendulum datetime objects to Unix timestamps + start_date_ts = start_date_obj.int_timestamp + end_date_ts: Optional[int] = None + if end_date_obj: + end_date_ts = end_date_obj.int_timestamp + + # Extract credentials from secrets dictionary + auth = (credentials["email"], credentials["password"]) + subdomain = credentials["subdomain"] + url = f"https://{subdomain}.zendesk.com" + + # we use `append` write disposition, because objects in ticket_events endpoint are never updated + # so we do not need to merge + # we set primary_key so allow deduplication of events by the `incremental` below in the rare case + # when two events have the same timestamp + @dlt.resource(primary_key="id", write_disposition="append") + def ticket_events( + timestamp: dlt.sources.incremental[int] = dlt.sources.incremental( + "timestamp", + initial_value=start_date_ts, + end_value=end_date_ts, + allow_external_schedulers=True, + ), + ): + # URL For ticket events + # 'https://d3v-dlthub.zendesk.com/api/v2/incremental/ticket_events.json?start_time=946684800' + event_pages = get_pages( + url=url, + endpoint="/api/v2/incremental/ticket_events.json", + auth=auth, + data_point_name="ticket_events", + params={"start_time": timestamp.last_value}, + ) + for page in event_pages: + yield page + # stop loading when using end_value and end is reached. + # unfortunately, Zendesk API does not have the "end_time" parameter, so we stop iterating ourselves + if timestamp.end_out_of_range: + return + + return ticket_events + + +def get_pages( + url: str, + endpoint: str, + auth: Tuple[str, str], + data_point_name: str, + params: Optional[Dict[str, Any]] = None, +): + """ + Makes a request to a paginated endpoint and returns a generator of data items per page. + + Args: + url: The base URL. + endpoint: The url to the endpoint, e.g. /api/v2/calls + auth: Credentials for authentication. + data_point_name: The key which data items are nested under in the response object (e.g. calls) + params: Optional dict of query params to include in the request. + + Returns: + Generator of pages, each page is a list of dict data items. 
+ """ + # update the page size to enable cursor pagination + params = params or {} + params["per_page"] = 1000 + headers = None + + # make request and keep looping until there is no next page + get_url = f"{url}{endpoint}" + while get_url: + response = client.get( + get_url, headers=headers, auth=auth, params=params + ) + response.raise_for_status() + response_json = response.json() + result = response_json[data_point_name] + yield result + + get_url = None + # See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format + if not response_json["end_of_stream"]: + get_url = response_json["next_page"] + + +if __name__ == "__main__": + # create dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data" + ) + + load_info = pipeline.run(zendesk_support()) + print(load_info) \ No newline at end of file diff --git a/docs/examples/transformers/.dlt/config.toml b/docs/examples/transformers/.dlt/config.toml new file mode 100644 index 0000000000..a366f34edf --- /dev/null +++ b/docs/examples/transformers/.dlt/config.toml @@ -0,0 +1,16 @@ +[runtime] +log_level="WARNING" + +[extract] +# use 2 workers to extract sources in parallel +worker=2 +# allow 10 async items to be processed in parallel +max_parallel_items=10 + +[normalize] +# use 3 worker processes to process 3 files in parallel +workers=3 + +[load] +# have 50 concurrent load jobs +workers=50 \ No newline at end of file diff --git a/docs/examples/transformers/__init__.py b/docs/examples/transformers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/transformers/pokemon.py b/docs/examples/transformers/pokemon.py new file mode 100644 index 0000000000..ce8cc0142c --- /dev/null +++ b/docs/examples/transformers/pokemon.py @@ -0,0 +1,61 @@ +import dlt +from dlt.sources.helpers import requests + + +@dlt.source(max_table_nesting=2) +def source(pokemon_api_url: str): + """""" + + # note that we deselect `pokemon_list` - we do not want it to be loaded + @dlt.resource(write_disposition="replace", selected=False) + def pokemon_list(): + """Retrieve a first page of Pokemons and yield it. We do not retrieve all the pages in this example""" + yield requests.get(pokemon_api_url).json()["results"] + + # transformer that retrieves a list of objects in parallel + @dlt.transformer + def pokemon(pokemons): + """Yields details for a list of `pokemons`""" + + # @dlt.defer marks a function to be executed in parallel + # in a thread pool + @dlt.defer + def _get_pokemon(_pokemon): + return requests.get(_pokemon["url"]).json() + + # call and yield the function result normally, the @dlt.defer takes care of parallelism + for _pokemon in pokemons: + yield _get_pokemon(_pokemon) + + # a special case where just one item is retrieved in transformer + # a whole transformer may be marked for parallel execution + @dlt.transformer + @dlt.defer + def species(pokemon_details): + """Yields species details for a pokemon""" + species_data = requests.get(pokemon_details["species"]["url"]).json() + # link back to pokemon so we have a relation in loaded data + species_data["pokemon_id"] = pokemon_details["id"] + # just return the results, if you yield, + # generator will be evaluated in main thread + return species_data + + # create two simple pipelines with | operator + # 1. send list of pokemons into `pokemon` transformer to get pokemon details + # 2. 
send pokemon details into `species` transformer to get species details + # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once + + return ( + pokemon_list | pokemon, + pokemon_list | pokemon | species + ) + +if __name__ == "__main__": + # build duck db pipeline + pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" + ) + + # the pokemon_list resource does not need to be loaded + load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) + print(load_info) \ No newline at end of file diff --git a/docs/examples/transformers_and_parallelism/__init__.py b/docs/examples/transformers_and_parallelism/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/technical/create_pipeline.md b/docs/technical/create_pipeline.md index 59e5742680..f6603d08b8 100644 --- a/docs/technical/create_pipeline.md +++ b/docs/technical/create_pipeline.md @@ -420,12 +420,12 @@ The Python function that yields is not a function but magical object that `dlt` ```python def lazy_function(endpoint_name): - # INIT - this will be executed only once when DLT wants! + # INIT - this will be executed only once when dlt wants! get_configuration() from_item = dlt.current.state.get("last_item", 0) l = get_item_list_from_api(api_key, endpoint_name) - # ITERATOR - this will be executed many times also when DLT wants more data! + # ITERATOR - this will be executed many times also when dlt wants more data! for item in l: yield requests.get(url, api_key, "%s?id=%s" % (endpoint_name, item["id"])).json() # CLEANUP diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 275e36736e..74771ba74f 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -6,7 +6,7 @@ keywords: [aws, athena, glue catalog] # AWS Athena / Glue Catalog -The athena destination stores data as parquet files in s3 buckets and creates [external tables in aws athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with athena sql commands which will then scan the whole folder of parquet files and return the results. This destination works very similar to other sql based destinations, with the exception of the merge write disposition not being supported at this time. DLT metadata will be stored in the same bucket as the parquet files, but as iceberg tables. +The athena destination stores data as parquet files in s3 buckets and creates [external tables in aws athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with athena sql commands which will then scan the whole folder of parquet files and return the results. This destination works very similar to other sql based destinations, with the exception of the merge write disposition not being supported at this time. dlt metadata will be stored in the same bucket as the parquet files, but as iceberg tables. ## Setup Guide ### 1. 
Initialize the dlt project diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 39720670e2..709686e220 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -118,7 +118,7 @@ BigQuery supports the following [column hints](https://dlthub.com/docs/general-u ## Staging Support -BigQuery supports gcs as a file staging destination. DLT will upload files in the parquet format to gcs and ask BigQuery to copy their data directly into the db. Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your gcs bucket with the bucket_url and credentials. If you use the same service account for gcs and your redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. +BigQuery supports gcs as a file staging destination. dlt will upload files in the parquet format to gcs and ask BigQuery to copy their data directly into the db. Please refer to the [Google Storage filesystem documentation](./filesystem.md#google-storage) to learn how to set up your gcs bucket with the bucket_url and credentials. If you use the same service account for gcs and your redshift deployment, you do not need to provide additional authentication for BigQuery to be able to read from your bucket. ```toml ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index 7b0f5daaea..ff29407d48 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -90,7 +90,7 @@ Amazon Redshift supports the following column hints: ## Staging support -Redshift supports s3 as a file staging destination. DLT will upload files in the parquet format to s3 and ask redshift to copy their data directly into the db. Please refere to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the aws credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. +Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask redshift to copy their data directly into the db. Please refere to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the aws credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. 
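The BigQuery and Redshift staging notes above describe the same flow: `dlt` first writes parquet (or jsonl) files to a bucket and then asks the warehouse to copy them in. A minimal sketch of that flow from the pipeline side, assuming the filesystem staging bucket and the destination credentials are already configured in `.dlt/secrets.toml` (the pipeline and dataset names are illustrative):

```py
import dlt

# toy rows standing in for a real source
data = [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]

pipeline = dlt.pipeline(
    pipeline_name="staging_demo",
    destination="redshift",      # or "bigquery" / "snowflake" as described above
    staging="filesystem",        # stage load files in the configured bucket first
    dataset_name="staging_demo_data",
)

# parquet is the default staging format; switch to jsonl with loader_file_format
load_info = pipeline.run(data, table_name="users", loader_file_format="jsonl")
print(load_info)
```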
### Authentication iam Role diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index 7df4275898..47d3aabf26 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -126,7 +126,7 @@ Names of tables and columns in [schemas](../../general-usage/schema.md) are kept ## Staging support -Snowflake supports s3 and gcs as a file staging destinations. DLT will upload files in the parquet format to the bucket provider and will ask snowflake to copy their data directly into the db. +Snowflake supports s3 and gcs as a file staging destinations. dlt will upload files in the parquet format to the bucket provider and will ask snowflake to copy their data directly into the db. Alternatively to parquet files, you can also specify jsonl as the staging file format. For this set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index 7cf6bfd30d..d2ed03a2a2 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -7,7 +7,7 @@ keywords: [staging, destination] The goal of staging is to bring the data closer to the database engine so the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two staging areas: -1. A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental-loading) to deduplicate and merge data with the destination. Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. As a user you typically never see and directly interact with it. +1. A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental-loading) to deduplicate and merge data with the destination. Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. As a user you typically never see and directly interact with it. 2. A **staging storage** which is typically a s3/gcp bucket where [loader files](file-formats/) are copied before they are loaded by the destination. ## Staging storage diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/github.md b/docs/website/docs/dlt-ecosystem/verified-sources/github.md index 8ffe3af68c..f68d872493 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/github.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/github.md @@ -233,7 +233,7 @@ def repo_events( dlt.sources.incremental. If no value is given, the default "initial_value" is used. The function "last_value_func" determines the most recent 'created_at' value. -Read more about [incremental loading](../../general-usage/incremental-loading#incremental-loading-with-last-value). +Read more about [incremental loading](../../general-usage/incremental-loading#incremental-loading-with-last-value).
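The `repo_events` hunk above refers to `dlt.sources.incremental`, which remembers the newest cursor value (here `created_at`) in the pipeline state between runs. A self-contained sketch of that pattern follows; the resource, field names, and sample rows are illustrative and not the actual GitHub verified source.

```py
import dlt

# stand-in rows; in a real source these would come from an API, newest last
EVENTS = [
    {"id": 1, "created_at": "2023-01-01T00:00:00Z"},
    {"id": 2, "created_at": "2023-02-01T00:00:00Z"},
]

@dlt.resource(primary_key="id", write_disposition="append")
def repo_events(
    created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
        "created_at", initial_value="2022-01-01T00:00:00Z"
    ),
):
    # created_at.last_value is restored from pipeline state on every run;
    # rows equal to it are deduplicated by dlt using the primary_key
    for event in EVENTS:
        if event["created_at"] >= created_at.last_value:
            yield event

if __name__ == "__main__":
    pipeline = dlt.pipeline(
        pipeline_name="events_demo", destination="duckdb", dataset_name="events"
    )
    print(pipeline.run(repo_events()))
```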
## Customization diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 15b79850f2..aaf823b702 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -380,7 +380,7 @@ dlt.resource( `merge_key`: Parameter is used to specify the column used to identify records for merging. In this case,"spreadsheet_id", means that the records will be merged based on the values in this column. -[Read more](https://dlthub.com/docs/general-usage/incremental-loading#merge-incremental-loading). +[Read more](https://dlthub.com/docs/general-usage/incremental-loading#merge-incremental_loading). ### Create your own pipeline diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 836b491c50..fa432f7e52 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -231,4 +231,4 @@ To create your data pipeline using single loading and [incremental data loading 6. It's important to keep the pipeline name and destination dataset name unchanged. The pipeline name is crucial for retrieving the [state](https://dlthub.com/docs/general-usage/state) of the last pipeline run, which includes the end date needed for loading data incrementally. Modifying these names can lead to [“full_refresh”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh) which will disrupt the tracking of relevant metadata(state) for [incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading). -That's it! Enjoy running your Stripe DLT pipeline! +That's it! Enjoy running your Stripe dlt pipeline! diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index dd25fd51f7..419d2ec924 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -226,4 +226,4 @@ To create your data pipeline using single loading and [incremental data loading] > Note: In the pipeline run mentioned above, we are loading two [dependent endpoints](workable.md#dependent-endpoints). “jobs_activities” and “jobs_application_form”. To ensure that these endpoints are loaded properly, the **`load_details`** parameter is set as **`True`**. -That’s it! Enjoy running your Workable DLT pipeline! +That’s it! Enjoy running your Workable dlt pipeline! diff --git a/docs/website/docs/examples/_examples-header.md b/docs/website/docs/examples/_examples-header.md new file mode 100644 index 0000000000..9840b00d29 --- /dev/null +++ b/docs/website/docs/examples/_examples-header.md @@ -0,0 +1,21 @@ +import Admonition from "@theme/Admonition"; +import CodeBlock from '@theme/CodeBlock'; + + + The source code for this example can be found in our repository at: {"https://github.com/dlt-hub/dlt/tree/devel/docs/examples/" + props.slug}. + + +## TLDR +
{props.intro}
+ +## Setup: Running this example on your machine + +{`# clone the dlt repository +git clone git@github.com:dlt-hub/dlt.git +# go to example directory +cd ./dlt/docs/examples/${props.slug} +# install dlt with duckdb +pip install "dlt[duckdb]" +# run the example script +python run.py`} + diff --git a/docs/website/docs/examples/incremental_loading/__init__.py b/docs/website/docs/examples/incremental_loading/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/incremental_loading/code/.dlt/config.toml b/docs/website/docs/examples/incremental_loading/code/.dlt/config.toml new file mode 100644 index 0000000000..be627e6c11 --- /dev/null +++ b/docs/website/docs/examples/incremental_loading/code/.dlt/config.toml @@ -0,0 +1,2 @@ +# @@@DLT_SNIPPET_START example +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/incremental_loading/code/.dlt/secrets.toml b/docs/website/docs/examples/incremental_loading/code/.dlt/secrets.toml new file mode 100644 index 0000000000..caf8d523c4 --- /dev/null +++ b/docs/website/docs/examples/incremental_loading/code/.dlt/secrets.toml @@ -0,0 +1,6 @@ +# @@@DLT_SNIPPET_START example +[sources.zendesk.credentials] +password = "" +subdomain = "" +email = "" +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/incremental_loading/code/__init__.py b/docs/website/docs/examples/incremental_loading/code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py new file mode 100644 index 0000000000..2d674407bc --- /dev/null +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -0,0 +1,148 @@ +from tests.utils import skipifgithubfork + +# because the example below uses credentials and it is copied to the module zendesk.py +# we force the same config section name +__source_name__ = "zendesk" + + +@skipifgithubfork +def incremental_snippet() -> None: + + # @@@DLT_SNIPPET_START example + # @@@DLT_SNIPPET_START markdown_source + from typing import Iterator, Optional, Dict, Any, Tuple + + import dlt + from dlt.common import pendulum + from dlt.common.time import ensure_pendulum_datetime + from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime + from dlt.extract.source import DltResource + from dlt.sources.helpers.requests import client + + + @dlt.source(max_table_nesting=2) + def zendesk_support( + credentials: Dict[str, str]=dlt.secrets.value, + start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008 + end_date: Optional[TAnyDateTime] = None, + ): + """ + Retrieves data from Zendesk Support for tickets events. + + Args: + credentials: Zendesk credentials (default: dlt.secrets.value) + start_date: Start date for data extraction (default: 2000-01-01) + end_date: End date for data extraction (default: None). + If end time is not provided, the incremental loading will be + enabled, and after the initial run, only new data will be retrieved. + + Returns: + DltResource. 
+ """ + # Convert start_date and end_date to Pendulum datetime objects + start_date_obj = ensure_pendulum_datetime(start_date) + end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None + + # Convert Pendulum datetime objects to Unix timestamps + start_date_ts = start_date_obj.int_timestamp + end_date_ts: Optional[int] = None + if end_date_obj: + end_date_ts = end_date_obj.int_timestamp + + # Extract credentials from secrets dictionary + auth = (credentials["email"], credentials["password"]) + subdomain = credentials["subdomain"] + url = f"https://{subdomain}.zendesk.com" + + # we use `append` write disposition, because objects in ticket_events endpoint are never updated + # so we do not need to merge + # we set primary_key so allow deduplication of events by the `incremental` below in the rare case + # when two events have the same timestamp + @dlt.resource(primary_key="id", write_disposition="append") + def ticket_events( + timestamp: dlt.sources.incremental[int] = dlt.sources.incremental( + "timestamp", + initial_value=start_date_ts, + end_value=end_date_ts, + allow_external_schedulers=True, + ), + ): + # URL For ticket events + # 'https://d3v-dlthub.zendesk.com/api/v2/incremental/ticket_events.json?start_time=946684800' + event_pages = get_pages( + url=url, + endpoint="/api/v2/incremental/ticket_events.json", + auth=auth, + data_point_name="ticket_events", + params={"start_time": timestamp.last_value}, + ) + for page in event_pages: + yield page + # stop loading when using end_value and end is reached. + # unfortunately, Zendesk API does not have the "end_time" parameter, so we stop iterating ourselves + if timestamp.end_out_of_range: + return + + return ticket_events + + # @@@DLT_SNIPPET_END markdown_source + + def get_pages( + url: str, + endpoint: str, + auth: Tuple[str, str], + data_point_name: str, + params: Optional[Dict[str, Any]] = None, + ): + """ + Makes a request to a paginated endpoint and returns a generator of data items per page. + + Args: + url: The base URL. + endpoint: The url to the endpoint, e.g. /api/v2/calls + auth: Credentials for authentication. + data_point_name: The key which data items are nested under in the response object (e.g. calls) + params: Optional dict of query params to include in the request. + + Returns: + Generator of pages, each page is a list of dict data items. 
+ """ + # update the page size to enable cursor pagination + params = params or {} + params["per_page"] = 1000 + headers = None + + # make request and keep looping until there is no next page + get_url = f"{url}{endpoint}" + while get_url: + response = client.get( + get_url, headers=headers, auth=auth, params=params + ) + response.raise_for_status() + response_json = response.json() + result = response_json[data_point_name] + yield result + + get_url = None + # See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format + if not response_json["end_of_stream"]: + get_url = response_json["next_page"] + + + # @@@DLT_SNIPPET_START markdown_pipeline + __name__ = "__main__" # @@@DLT_REMOVE + if __name__ == "__main__": + # create dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data" + ) + + load_info = pipeline.run(zendesk_support()) + print(load_info) + # @@@DLT_SNIPPET_END markdown_pipeline + # @@@DLT_SNIPPET_END example + + # check that stuff was loaded + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["ticket_events"] == 24 + diff --git a/docs/website/docs/examples/incremental_loading/index.md b/docs/website/docs/examples/incremental_loading/index.md new file mode 100644 index 0000000000..826cb60d6c --- /dev/null +++ b/docs/website/docs/examples/incremental_loading/index.md @@ -0,0 +1,126 @@ +--- +title: Load Zendesk tickets incrementally +description: Learn how do incremental loading in consecutive runs +keywords: [incremental loading, example] +--- + +import Header from '../_examples-header.md'; + +
+ +## Incremental loading with the Zendesk API + +In this example, you'll find a Python script that interacts with the Zendesk Support API to extract ticket events data. + +We'll learn: + +- How to pass [credentials](../../general-usage/credentials) as dict and how to type the `@dlt.source` function arguments. +- How to set [the nesting level](../../general-usage/source#reduce-the-nesting-level-of-generated-tables). +- How to enable [incremental loading](../../general-usage/incremental-loading) for efficient data extraction. +- How to specify [the start and end dates](../../general-usage/incremental-loading#using-dltsourcesincremental-for-backfill) for the data loading and how to [opt-in to Airflow scheduler](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) by setting `allow_external_schedulers` to `True`. +- How to work with timestamps, specifically converting them to Unix timestamps for incremental data extraction. +- How to use the `start_time` parameter in API requests to retrieve data starting from a specific timestamp. + + +### Loading code + + +```py +from typing import Iterator, Optional, Dict, Any, Tuple + +import dlt +from dlt.common import pendulum +from dlt.common.time import ensure_pendulum_datetime +from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime +from dlt.extract.source import DltResource +from dlt.sources.helpers.requests import client + + +@dlt.source(max_table_nesting=2) +def zendesk_support( + credentials: Dict[str, str]=dlt.secrets.value, + start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008 + end_date: Optional[TAnyDateTime] = None, +): + """ + Retrieves data from Zendesk Support for tickets events. + + Args: + credentials: Zendesk credentials (default: dlt.secrets.value) + start_date: Start date for data extraction (default: 2000-01-01) + end_date: End date for data extraction (default: None). + If end time is not provided, the incremental loading will be + enabled, and after the initial run, only new data will be retrieved. + + Returns: + DltResource. 
+ """ + # Convert start_date and end_date to Pendulum datetime objects + start_date_obj = ensure_pendulum_datetime(start_date) + end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None + + # Convert Pendulum datetime objects to Unix timestamps + start_date_ts = start_date_obj.int_timestamp + end_date_ts: Optional[int] = None + if end_date_obj: + end_date_ts = end_date_obj.int_timestamp + + # Extract credentials from secrets dictionary + auth = (credentials["email"], credentials["password"]) + subdomain = credentials["subdomain"] + url = f"https://{subdomain}.zendesk.com" + + # we use `append` write disposition, because objects in ticket_events endpoint are never updated + # so we do not need to merge + # we set primary_key so allow deduplication of events by the `incremental` below in the rare case + # when two events have the same timestamp + @dlt.resource(primary_key="id", write_disposition="append") + def ticket_events( + timestamp: dlt.sources.incremental[int] = dlt.sources.incremental( + "timestamp", + initial_value=start_date_ts, + end_value=end_date_ts, + allow_external_schedulers=True, + ), + ): + # URL For ticket events + # 'https://d3v-dlthub.zendesk.com/api/v2/incremental/ticket_events.json?start_time=946684800' + event_pages = get_pages( + url=url, + endpoint="/api/v2/incremental/ticket_events.json", + auth=auth, + data_point_name="ticket_events", + params={"start_time": timestamp.last_value}, + ) + for page in event_pages: + yield page + # stop loading when using end_value and end is reached. + # unfortunately, Zendesk API does not have the "end_time" parameter, so we stop iterating ourselves + if timestamp.end_out_of_range: + return + + return ticket_events +``` + + +Run the pipeline: + + + +```py +if __name__ == "__main__": + # create dlt pipeline + pipeline = dlt.pipeline( + pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data" + ) + + load_info = pipeline.run(zendesk_support()) + print(load_info) +``` + + diff --git a/docs/website/docs/examples/transformers/__init__.py b/docs/website/docs/examples/transformers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/transformers/code/.dlt/config.toml b/docs/website/docs/examples/transformers/code/.dlt/config.toml new file mode 100644 index 0000000000..07b2c84ad4 --- /dev/null +++ b/docs/website/docs/examples/transformers/code/.dlt/config.toml @@ -0,0 +1,18 @@ +# @@@DLT_SNIPPET_START example +[runtime] +log_level="WARNING" + +[extract] +# use 2 workers to extract sources in parallel +worker=2 +# allow 10 async items to be processed in parallel +max_parallel_items=10 + +[normalize] +# use 3 worker processes to process 3 files in parallel +workers=3 + +[load] +# have 50 concurrent load jobs +workers=50 +# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/transformers/code/__init__.py b/docs/website/docs/examples/transformers/code/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/website/docs/examples/transformers/code/pokemon-snippets.py b/docs/website/docs/examples/transformers/code/pokemon-snippets.py new file mode 100644 index 0000000000..726bcf7e2e --- /dev/null +++ b/docs/website/docs/examples/transformers/code/pokemon-snippets.py @@ -0,0 +1,73 @@ + +def transformers_snippet() -> None: + + # @@@DLT_SNIPPET_START example + import dlt + from dlt.sources.helpers import requests + + + @dlt.source(max_table_nesting=2) + def source(pokemon_api_url: str): + """""" + + # note that we deselect 
`pokemon_list` - we do not want it to be loaded + @dlt.resource(write_disposition="replace", selected=False) + def pokemon_list(): + """Retrieve a first page of Pokemons and yield it. We do not retrieve all the pages in this example""" + yield requests.get(pokemon_api_url).json()["results"] + + # transformer that retrieves a list of objects in parallel + @dlt.transformer + def pokemon(pokemons): + """Yields details for a list of `pokemons`""" + + # @dlt.defer marks a function to be executed in parallel + # in a thread pool + @dlt.defer + def _get_pokemon(_pokemon): + return requests.get(_pokemon["url"]).json() + + # call and yield the function result normally, the @dlt.defer takes care of parallelism + for _pokemon in pokemons: + yield _get_pokemon(_pokemon) + + # a special case where just one item is retrieved in transformer + # a whole transformer may be marked for parallel execution + @dlt.transformer + @dlt.defer + def species(pokemon_details): + """Yields species details for a pokemon""" + species_data = requests.get(pokemon_details["species"]["url"]).json() + # link back to pokemon so we have a relation in loaded data + species_data["pokemon_id"] = pokemon_details["id"] + # just return the results, if you yield, + # generator will be evaluated in main thread + return species_data + + # create two simple pipelines with | operator + # 1. send list of pokemons into `pokemon` transformer to get pokemon details + # 2. send pokemon details into `species` transformer to get species details + # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once + + return ( + pokemon_list | pokemon, + pokemon_list | pokemon | species + ) + + __name__ = "__main__" # @@@DLT_REMOVE + if __name__ == "__main__": + # build duck db pipeline + pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" + ) + + # the pokemon_list resource does not need to be loaded + load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) + print(load_info) + # @@@DLT_SNIPPET_END example + + # test assertions + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["pokemon"] == 20 + assert row_counts["species"] == 20 + assert "pokemon_list" not in row_counts diff --git a/docs/website/docs/examples/transformers/index.md b/docs/website/docs/examples/transformers/index.md new file mode 100644 index 0000000000..9ed767e4ca --- /dev/null +++ b/docs/website/docs/examples/transformers/index.md @@ -0,0 +1,116 @@ +--- +title: Pokemon details in parallel using transformers +description: Learn how to use dlt transformers and how to speed up your loads with parallelism +keywords: [transformers, parallelism, example] +--- + +import Header from '../_examples-header.md'; + +
+ + +## Using transformers with the pokemon api + +For this example we will be loading pokemon data from the [Poke Api](https://pokeapi.co/) with the help of transformers to load +pokemon details in parallel + +We'll learn how to: +- create 2 [transformers](../../general-usage/resource.md#feeding-data-from-one-resource-into-another) and connect them to a resource with the pipe operator `|` +- [load these transformers in parallel](../../reference/performance.md#parallelism) using the `@dlt.defer` decorator +- [configure parallelism](../../reference/performance.md#parallel-pipeline-config-example) in the `config.toml` file +- deselect the main resource so it will not be loaded into the database. +- importing and using a pre-configured `requests` library with automatic retries (`from dlt.sources.helpers import requests`) + +### Loading code + + +```py +import dlt +from dlt.sources.helpers import requests + + +@dlt.source(max_table_nesting=2) +def source(pokemon_api_url: str): + """""" + + # note that we deselect `pokemon_list` - we do not want it to be loaded + @dlt.resource(write_disposition="replace", selected=False) + def pokemon_list(): + """Retrieve a first page of Pokemons and yield it. We do not retrieve all the pages in this example""" + yield requests.get(pokemon_api_url).json()["results"] + + # transformer that retrieves a list of objects in parallel + @dlt.transformer + def pokemon(pokemons): + """Yields details for a list of `pokemons`""" + + # @dlt.defer marks a function to be executed in parallel + # in a thread pool + @dlt.defer + def _get_pokemon(_pokemon): + return requests.get(_pokemon["url"]).json() + + # call and yield the function result normally, the @dlt.defer takes care of parallelism + for _pokemon in pokemons: + yield _get_pokemon(_pokemon) + + # a special case where just one item is retrieved in transformer + # a whole transformer may be marked for parallel execution + @dlt.transformer + @dlt.defer + def species(pokemon_details): + """Yields species details for a pokemon""" + species_data = requests.get(pokemon_details["species"]["url"]).json() + # link back to pokemon so we have a relation in loaded data + species_data["pokemon_id"] = pokemon_details["id"] + # just return the results, if you yield, + # generator will be evaluated in main thread + return species_data + + # create two simple pipelines with | operator + # 1. send list of pokemons into `pokemon` transformer to get pokemon details + # 2. 
send pokemon details into `species` transformer to get species details + # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once + + return ( + pokemon_list | pokemon, + pokemon_list | pokemon | species + ) + +if __name__ == "__main__": + # build duck db pipeline + pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" + ) + + # the pokemon_list resource does not need to be loaded + load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) + print(load_info) +``` + + + +### config.toml with examples how to configure parallelism + +```toml +[runtime] +log_level="WARNING" + +[extract] +# use 2 workers to extract sources in parallel +worker=2 +# allow 10 async items to be processed in parallel +max_parallel_items=10 + +[normalize] +# use 3 worker processes to process 3 files in parallel +workers=3 + +[load] +# have 50 concurrent load jobs +workers=50 +``` + diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 7491699678..305eb29da7 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -279,9 +279,9 @@ assert list(r) == list(range(10)) You can change the schema of a resource, be it standalone or as a part of a source. Look for method named `apply_hints` which takes the same arguments as resource decorator. Obviously you should call this method before data is extracted from the resource. Example below converts an `append` resource -loading the `users` table into [merge](incremental-loading.md#merge-incremental-loading) resource +loading the `users` table into [merge](incremental-loading.md#merge-incremental_loading) resource that will keep just one updated record per `user_id`. It also adds -["last value" incremental loading](incremental-loading.md#incremental-loading-with-last-value) on +["last value" incremental loading](incremental-loading.md#incremental_loading-with-last-value) on `created_at` column to prevent requesting again the already loaded records: ```python diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 4fd84fb288..e27a87e803 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -105,7 +105,7 @@ A column schema contains following basic hints: 1. `nullable` tells if column is nullable or not. 1. `primary_key` marks a column as a part of primary key. 1. `merge_key` marks a column as a part of merge key used by. - [incremental load](./incremental-loading.md#merge-incremental-loading) + [incremental load](./incremental-loading.md#merge-incremental_loading) 1. `foreign_key` marks a column as a part of foreign key. 1. `root_key` marks a column as a part of root key which is a type of foreign key always referring to the root table. diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 8b039d9f3d..23625db27c 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -88,7 +88,7 @@ about the pipeline, pipeline run (that the state belongs to) and state blob. ## When to use pipeline state - `dlt` uses the state internally to implement - [last value incremental loading](incremental-loading.md#incremental-loading-with-last-value). This + [last value incremental loading](incremental-loading.md#incremental_loading-with-last-value). 
This use case should cover around 90% of your needs to use the pipeline state. - [Store a list of already requested entities](incremental-loading.md#advanced-state-usage-storing-a-list-of-processed-entities) if the list is not much bigger than 100k elements. diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md index c027cf134d..9f0c67cc43 100644 --- a/docs/website/docs/getting-started.md +++ b/docs/website/docs/getting-started.md @@ -372,7 +372,7 @@ dlt pipeline -v github_issues_incremental info Learn more: - Declare your [resources](general-usage/resource) and group them in [sources](general-usage/source) using Python decorators. -- [Set up "last value" incremental loading.](general-usage/incremental-loading#incremental-loading-with-last-value) +- [Set up "last value" incremental loading.](general-usage/incremental-loading#incremental_loading-with-last-value) - [Inspect pipeline after loading.](walkthroughs/run-a-pipeline#4-inspect-a-load-process) - [`dlt` command line interface.](reference/command-line-interface) @@ -432,7 +432,7 @@ and `updated_at.last_value` to tell GitHub to return issues updated only **after Learn more: -- [You can do way more with merge.](general-usage/incremental-loading#merge-incremental-loading) +- [You can do way more with merge.](general-usage/incremental-loading#merge-incremental_loading) ### Dispatch stream of events to tables by event type @@ -635,7 +635,7 @@ If you want to take full advantage of the `dlt` library, then we strongly sugges - [Create your resources dynamically from data](general-usage/source#create-resources-dynamically). - [Append, replace and merge your tables](general-usage/incremental-loading). - [Transform your data before loading](general-usage/resource#customize-resources) and see some [examples of customizations like column renames and anonymization](general-usage/customising-pipelines/renaming_columns). -- [Set up "last value" incremental loading](general-usage/incremental-loading#incremental-loading-with-last-value). +- [Set up "last value" incremental loading](general-usage/incremental-loading#incremental_loading-with-last-value). - [Set primary and merge keys, define the columns nullability and data types](general-usage/resource#define-schema). - [Pass config and credentials into your sources and resources](general-usage/credentials). - [Use built-in requests client](reference/performance#using-the-built-in-requests-client). 
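The resource.md, schema.md, state.md, and getting-started.md hunks above all point readers at the `merge` write disposition and "last value" incremental loading. As a rough, non-authoritative sketch of what those links describe (hypothetical `users` data and column names that are not part of this patch, and assuming `apply_hints` accepts an `incremental` argument as the linked resource page's example does):

```py
import dlt

# a hypothetical `users` resource that starts out with `append` semantics
@dlt.resource(name="users", write_disposition="append")
def users():
    yield [
        {"user_id": 1, "name": "alice", "created_at": "2023-10-01T00:00:00Z"},
        {"user_id": 2, "name": "bob", "created_at": "2023-10-02T00:00:00Z"},
    ]

# switch the same resource to `merge` keyed on `user_id` and add
# "last value" incremental loading on the `created_at` column
users.apply_hints(
    write_disposition="merge",
    primary_key="user_id",
    incremental=dlt.sources.incremental("created_at"),
)

if __name__ == "__main__":
    pipeline = dlt.pipeline(
        pipeline_name="users_demo", destination="duckdb", dataset_name="users_data"
    )
    print(pipeline.run(users))
```

On later runs only records with a newer `created_at` are processed, and rows are deduplicated on `user_id` instead of being appended again.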
diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 7ae5121489..e31755f206 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -326,7 +326,6 @@ if __name__ == "__main__" or "PYTEST_CURRENT_TEST" in os.environ: ``` - ### Source decomposition for serial and parallel resource execution You can decompose a pipeline into strongly connected components with diff --git a/docs/website/docusaurus.config.js b/docs/website/docusaurus.config.js index 99edf1d75d..31f91d6feb 100644 --- a/docs/website/docusaurus.config.js +++ b/docs/website/docusaurus.config.js @@ -84,12 +84,6 @@ const config = { label: 'Docs', }, { to: 'blog', label: 'Blog', position: 'left' }, - { - href:'https://colab.research.google.com/drive/1NfSB1DpwbbHX9_t5vlalBTf13utwpMGx?usp=sharing', - label: 'Colab demo', - position:'right', - className: 'colab-demo', - }, { href: 'https://join.slack.com/t/dlthub-community/shared_invite/zt-1slox199h-HAE7EQoXmstkP_bTqal65g', label: '.', diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index a1751ae5f9..80e0fbd19d 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -220,6 +220,21 @@ const sidebars = { 'running-in-production/tracing', ], }, + { + type: 'category', + label: 'Code Examples', + link: { + type: 'generated-index', + title: 'Code Examples', + description: 'A list of comprehensive code examples that teach you how to solve real world problem.', + slug: 'examples', + keywords: ['examples'], + }, + items: [ + 'examples/transformers/index', + 'examples/incremental_loading/index', + ], + }, { type: 'category', label: 'Reference', diff --git a/docs/website/src/css/custom.css b/docs/website/src/css/custom.css index 9351897c4e..b678471fc2 100644 --- a/docs/website/src/css/custom.css +++ b/docs/website/src/css/custom.css @@ -273,38 +273,24 @@ html[data-theme='dark'] .footer--dark { text-decoration: none; } -.colab-demo { +.examples-link { font-family: 'Karla', sans-serif !important; background: var(--ifm-color-primary); border-radius: 8px; color: #fff; margin-right: 2px; margin-left: 10px; - padding: 6px 10px 6px 17px; + padding: 6px 10px 6px 10px; display: flex; } -.colab-demo::before { - content: ''; - width: 25px; - height: 25px; - display: flex; - background: url("../../static/img/CO-White.svg") no-repeat; - position: relative; - left: -5px; - top: 0px; -} - -html[data-theme='dark'] .colab-demo:hover { +html[data-theme='dark'] .examples-link:hover { color: #191937; - } - -html[data-theme='dark'] .colab-demo:hover::before { - background: url("../../static/img/CO-Blue.svg") no-repeat; } + .theme-admonition.theme-admonition-tip.alert svg path{ fill: #191937; } @@ -318,7 +304,7 @@ html[data-theme='dark'] .colab-demo:hover::before { padding: 0 !important; } */ -.colab-demo:hover { +.examples-link:hover { background: var(--ifm-color-primary-light); color: #fff; } @@ -468,7 +454,7 @@ html[data-theme='dark'] .slack-navbar::before { .menu__link{padding-bottom: 15px !important;} -.colab-demo>svg { +.examples-link>svg { display: none; } @@ -652,23 +638,43 @@ html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(7)>div>[ background-image: url(../../static/img/RunningInProduction-Active-1.svg); } -/* Reference */ +/* Code Examples */ .theme-doc-sidebar-menu.menu__list>li:nth-child(8)>div>a::before { - background-image: url(../../static/img/Reference-Inactive.svg); + background-image: url(../../static/img/Howdltworks-Inactive.svg); } 
.theme-doc-sidebar-menu.menu__list>li:hover:nth-child(8)>div>a::before, -.theme-doc-sidebar-menu.menu__list>li:nth-child(9)>div>[aria-expanded="true"]::before { - background-image: url(../../static/img/Reference-Active.svg); +.theme-doc-sidebar-menu.menu__list>li:nth-child(8)>div>[aria-expanded="true"]::before { + background-image: url(../../static/img/Howdltworks-Active.svg); } html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(8)>div>a::before { - background-image: url(../../static/img/Reference-Inactive-1.svg); + background-image: url(../../static/img/Howdltworks-Inactive-1.svg); } html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:hover:nth-child(8)>div>a::before, html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(8)>div>[aria-expanded="true"]::before { + background-image: url(../../static/img/Howdltworks-Active-1.svg); +} + +/* Reference */ + +.theme-doc-sidebar-menu.menu__list>li:nth-child(9)>div>a::before { + background-image: url(../../static/img/Reference-Inactive.svg); +} + +.theme-doc-sidebar-menu.menu__list>li:hover:nth-child(9)>div>a::before, +.theme-doc-sidebar-menu.menu__list>li:nth-child(9)>div>[aria-expanded="true"]::before { + background-image: url(../../static/img/Reference-Active.svg); +} + +html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(9)>div>a::before { + background-image: url(../../static/img/Reference-Inactive-1.svg); +} + +html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:hover:nth-child(9)>div>a::before, +html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(9)>div>[aria-expanded="true"]::before { background-image: url(../../static/img/Reference-Active-1.svg); } @@ -676,9 +682,9 @@ html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(8)>div>[ /* responsive css :start */ @media(min-width:0px) and (max-width:767px) { - .colab-demo { + .examples-link { font-size: 0; - padding: 4px 5px 4px 14px; + padding: 4px 5px 4px 5px; margin-right: 16px; margin-left: 0; } @@ -695,7 +701,7 @@ html[data-theme='dark'] .theme-doc-sidebar-menu.menu__list>li:nth-child(8)>div>[ background: var(--ifm-gradient); } - .menu__link.colab-demo { + .menu__link.examples-link { display: table; background: var(--ifm-color-primary) !important; padding-bottom: 4px!important; diff --git a/docs/website/src/theme/DLTExampleHeader.tsx b/docs/website/src/theme/DLTExampleHeader.tsx new file mode 100644 index 0000000000..eafd6f246b --- /dev/null +++ b/docs/website/src/theme/DLTExampleHeader.tsx @@ -0,0 +1,15 @@ +import React from 'react'; +import Heading from '@theme/Heading'; + +interface IProps { + title: string; + slug: string; +} + +export const DLTExampleHeader = ({title, slug} : IProps ) => { + return (<> + + Example: {contentTitle} + + ) +}; \ No newline at end of file diff --git a/docs/website/src/theme/MDXComponents.js b/docs/website/src/theme/MDXComponents.js index 8edd7610a5..350ddc7240 100644 --- a/docs/website/src/theme/MDXComponents.js +++ b/docs/website/src/theme/MDXComponents.js @@ -3,12 +3,14 @@ import React from 'react'; import MDXComponents from '@theme-original/MDXComponents'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +import {DLTExampleHeader} from './DLTExampleHeader'; export default { // Re-use the default mapping ...MDXComponents, - Tabs: Tabs, - TabItem: TabItem, + Tabs, + TabItem, + DLTExampleHeader // Map the "" tag to our Highlight component // `Highlight` will receive all props that were passed to `` in MDX }; \ No newline 
at end of file diff --git a/docs/website/static/img/Howdltworks-active-1.svg b/docs/website/static/img/Howdltworks-Active-1.svg similarity index 100% rename from docs/website/static/img/Howdltworks-active-1.svg rename to docs/website/static/img/Howdltworks-Active-1.svg diff --git a/docs/website/tools/update_snippets.js b/docs/website/tools/update_snippets.js index cf91c5c263..1a085a02d5 100644 --- a/docs/website/tools/update_snippets.js +++ b/docs/website/tools/update_snippets.js @@ -3,10 +3,18 @@ const path = require('path'); const dedent = require('dedent'); var watch = require('node-watch'); +// docs snippets settings const BASE_DIR = "./docs/"; const DOCS_EXTENSIONS = [".md", ".mdx"]; const SNIPPETS_FILE_SUFFIX = "-snippets.py" +// examples settings +const EXAMPLES_SOURCE_DIR = "./docs/examples/"; +const EXAMPLES_DESTINATION_DIR = "../examples/"; +const EXAMPLES_MAIN_SNIPPET_NAME = "example"; +const EXAMPLES_CODE_SUBDIR = "/code"; + +// markers const DLT_MARKER = "@@@DLT"; const START_MARKER = DLT_MARKER + "_SNIPPET_START"; const END_MARKER = DLT_MARKER + "_SNIPPET_END"; @@ -23,6 +31,17 @@ function *walkSync(dir) { } } +function *listDirsSync(dir) { + const files = fs.readdirSync(dir, { withFileTypes: true }); + for (const file of files) { + if (file.isDirectory()) { + yield path.join(dir, file.name); + } + } +} + + + // extract the snippet name from a line const extractSnippetName = (tag, line) => { if (line && line.includes(tag)) { @@ -55,7 +74,7 @@ function buildSnippetMap(lines, fileName) { } if (snippetName = extractSnippetName(END_MARKER, line)) { if (snippetName in snippetMap) { - snippetMap[snippetName]["end"] = parseInt(lineIndex); + snippetMap[snippetName]["end"] = parseInt(lineIndex); } else { throw new Error(`Found end tag for snippet "${snippetName}" but start tag not found! File ${fileName}.`); } @@ -64,6 +83,20 @@ function buildSnippetMap(lines, fileName) { return snippetMap; } +function getSnippetFromFile(snippetsFileName, snippetName) { + const lines = fs.readFileSync(snippetsFileName, 'utf8').split(/\r?\n/); + const snippetMap = buildSnippetMap(lines, snippetsFileName); + + if (!(snippetName in snippetMap)) { + return undefined; + } + + let result = lines.slice((snippetMap[snippetName]["start"]+1), snippetMap[snippetName]["end"]); + // dedent works on strings, not on string arrays, so this is very ineffective unfortunately... + result = dedent(result.join("\n")).split(/\r?\n/); + return filterDirectives(result); +} + // get the right snippet for a file function getSnippet(fileName, snippetName) { const ext = path.extname(fileName); @@ -73,20 +106,16 @@ function getSnippet(fileName, snippetName) { snippetsFileName = path. dirname(fileName) + "/" + snippetParts[0]; snippetName = snippetParts[1]; } - const lines = fs.readFileSync(snippetsFileName, 'utf8').split(/\r?\n/); - const snippetMap = buildSnippetMap(lines, snippetsFileName); - - if (!(snippetName in snippetMap)) { + const snippet = getSnippetFromFile(snippetsFileName, snippetName); + if (!snippet) { throw new Error(`Could not find requested snippet "${snippetName}" requested in file ${fileName} in file ${snippetsFileName}.`); } - let result = lines.slice((snippetMap[snippetName]["start"]+1), snippetMap[snippetName]["end"]); - // dedent works on strings, not on string arrays, so this is very ineffective unfortunately... 
- result = dedent(result.join("\n")).split(/\r?\n/); const codeType = path.extname(snippetsFileName).replace(".", ""); - result.unshift(`\`\`\`${codeType}`); - result.push("```"); - return filterDirectives(result); + snippet.unshift(`\`\`\`${codeType}`); + snippet.push("```"); + + return snippet; } function insertSnippets(lines, fileName, onlyClear) { @@ -119,11 +148,11 @@ function insertSnippets(lines, fileName, onlyClear) { return [result, snippetsUpdated]; } -// update the snippets -function updateSnippets(dir) { +// update the snippets in the md files +function updateSnippets() { console.log("Updating Snippets"); let processedFiles = 0; - for (const fileName of walkSync(dir)) { + for (const fileName of walkSync(BASE_DIR)) { if (!DOCS_EXTENSIONS.includes(path.extname(fileName))) { continue } @@ -137,7 +166,37 @@ function updateSnippets(dir) { console.log(`Processed ${processedFiles} files.`); } -updateSnippets(BASE_DIR); +// sync examples from the website folders to docs/examples +function syncExamples() { + for (const exampleDir of listDirsSync(EXAMPLES_SOURCE_DIR)) { + const exampleName = exampleDir.split("/").slice(-1)[0]; + const exampleDestinationDir = EXAMPLES_DESTINATION_DIR + exampleName; + + // clear example destination dir + fs.rmSync(exampleDestinationDir, { recursive: true, force: true }); + // create __init__.py + fs.mkdirSync(exampleDestinationDir, { recursive: true }); + fs.writeFileSync(exampleDestinationDir + "/__init__.py", ""); + + // walk all files of example and copy to example destination + const exampleCodeDir = exampleDir + EXAMPLES_CODE_SUBDIR; + for (const fileName of walkSync(exampleCodeDir)) { + const lines = getSnippetFromFile(fileName, EXAMPLES_MAIN_SNIPPET_NAME); + if (!lines) { + continue; + } + + // write file + const destinationFileName = exampleDestinationDir + fileName.replace(exampleCodeDir, "").replace("-snippets", ""); + fs.mkdirSync(path.dirname(destinationFileName), { recursive: true }); + fs.writeFileSync(destinationFileName, lines.join("\n")); + } + + } +} + +updateSnippets(); +syncExamples(); if (process.argv.includes("--watch")) { console.log(`Watching ${BASE_DIR}`) @@ -148,7 +207,8 @@ if (process.argv.includes("--watch")) { return; } console.log('%s changed...', name); - updateSnippets(BASE_DIR); + updateSnippets(); + syncExamples(); lastUpdate = Date.now(); }); } \ No newline at end of file diff --git a/mypy.ini b/mypy.ini index 7532145ad1..924bc7c48b 100644 --- a/mypy.ini +++ b/mypy.ini @@ -10,12 +10,14 @@ warn_return_any=true namespace_packages=true warn_unused_ignores=true show_error_codes=true -exclude=tests/reflection/module_cases/* +#exclude=reflection/module_cases/* +exclude=docs/examples/archive/*|tests/reflection/module_cases/* [mypy-tests.*] disallow_untyped_defs=false warn_return_any=false + [mypy-docs.*] disallow_untyped_defs=false diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index 17f5b2694b..fc009d8444 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -1008,7 +1008,7 @@ def test_add_config_to_env(environment: Dict[str, str]) -> None: default="BUBA") ) add_config_to_env(c, ("dlt", )) - # must contain DLT prefix everywhere, INSTRUMENTED section taken from key and DLT_TEST taken from password + # must contain dlt prefix everywhere, INSTRUMENTED section taken from key and DLT_TEST taken from password assert environment.items() >= { 'DLT__DEFAULT': 'BUBA', 'DLT__INSTRUMENTED__HEAD': 'h', 
'DLT__INSTRUMENTED__TUBE': '["tu","u","be"]', 'DLT__INSTRUMENTED__HEELS': 'he', diff --git a/tests/utils.py b/tests/utils.py index c64cf0ed9d..bde02b3fbf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -179,6 +179,14 @@ def skip_if_not_active(destination: str) -> None: if destination not in ACTIVE_DESTINATIONS: pytest.skip(f"{destination} not in ACTIVE_DESTINATIONS", allow_module_level=True) + +def is_running_in_github_fork() -> bool: + is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true" + head_ref = os.environ.get("GITHUB_HEAD_REF", "") + repo = os.environ.get("GITHUB_REPOSITORY", "") + return is_github_actions and ":" in head_ref and not head_ref.startswith(repo.split("/")[0]) + + skipifspawn = pytest.mark.skipif( multiprocessing.get_start_method() != "fork", reason="process fork not supported" ) @@ -194,3 +202,7 @@ def skip_if_not_active(destination: str) -> None: skipifwindows = pytest.mark.skipif( platform.system() == "Windows", reason="does not runs on windows" ) + +skipifgithubfork = pytest.mark.skipif( + is_running_in_github_fork(), reason="Skipping test because it runs on a PR coming from fork" +) \ No newline at end of file
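
A usage note for the new `skipifgithubfork` marker: a docs snippet test that depends on repository secrets could be guarded as in the sketch below. The test body is hypothetical (the pipeline and inline data are stand-ins), and it assumes the `tests` package is importable from the test module.

```py
import dlt
from tests.utils import skipifgithubfork


@skipifgithubfork
def test_zendesk_snippet_runs() -> None:
    # skipped on PRs coming from forks, where workflow secrets
    # (e.g. the Zendesk credentials in .dlt/secrets.toml) are not available
    pipeline = dlt.pipeline(
        pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data"
    )
    load_info = pipeline.run(
        [{"id": 1, "timestamp": 1696406400}], table_name="ticket_events"
    )
    print(load_info)
    assert pipeline.last_trace.last_normalize_info.row_counts["ticket_events"] == 1
```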