From 8b4b47b4d1110b9fda18ee2aa6768e106e571d09 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 13 Mar 2024 05:03:22 +0000 Subject: [PATCH 01/19] Updated --- .../deploy-a-pipeline/deploy-with-dagster.md | 60 +++++++++++++++++++ docs/website/sidebars.js | 1 + 2 files changed, 61 insertions(+) create mode 100644 docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md new file mode 100644 index 0000000000..8c34f7ce9e --- /dev/null +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -0,0 +1,60 @@ +--- +title: Deploy with Dagster +description: How to deploy a pipeline with Dagster +keywords: [how to, deploy a pipeline, Dagster] +--- + +# Deploy with Dagster + + +## Introduction to Dagster + +Dagster is an orchestrator that's designed for developing and maintaining data assets, such as tables, data sets, machine learning models, and reports. It makes it easier for data engineers to create, test, deploy, and oversee data-related assets. Dagster ensures these processes are reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, enhance the ability to reuse code and provide a better understanding of data. + +To learn more, please read Dagster’s [documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) + +## Dagster Cloud Features + +Dagster Cloud further enhances these features by providing an enterprise-level orchestration service with serverless or hybrid deployment options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. It enables scalable, cost-effective operations without the hassle of infrastructure management. + +## Dagster deployment options: **Serverless versus Hybrid**: + +The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers flexibility to use your computing resources, with Dagster managing the control plane, reducing operational overhead, and ensuring security. + +For more, please visit [Dagster cloud.](https://dagster.io/cloud) + +## Using Dagster for Free + +Dagster offers a 30-day free trial during which you can explore its features, such as pipeline orchestration, data quality checks, and embedded ELTs. You can try Dagster using its open source or by signing up for the trial. + +## Building Data Pipelines with `dlt` + +`dlt` is an open-source Python library that allows you to declaratively load data sources into well-structured tables or datasets through automatic schema inference and evolution. It simplifies building data pipelines by providing functionality to support the entire extract and load process. + +How does `dlt` integrate with Dagster for pipeline orchestration? + +`dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s capabilities for handling data extraction and load and Dagster's orchestration features to efficiently manage and monitor data pipelines. + +Here’s a brief summary of how to orchestrate `dlt` pipeline on Dagster: + +1. Create a `dlt` pipeline. 
For instructions on creating a pipeline, please refer to the [documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). +2. Set up a Dagster project, configure resources, and define the asset. For more information, please refer to [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) +3. Next, define Dagster definitions, start the web server, and materialize the asset. +4. View the populated metadata and data in the configured destination. + +To do a hands-on project about “Orchestrating unstructured data pipelines with dagster and `dlt`," please read the following [article](https://dagster.io/blog/dagster-dlt). Here, the author has given a detailed overview and steps to ingest GitHub issue data from a repository and store the data in BigQuery. To build your pipelines, you can employ a similar approach. + +## Additional Resources + +- A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). +- `dlt` pipelines configured for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). + +:::note +These are external repositories and are subject to change. +::: + +## Conclusion + +In conclusion, integrating `dlt` within the data pipeline ecosystem significantly enhances the efficiency and manageability of data operations. The synergy between `dlt` and Dagster simplifies the development of data pipelines and ensures that data assets are more maintainable and scalable over time. `dlt` offers plenty of [verified sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) that can be orchestrated on Dagster in a simplified way and can be easily managed, customized and maintained. + +We encourage data engineers and developers to explore the capabilities of `dlt` within the Dagster platform. Leveraging `dlt` on Dagster streamlines the pipeline development process and unlocks the potential for greater insights and value from your data assets. 
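
As a supplement to step 1 above, a minimal `dlt` pipeline might look like the following sketch. The resource name, the sample records, and the `duckdb` destination are illustrative assumptions; in the walkthrough's scenario you would point the pipeline at a configured destination such as BigQuery.

```py
import dlt

@dlt.resource(table_name="github_issues", write_disposition="append")
def github_issues():
    # Placeholder records standing in for data fetched from an API.
    yield [
        {"id": 1, "title": "First issue"},
        {"id": 2, "title": "Second issue"},
    ]

# Configure the pipeline; "duckdb" keeps the sketch self-contained.
pipeline = dlt.pipeline(
    pipeline_name="github_issues",
    destination="duckdb",
    dataset_name="dagster_github_issues",
)

# Run the pipeline and print the load outcome.
load_info = pipeline.run(github_issues())
print(load_info)
```

Once a pipeline like this runs locally, the remaining steps wrap it in a Dagster resource and asset so Dagster can schedule and monitor it.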
\ No newline at end of file diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 821a1affad..bf45076047 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -214,6 +214,7 @@ const sidebars = { 'reference/explainers/airflow-gcp-cloud-composer', 'walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions', 'walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook', + 'walkthroughs/deploy-a-pipeline/deploy-with-dagster', ] }, { From 2a3ea847244081cfd8f11495d948787664b2ec14 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 13 Mar 2024 05:07:48 +0000 Subject: [PATCH 02/19] Update --- .../walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 8c34f7ce9e..ba0304dac3 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -13,17 +13,17 @@ Dagster is an orchestrator that's designed for developing and maintaining data a To learn more, please read Dagster’s [documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) -## Dagster Cloud Features +### Dagster Cloud Features Dagster Cloud further enhances these features by providing an enterprise-level orchestration service with serverless or hybrid deployment options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. It enables scalable, cost-effective operations without the hassle of infrastructure management. -## Dagster deployment options: **Serverless versus Hybrid**: +### Dagster deployment options: **Serverless versus Hybrid**: The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers flexibility to use your computing resources, with Dagster managing the control plane, reducing operational overhead, and ensuring security. For more, please visit [Dagster cloud.](https://dagster.io/cloud) -## Using Dagster for Free +### Using Dagster for Free Dagster offers a 30-day free trial during which you can explore its features, such as pipeline orchestration, data quality checks, and embedded ELTs. You can try Dagster using its open source or by signing up for the trial. @@ -44,7 +44,7 @@ Here’s a brief summary of how to orchestrate `dlt` pipeline on Dagster: To do a hands-on project about “Orchestrating unstructured data pipelines with dagster and `dlt`," please read the following [article](https://dagster.io/blog/dagster-dlt). Here, the author has given a detailed overview and steps to ingest GitHub issue data from a repository and store the data in BigQuery. To build your pipelines, you can employ a similar approach. -## Additional Resources +### Additional Resources - A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). - `dlt` pipelines configured for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). 
From 501ae52790486162654978df38869dfcd1259e47 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 18 Mar 2024 06:13:41 +0000 Subject: [PATCH 03/19] Updated --- .../deploy-a-pipeline/deploy-with-dagster.md | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index ba0304dac3..f5ceefbc01 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -9,19 +9,19 @@ keywords: [how to, deploy a pipeline, Dagster] ## Introduction to Dagster -Dagster is an orchestrator that's designed for developing and maintaining data assets, such as tables, data sets, machine learning models, and reports. It makes it easier for data engineers to create, test, deploy, and oversee data-related assets. Dagster ensures these processes are reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, enhance the ability to reuse code and provide a better understanding of data. +Dagster is an orchestrator that's designed for developing and maintaining data assets, such as tables, data sets, machine learning models, and reports. Dagster ensures these processes are reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, enhance the ability to reuse code and provide a better understanding of data. -To learn more, please read Dagster’s [documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) +To read more, please refer to Dagster’s [documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) ### Dagster Cloud Features Dagster Cloud further enhances these features by providing an enterprise-level orchestration service with serverless or hybrid deployment options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. It enables scalable, cost-effective operations without the hassle of infrastructure management. -### Dagster deployment options: **Serverless versus Hybrid**: +### Dagster deployment options: **Serverless** versus **Hybrid**: -The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers flexibility to use your computing resources, with Dagster managing the control plane, reducing operational overhead, and ensuring security. +The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers flexibility to use your computing resources, with Dagster managing the control plane. Reducing operational overhead and ensuring security. -For more, please visit [Dagster cloud.](https://dagster.io/cloud) +For more info, please [refer.](https://dagster.io/cloud) ### Using Dagster for Free @@ -37,24 +37,29 @@ How does `dlt` integrate with Dagster for pipeline orchestration? Here’s a brief summary of how to orchestrate `dlt` pipeline on Dagster: -1. Create a `dlt` pipeline. For instructions on creating a pipeline, please refer to the [documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). -2. 
Set up a Dagster project, configure resources, and define the asset. For more information, please refer to [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) -3. Next, define Dagster definitions, start the web server, and materialize the asset. -4. View the populated metadata and data in the configured destination. +1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the +[documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). -To do a hands-on project about “Orchestrating unstructured data pipelines with dagster and `dlt`," please read the following [article](https://dagster.io/blog/dagster-dlt). Here, the author has given a detailed overview and steps to ingest GitHub issue data from a repository and store the data in BigQuery. To build your pipelines, you can employ a similar approach. +1. Set up a Dagster project, configure resources, and define the asset. For more information, please refer to [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) + +1. Next, define Dagster definitions, start the web server, and materialize the asset. +1. View the populated metadata and data in the configured destination. + +:::info +For a hands-on project on “Orchestrating unstructured data pipelines with dagster and dlt", read the [article]((https://dagster.io/blog/dagster-dlt)) provided. The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in BigQuery. You can use a similar approach to build your pipelines. +::: ### Additional Resources - A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). - `dlt` pipelines configured for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). -:::note -These are external repositories and are subject to change. -::: + :::note + These are external repositories and are subject to change. + ::: ## Conclusion -In conclusion, integrating `dlt` within the data pipeline ecosystem significantly enhances the efficiency and manageability of data operations. The synergy between `dlt` and Dagster simplifies the development of data pipelines and ensures that data assets are more maintainable and scalable over time. `dlt` offers plenty of [verified sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) that can be orchestrated on Dagster in a simplified way and can be easily managed, customized and maintained. +In conclusion, integrating `dlt` into the data pipeline ecosystem markedly improves data operations' efficiency and manageability. The combination of `dlt` and Dagster eases the development of data pipelines, making data assets more maintainable and scalable over time. With a wealth of [verified sources](/docs/website/docs/dlt-ecosystem/verified-sources/) available, `dlt` enables streamlined orchestration on Dagster, offering easy management, customization, and maintenance. -We encourage data engineers and developers to explore the capabilities of `dlt` within the Dagster platform. Leveraging `dlt` on Dagster streamlines the pipeline development process and unlocks the potential for greater insights and value from your data assets. 
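
The example project referenced above revolves around a GitHub issues resource that the Dagster asset later consumes. The sketch below shows roughly what such a resource could look like; the repository name, the page limit, and the lack of authentication are simplifying assumptions rather than the article's actual implementation.

```py
import dlt
from dlt.sources.helpers import requests

@dlt.resource(table_name="github_issues", write_disposition="replace")
def github_issues_resource(repo: str = "dlt-hub/dlt", max_pages: int = 2):
    # Hypothetical walk over the public GitHub issues endpoint, a couple of pages only.
    url = f"https://api.github.com/repos/{repo}/issues"
    for page in range(1, max_pages + 1):
        response = requests.get(url, params={"state": "all", "per_page": 100, "page": page})
        response.raise_for_status()
        issues = response.json()
        if not issues:
            break
        yield issues
```

A resource along these lines is what the asset code shown later in this walkthrough passes into the pipeline run.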
\ No newline at end of file +We encourage data engineers and developers to explore the capabilities of `dlt` within the Dagster platform. By levraging `dlt` on Dagster, you can simplify the pipeline development process and gain greater insights and value from your data assets. \ No newline at end of file From ea7470544c6efad4011b398ac12c606a171291fa Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 18 Mar 2024 06:19:09 +0000 Subject: [PATCH 04/19] Updated --- .../docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index f5ceefbc01..06e570f447 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -60,6 +60,6 @@ For a hands-on project on “Orchestrating unstructured data pipelines with dags ## Conclusion -In conclusion, integrating `dlt` into the data pipeline ecosystem markedly improves data operations' efficiency and manageability. The combination of `dlt` and Dagster eases the development of data pipelines, making data assets more maintainable and scalable over time. With a wealth of [verified sources](/docs/website/docs/dlt-ecosystem/verified-sources/) available, `dlt` enables streamlined orchestration on Dagster, offering easy management, customization, and maintenance. +In conclusion, integrating `dlt` into the data pipeline ecosystem markedly improves data operations' efficiency and manageability. The combination of `dlt` and Dagster eases the development of data pipelines, making data assets more maintainable and scalable over time. With a wealth of [verified sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) available, `dlt` enables streamlined orchestration on Dagster, offering easy management, customization, and maintenance. We encourage data engineers and developers to explore the capabilities of `dlt` within the Dagster platform. By levraging `dlt` on Dagster, you can simplify the pipeline development process and gain greater insights and value from your data assets. \ No newline at end of file From b4b65cb878ef735389ad7fc34aa731e400e83b14 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 18 Mar 2024 06:25:26 +0000 Subject: [PATCH 05/19] Updated --- .../docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 06e570f447..c621b940e8 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -46,7 +46,7 @@ Here’s a brief summary of how to orchestrate `dlt` pipeline on Dagster: 1. View the populated metadata and data in the configured destination. :::info -For a hands-on project on “Orchestrating unstructured data pipelines with dagster and dlt", read the [article]((https://dagster.io/blog/dagster-dlt)) provided. The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in BigQuery. You can use a similar approach to build your pipelines. 
+For a hands-on project on “Orchestrating unstructured data pipelines with dagster and dlt", read the [article](https://dagster.io/blog/dagster-dlt) provided. The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in BigQuery. You can use a similar approach to build your pipelines. ::: ### Additional Resources From 12396d2527428034487ee1ddde1c217da56fb3ec Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 18 Mar 2024 06:29:34 +0000 Subject: [PATCH 06/19] Updated --- .../walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index c621b940e8..6c56c50ee5 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -54,9 +54,9 @@ For a hands-on project on “Orchestrating unstructured data pipelines with dags - A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). - `dlt` pipelines configured for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). - :::note - These are external repositories and are subject to change. - ::: +:::note +These are external repositories and are subject to change. +::: ## Conclusion From 5d3b16f29484f9a2bfa3cba99fe1c46d479c66ec Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 18 Mar 2024 06:36:21 +0000 Subject: [PATCH 07/19] Updated --- .../walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 6c56c50ee5..5f783fda4a 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -57,9 +57,3 @@ For a hands-on project on “Orchestrating unstructured data pipelines with dags :::note These are external repositories and are subject to change. ::: - -## Conclusion - -In conclusion, integrating `dlt` into the data pipeline ecosystem markedly improves data operations' efficiency and manageability. The combination of `dlt` and Dagster eases the development of data pipelines, making data assets more maintainable and scalable over time. With a wealth of [verified sources](https://dlthub.com/docs/dlt-ecosystem/verified-sources/) available, `dlt` enables streamlined orchestration on Dagster, offering easy management, customization, and maintenance. - -We encourage data engineers and developers to explore the capabilities of `dlt` within the Dagster platform. By levraging `dlt` on Dagster, you can simplify the pipeline development process and gain greater insights and value from your data assets. 
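
Materializing the asset from the web UI is enough to verify the pipeline, but deploying it usually means attaching a schedule so Dagster runs the job unattended. The sketch below uses Dagster's `ScheduleDefinition` for a daily run; the module path, job name, and cron expression are assumptions, and the resource configuration shown elsewhere in this walkthrough still has to be supplied to `Definitions`.

```py
from dagster import Definitions, ScheduleDefinition, define_asset_job, load_assets_from_modules

from . import assets  # hypothetical module holding the issues_pipeline asset

all_assets = load_assets_from_modules([assets])
issues_job = define_asset_job(name="issues_job", selection=["issues_pipeline"])

# Materialize the dlt-backed asset every day at midnight.
issues_schedule = ScheduleDefinition(job=issues_job, cron_schedule="0 0 * * *")

defs = Definitions(
    assets=all_assets,
    jobs=[issues_job],
    schedules=[issues_schedule],
)
```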
\ No newline at end of file From e56528ba667eff1a1226f5d3f04ce7c2cf8ad8c3 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Thu, 21 Mar 2024 01:36:51 +0000 Subject: [PATCH 08/19] Updated --- .../deploy-a-pipeline/deploy-with-dagster.md | 61 +++++++++++++++++-- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 5f783fda4a..6dd8e9ec92 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -15,7 +15,7 @@ To read more, please refer to Dagster’s [documentation.](https://docs.dagster. ### Dagster Cloud Features -Dagster Cloud further enhances these features by providing an enterprise-level orchestration service with serverless or hybrid deployment options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. It enables scalable, cost-effective operations without the hassle of infrastructure management. +Dagster Cloud offers enterprise-level orchestration service with serverless or hybrid deployment options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. It enables scalable, cost-effective operations without the hassle of infrastructure management. ### Dagster deployment options: **Serverless** versus **Hybrid**: @@ -35,25 +35,74 @@ How does `dlt` integrate with Dagster for pipeline orchestration? `dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s capabilities for handling data extraction and load and Dagster's orchestration features to efficiently manage and monitor data pipelines. -Here’s a brief summary of how to orchestrate `dlt` pipeline on Dagster: +Here's a concise guide to orchestrating a `dlt` pipeline with Dagster, using the project "Ingesting GitHub issue data from a repository and storing it in BigQuery" as an example, detailed in the article [“Orchestrating unstructured data pipelines with dagster and dlt."](https://dagster.io/blog/dagster-dlt) 1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the [documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). -1. Set up a Dagster project, configure resources, and define the asset. For more information, please refer to [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) +1. Set up a Dagster project, configure resources, and define the asset as follows: + 1. To create a Dagster project: + ```sh + mkdir dagster_github_issues + cd dagster_github_issues + dagster project scaffold --name github-issues + ``` + 1. 
Define `dlt` as a Dagster resource: + ```py + from dagster import ConfigurableResource + from dagster import ConfigurableResource + import dlt + + class DltResource(ConfigurableResource): + pipeline_name: str + dataset_name: str + destination: str + + def create_pipeline(self, resource_data, table_name): + + # configure the pipeline with your destination details + pipeline = dlt.pipeline( + pipeline_name=self.pipeline_name, + destination=self.destination, + dataset_name=self.dataset_name + ) + + # run the pipeline with your parameters + load_info = pipeline.run(resource_data, table_name=table_name) + + return load_info + ``` + 1. Define the Asset: + ```py + @asset + def issues_pipeline(pipeline: DltResource): + + logger = get_dagster_logger() + results = pipeline.create_pipeline(github_issues_resource, table_name='github_issues') + logger.info(results) + ``` + >For more information, please refer to [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) 1. Next, define Dagster definitions, start the web server, and materialize the asset. + 1. Start the webserver: + ```sh + dagster dev + ``` 1. View the populated metadata and data in the configured destination. :::info -For a hands-on project on “Orchestrating unstructured data pipelines with dagster and dlt", read the [article](https://dagster.io/blog/dagster-dlt) provided. The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in BigQuery. You can use a similar approach to build your pipelines. +For a hands-on project on “Orchestrating unstructured data pipelines with dagster and `dlt`", read the [article](https://dagster.io/blog/dagster-dlt) provided. The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in BigQuery. You can use a similar approach to build your pipelines. ::: ### Additional Resources + - A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). -- `dlt` pipelines configured for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). +- Configure `dlt` pipelines for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). +- Configure MongoDB source as an Asset factory: + >Dagster provides the feature of [@multi_asset](https://github.com/dlt-hub/dlt-dagster-demo/blob/21a8d18b6f0424f40f2eed5030989306af8b8edb/mongodb_dlt/mongodb_dlt/assets/__init__.py#L18) declaration that will allow us to convert each collection under a database into a separate asset. This will make our pipeline easy to debug in case of failure and the collections independent of each other. + :::note These are external repositories and are subject to change. 
-::: +::: \ No newline at end of file From 1ef9a1ec3824331c93da75d6198565955060c75c Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 22 Mar 2024 06:10:11 +0000 Subject: [PATCH 09/19] Updated --- .../deploy-a-pipeline/deploy-with-dagster.md | 164 ++++++++++-------- 1 file changed, 94 insertions(+), 70 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 6dd8e9ec92..2e3f096f21 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -6,103 +6,127 @@ keywords: [how to, deploy a pipeline, Dagster] # Deploy with Dagster - ## Introduction to Dagster -Dagster is an orchestrator that's designed for developing and maintaining data assets, such as tables, data sets, machine learning models, and reports. Dagster ensures these processes are reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, enhance the ability to reuse code and provide a better understanding of data. +Dagster is an orchestrator that's designed for developing and maintaining data assets, such as +tables, data sets, machine learning models, and reports. Dagster ensures these processes are +reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, +enhance the ability to reuse code and provide a better understanding of data. -To read more, please refer to Dagster’s [documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) +To read more, please refer to Dagster’s +[documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) ### Dagster Cloud Features -Dagster Cloud offers enterprise-level orchestration service with serverless or hybrid deployment options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. It enables scalable, cost-effective operations without the hassle of infrastructure management. +Dagster Cloud offers enterprise-level orchestration service with serverless or hybrid deployment +options. It incorporates native branching and built-in CI/CD to prioritize the developer experience. +It enables scalable, cost-effective operations without the hassle of infrastructure management. ### Dagster deployment options: **Serverless** versus **Hybrid**: -The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers flexibility to use your computing resources, with Dagster managing the control plane. Reducing operational overhead and ensuring security. +The *serverless* option fully hosts the orchestration engine, while the *hybrid* model offers +flexibility to use your computing resources, with Dagster managing the control plane. Reducing +operational overhead and ensuring security. For more info, please [refer.](https://dagster.io/cloud) ### Using Dagster for Free -Dagster offers a 30-day free trial during which you can explore its features, such as pipeline orchestration, data quality checks, and embedded ELTs. You can try Dagster using its open source or by signing up for the trial. 
+Dagster offers a 30-day free trial during which you can explore its features, such as pipeline +orchestration, data quality checks, and embedded ELTs. You can try Dagster using its open source or +by signing up for the trial. ## Building Data Pipelines with `dlt` -`dlt` is an open-source Python library that allows you to declaratively load data sources into well-structured tables or datasets through automatic schema inference and evolution. It simplifies building data pipelines by providing functionality to support the entire extract and load process. +`dlt` is an open-source Python library that allows you to declaratively load data sources into +well-structured tables or datasets through automatic schema inference and evolution. It simplifies +building data pipelines by providing functionality to support the entire extract and load process. How does `dlt` integrate with Dagster for pipeline orchestration? -`dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s capabilities for handling data extraction and load and Dagster's orchestration features to efficiently manage and monitor data pipelines. - -Here's a concise guide to orchestrating a `dlt` pipeline with Dagster, using the project "Ingesting GitHub issue data from a repository and storing it in BigQuery" as an example, detailed in the article [“Orchestrating unstructured data pipelines with dagster and dlt."](https://dagster.io/blog/dagster-dlt) - -1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the -[documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). - -1. Set up a Dagster project, configure resources, and define the asset as follows: - 1. To create a Dagster project: - ```sh - mkdir dagster_github_issues - cd dagster_github_issues - dagster project scaffold --name github-issues - ``` - 1. Define `dlt` as a Dagster resource: - ```py - from dagster import ConfigurableResource - from dagster import ConfigurableResource - import dlt - - class DltResource(ConfigurableResource): - pipeline_name: str - dataset_name: str - destination: str - - def create_pipeline(self, resource_data, table_name): - - # configure the pipeline with your destination details - pipeline = dlt.pipeline( - pipeline_name=self.pipeline_name, - destination=self.destination, - dataset_name=self.dataset_name - ) - - # run the pipeline with your parameters - load_info = pipeline.run(resource_data, table_name=table_name) - - return load_info - ``` - 1. Define the Asset: - ```py - @asset - def issues_pipeline(pipeline: DltResource): - - logger = get_dagster_logger() - results = pipeline.create_pipeline(github_issues_resource, table_name='github_issues') - logger.info(results) - ``` - >For more information, please refer to [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) +`dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for +building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s +capabilities for handling data extraction and load and Dagster's orchestration features to +efficiently manage and monitor data pipelines. 
+ +Here's a concise guide to orchestrating a `dlt` pipeline with Dagster, using the project "Ingesting +GitHub issue data from a repository and storing it in BigQuery" as an example, detailed in the +article +[“Orchestrating unstructured data pipelines with dagster and dlt."](https://dagster.io/blog/dagster-dlt) + +1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the + [documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). + +1. Set up a Dagster project, configure resources, and define the asset as follows: + + 1. To create a Dagster project: + ```sh + mkdir dagster_github_issues + cd dagster_github_issues + dagster project scaffold --name github-issues + ``` + 1. Define `dlt` as a Dagster resource: + ```py + from dagster import ConfigurableResource + from dagster import ConfigurableResource + import dlt + + class DltResource(ConfigurableResource): + pipeline_name: str + dataset_name: str + destination: str + + def create_pipeline(self, resource_data, table_name): + + # configure the pipeline with your destination details + pipeline = dlt.pipeline( + pipeline_name=self.pipeline_name, + destination=self.destination, + dataset_name=self.dataset_name + ) + + # run the pipeline with your parameters + load_info = pipeline.run(resource_data, table_name=table_name) + + return load_info + ``` + 1. Define the Asset: + ```py + @asset + def issues_pipeline(pipeline: DltResource): + + logger = get_dagster_logger() + results = pipeline.create_pipeline(github_issues_resource, table_name='github_issues') + logger.info(results) + ``` + > For more information, please refer to + > [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) 1. Next, define Dagster definitions, start the web server, and materialize the asset. + 1. Start the webserver: ```sh dagster dev - ``` + ``` + 1. View the populated metadata and data in the configured destination. -:::info -For a hands-on project on “Orchestrating unstructured data pipelines with dagster and `dlt`", read the [article](https://dagster.io/blog/dagster-dlt) provided. The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in BigQuery. You can use a similar approach to build your pipelines. -::: +:::info For a hands-on project on “Orchestrating unstructured data pipelines with dagster and +`dlt`", read the [article](https://dagster.io/blog/dagster-dlt) provided. The author offers a +detailed overview and steps for ingesting GitHub issue data from a repository and storing it in +BigQuery. You can use a similar approach to build your pipelines. ::: ### Additional Resources - -- A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). -- Configure `dlt` pipelines for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). -- Configure MongoDB source as an Asset factory: - >Dagster provides the feature of [@multi_asset](https://github.com/dlt-hub/dlt-dagster-demo/blob/21a8d18b6f0424f40f2eed5030989306af8b8edb/mongodb_dlt/mongodb_dlt/assets/__init__.py#L18) declaration that will allow us to convert each collection under a database into a separate asset. 
This will make our pipeline easy to debug in case of failure and the collections independent of each other. - - -:::note -These are external repositories and are subject to change. -::: \ No newline at end of file +- A general configurable `dlt` resource orchestrated on Dagster: + [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). +- Configure `dlt` pipelines for Dagster: + [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). +- Configure MongoDB source as an Asset factory: + > Dagster provides the feature of + > [@multi_asset](https://github.com/dlt-hub/dlt-dagster-demo/blob/21a8d18b6f0424f40f2eed5030989306af8b8edb/mongodb_dlt/mongodb_dlt/assets/__init__.py#L18) + > declaration that will allow us to convert each collection under a database into a separate + > asset. This will make our pipeline easy to debug in case of failure and the collections + > independent of each other. + +:::note These are external repositories and are subject to change. ::: From 246e6771f554a471e47c3566c3c4562c5914eded Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Tue, 26 Mar 2024 08:29:33 +0000 Subject: [PATCH 10/19] Updated deploy with dagster --- .../deploy-a-pipeline/deploy-with-dagster.md | 65 ++++++++++++++----- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 2e3f096f21..615139ac32 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -39,23 +39,27 @@ by signing up for the trial. ## Building Data Pipelines with `dlt` `dlt` is an open-source Python library that allows you to declaratively load data sources into -well-structured tables or datasets through automatic schema inference and evolution. It simplifies -building data pipelines by providing functionality to support the entire extract and load process. +well-structured tables or datasets through automatic schema inference and evolution. It simplifies +building data pipelines with support for extract and load processes. -How does `dlt` integrate with Dagster for pipeline orchestration? +**How does `dlt` integrate with Dagster for pipeline orchestration?** `dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s capabilities for handling data extraction and load and Dagster's orchestration features to efficiently manage and monitor data pipelines. +### Orchestrating `dlt` pipeline on Dagster + Here's a concise guide to orchestrating a `dlt` pipeline with Dagster, using the project "Ingesting -GitHub issue data from a repository and storing it in BigQuery" as an example, detailed in the -article +GitHub issue data from a repository and storing it in BigQuery" as an example. + +More details can be found in the article [“Orchestrating unstructured data pipelines with dagster and dlt."](https://dagster.io/blog/dagster-dlt) -1. Create a `dlt` pipeline. For detailed instructions on creating a pipeline, please refer to the - [documentation](https://dlthub.com/docs/walkthroughs/create-a-pipeline). 
+**The steps are as follows:** +1. Create a `dlt` pipeline. For more, please refer to the documentation: +[Creating a pipeline.](https://dlthub.com/docs/walkthroughs/create-a-pipeline) 1. Set up a Dagster project, configure resources, and define the asset as follows: @@ -65,6 +69,7 @@ article cd dagster_github_issues dagster project scaffold --name github-issues ``` + 1. Define `dlt` as a Dagster resource: ```py from dagster import ConfigurableResource @@ -90,7 +95,7 @@ article return load_info ``` - 1. Define the Asset: + 1. Define the asset as: ```py @asset def issues_pipeline(pipeline: DltResource): @@ -102,26 +107,48 @@ article > For more information, please refer to > [Dagster’s documentation.](https://docs.dagster.io/getting-started/quickstart) -1. Next, define Dagster definitions, start the web server, and materialize the asset. - - 1. Start the webserver: - ```sh - dagster dev - ``` +1. Next, define Dagster definitions as follows: + ```py + all_assets = load_assets_from_modules([assets]) + simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) + + defs = Definitions( + assets=all_assets, + jobs=[simple_pipeline], + resources={ + "pipeline": DltResource( + pipeline_name = "github_issues", + dataset_name = "dagster_github_issues", + destination = "bigquery", + table_name= "github_issues" + ), + } + ) + ``` + +1. Finally, start the web server as: + + ```sh + dagster dev + ``` 1. View the populated metadata and data in the configured destination. -:::info For a hands-on project on “Orchestrating unstructured data pipelines with dagster and -`dlt`", read the [article](https://dagster.io/blog/dagster-dlt) provided. The author offers a +:::info +For the complete hands-on project on “Orchestrating unstructured data pipelines with dagster and +`dlt`", please refer to [article](https://dagster.io/blog/dagster-dlt). The author offers a detailed overview and steps for ingesting GitHub issue data from a repository and storing it in -BigQuery. You can use a similar approach to build your pipelines. ::: +BigQuery. You can use a similar approach to build your pipelines. +::: ### Additional Resources - A general configurable `dlt` resource orchestrated on Dagster: [dlt resource](https://github.com/dagster-io/dagster-open-platform/blob/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/resources/dlt_resource.py#L29). + - Configure `dlt` pipelines for Dagster: [dlt pipelines](https://github.com/dagster-io/dagster-open-platform/tree/5030ff6828e2b001a557c6864f279c3b476b0ca0/dagster_open_platform/assets/dlt_pipelines). + - Configure MongoDB source as an Asset factory: > Dagster provides the feature of > [@multi_asset](https://github.com/dlt-hub/dlt-dagster-demo/blob/21a8d18b6f0424f40f2eed5030989306af8b8edb/mongodb_dlt/mongodb_dlt/assets/__init__.py#L18) @@ -129,4 +156,6 @@ BigQuery. You can use a similar approach to build your pipelines. ::: > asset. This will make our pipeline easy to debug in case of failure and the collections > independent of each other. -:::note These are external repositories and are subject to change. ::: +:::note +These are external repositories and are subject to change. 
+::: From ba3085f8f10664f521584b2ddd75b4fceda7783d Mon Sep 17 00:00:00 2001 From: Zaeem Athar Date: Wed, 27 Mar 2024 13:13:42 +0100 Subject: [PATCH 11/19] Update deploy-with-dagster.md Fixing typos --- .../deploy-a-pipeline/deploy-with-dagster.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 615139ac32..35578952c7 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -8,10 +8,10 @@ keywords: [how to, deploy a pipeline, Dagster] ## Introduction to Dagster -Dagster is an orchestrator that's designed for developing and maintaining data assets, such as +Dagster is an orchestrator designed for developing and maintaining data assets, such as tables, data sets, machine learning models, and reports. Dagster ensures these processes are reliable and focuses on using software-defined assets (SDAs) to simplify complex data management, -enhance the ability to reuse code and provide a better understanding of data. +enhance the ability to reuse code, and provide a better understanding of data. To read more, please refer to Dagster’s [documentation.](https://docs.dagster.io/getting-started?_gl=1*19ikq9*_ga*NTMwNTUxNDAzLjE3MDg5Mjc4OTk.*_ga_84VRQZG7TV*MTcwOTkwNDY3MS4zLjEuMTcwOTkwNTYzNi41Ny4wLjA.*_gcl_au*OTM3OTU1ODMwLjE3MDg5Mjc5MDA.) @@ -28,7 +28,7 @@ The *serverless* option fully hosts the orchestration engine, while the *hybrid* flexibility to use your computing resources, with Dagster managing the control plane. Reducing operational overhead and ensuring security. -For more info, please [refer.](https://dagster.io/cloud) +For more info, please refer to the Dagster Cloud [docs.](https://dagster.io/cloud) ### Using Dagster for Free @@ -46,13 +46,12 @@ building data pipelines with support for extract and load processes. `dlt` integrates with Dagster for pipeline orchestration, providing a streamlined process for building, enhancing, and managing data pipelines. This enables developers to leverage `dlt`'s -capabilities for handling data extraction and load and Dagster's orchestration features to -efficiently manage and monitor data pipelines. +capabilities for handling data extraction and load and Dagster's orchestration features to efficiently manage and monitor data pipelines. ### Orchestrating `dlt` pipeline on Dagster Here's a concise guide to orchestrating a `dlt` pipeline with Dagster, using the project "Ingesting -GitHub issue data from a repository and storing it in BigQuery" as an example. +GitHub issues data from a repository and storing it in BigQuery" as an example. More details can be found in the article [“Orchestrating unstructured data pipelines with dagster and dlt."](https://dagster.io/blog/dagster-dlt) @@ -132,8 +131,6 @@ More details can be found in the article dagster dev ``` -1. View the populated metadata and data in the configured destination. - :::info For the complete hands-on project on “Orchestrating unstructured data pipelines with dagster and `dlt`", please refer to [article](https://dagster.io/blog/dagster-dlt). 
The author offers a From 42c238ec1d14368a98d3e7480c646d81d57085c9 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 27 Mar 2024 13:01:25 +0000 Subject: [PATCH 12/19] Updated sidebars.js --- docs/website/sidebars.js | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 4fd6bfca6b..15c9c27512 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -219,6 +219,7 @@ const sidebars = { 'walkthroughs/deploy-a-pipeline/deploy-with-google-cloud-functions', 'walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook', 'walkthroughs/deploy-a-pipeline/deploy-with-kestra', + 'walkthroughs/deploy-a-pipeline/deploy-with-dagster', ] }, { From dbb5a0761850faa326dabc827d5a079af281a2bf Mon Sep 17 00:00:00 2001 From: Alexander Butler <41213451+z3z1ma@users.noreply.github.com> Date: Mon, 1 Apr 2024 18:44:01 +0100 Subject: [PATCH 13/19] fix: check for typeddict before class or subclass checks which fail (#1160) --- dlt/common/typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 05720fe7d9..99c2604cdf 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -243,7 +243,7 @@ def get_all_types_of_class_in_union(hint: Type[Any], cls: Type[TAny]) -> List[Ty return [ t for t in get_args(hint) - if inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t)) + if not is_typeddict(t) and inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t)) ] From 54f6a04d1cf9b7460f3137cfc884c93f72be7783 Mon Sep 17 00:00:00 2001 From: Zaeem Athar Date: Tue, 2 Apr 2024 09:55:18 +0200 Subject: [PATCH 14/19] Update deploy-with-dagster.md Adding import DltResource in the Definition script. --- .../docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 35578952c7..03f45ababb 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -108,6 +108,8 @@ More details can be found in the article 1. 
Next, define Dagster definitions as follows: ```py + import DltResource + all_assets = load_assets_from_modules([assets]) simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) From 79e456dc4288a6908a89e47ba6125cb7e2d104c9 Mon Sep 17 00:00:00 2001 From: Zaeem Athar Date: Tue, 2 Apr 2024 10:53:06 +0200 Subject: [PATCH 15/19] Update deploy-with-dagster.md Removing DltResource args --- .../walkthroughs/deploy-a-pipeline/deploy-with-dagster.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index 03f45ababb..c901b38e00 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -117,12 +117,7 @@ More details can be found in the article assets=all_assets, jobs=[simple_pipeline], resources={ - "pipeline": DltResource( - pipeline_name = "github_issues", - dataset_name = "dagster_github_issues", - destination = "bigquery", - table_name= "github_issues" - ), + "pipeline": DltResource(), } ) ``` From 17c15aafaab19bae200499fa402aeeee58cff056 Mon Sep 17 00:00:00 2001 From: Zaeem Athar Date: Tue, 2 Apr 2024 12:13:29 +0200 Subject: [PATCH 16/19] Update deploy-with-dagster.md Changing resource name from DltResource to DltPipeline --- .../deploy-a-pipeline/deploy-with-dagster.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md index c901b38e00..cca882ba38 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md @@ -75,7 +75,7 @@ More details can be found in the article from dagster import ConfigurableResource import dlt - class DltResource(ConfigurableResource): + class DltPipeline(ConfigurableResource): pipeline_name: str dataset_name: str destination: str @@ -97,7 +97,7 @@ More details can be found in the article 1. Define the asset as: ```py @asset - def issues_pipeline(pipeline: DltResource): + def issues_pipeline(pipeline: DltPipeline): logger = get_dagster_logger() results = pipeline.create_pipeline(github_issues_resource, table_name='github_issues') @@ -108,8 +108,6 @@ More details can be found in the article 1. 
Next, define Dagster definitions as follows: ```py - import DltResource - all_assets = load_assets_from_modules([assets]) simple_pipeline = define_asset_job(name="simple_pipeline", selection= ['issues_pipeline']) @@ -117,7 +115,11 @@ More details can be found in the article assets=all_assets, jobs=[simple_pipeline], resources={ - "pipeline": DltResource(), + "pipeline": DltPipeline( + pipeline_name = "github_issues", + dataset_name = "dagster_github_issues", + destination = "bigquery", + ), } ) ``` From ecb5aa0015bf1e910a1c61d0992f5fca1e5f6514 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 2 Apr 2024 22:42:54 +0300 Subject: [PATCH 17/19] RESTClient: add support for relative next URLs in LinkPaginators (#1163) * Extend `mock_api_server()` to support relative next urls * Enhance BaseNextUrlPaginator to support relative next URLs in pagination --- dlt/sources/helpers/rest_client/paginators.py | 7 + tests/sources/helpers/rest_client/conftest.py | 88 +++++++---- .../helpers/rest_client/test_client.py | 17 +++ .../helpers/rest_client/test_paginators.py | 139 ++++++++++++++++-- 4 files changed, 209 insertions(+), 42 deletions(-) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index c098ea667f..48dfdf6e4f 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from typing import Optional +from urllib.parse import urlparse, urljoin from dlt.sources.helpers.requests import Response, Request from dlt.common import jsonpath @@ -102,6 +103,12 @@ def update_request(self, request: Request) -> None: class BaseNextUrlPaginator(BasePaginator): def update_request(self, request: Request) -> None: + # Handle relative URLs + if self.next_reference: + parsed_url = urlparse(self.next_reference) + if not parsed_url.scheme: + self.next_reference = urljoin(request.url, self.next_reference) + request.url = self.next_reference diff --git a/tests/sources/helpers/rest_client/conftest.py b/tests/sources/helpers/rest_client/conftest.py index 7eec090db6..cffce7cb07 100644 --- a/tests/sources/helpers/rest_client/conftest.py +++ b/tests/sources/helpers/rest_client/conftest.py @@ -1,5 +1,5 @@ import re -from typing import NamedTuple, Callable, Pattern, List, TYPE_CHECKING +from typing import NamedTuple, Callable, Pattern, List, Union, TYPE_CHECKING import base64 from urllib.parse import urlsplit, urlunsplit @@ -10,9 +10,13 @@ from dlt.common import json if TYPE_CHECKING: - RequestCallback = Callable[[requests_mock.Request, requests_mock.Context], str] + RequestCallback = Callable[ + [requests_mock.Request, requests_mock.Context], Union[str, dict, list] + ] + ResponseSerializer = Callable[[requests_mock.Request, requests_mock.Context], str] else: RequestCallback = Callable + ResponseSerializer = Callable MOCK_BASE_URL = "https://api.example.com" @@ -20,7 +24,7 @@ class Route(NamedTuple): method: str pattern: Pattern[str] - callback: RequestCallback + callback: ResponseSerializer class APIRouter: @@ -32,8 +36,17 @@ def _add_route( self, method: str, pattern: str, func: RequestCallback ) -> RequestCallback: compiled_pattern = re.compile(f"{self.base_url}{pattern}") - self.routes.append(Route(method, compiled_pattern, func)) - return func + + def serialize_response(request, context): + result = func(request, context) + + if isinstance(result, dict) or isinstance(result, list): + return json.dumps(result) + + return result + + 
self.routes.append(Route(method, compiled_pattern, serialize_response)) + return serialize_response def get(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: def decorator(func: RequestCallback) -> RequestCallback: @@ -59,9 +72,17 @@ def register_routes(self, mocker: requests_mock.Mocker) -> None: router = APIRouter(MOCK_BASE_URL) -def serialize_page(records, page_number, total_pages, base_url, records_key="data"): +def serialize_page( + records, + page_number, + total_pages, + request_url, + records_key="data", + use_absolute_url=True, +): + """Serialize a page of records into a dict with pagination metadata.""" if records_key is None: - return json.dumps(records) + return records response = { records_key: records, @@ -72,11 +93,15 @@ def serialize_page(records, page_number, total_pages, base_url, records_key="dat if page_number < total_pages: next_page = page_number + 1 - scheme, netloc, path, _, _ = urlsplit(base_url) - next_page = urlunsplit([scheme, netloc, path, f"page={next_page}", ""]) - response["next_page"] = next_page + scheme, netloc, path, _, _ = urlsplit(request_url) + if use_absolute_url: + next_page_url = urlunsplit([scheme, netloc, path, f"page={next_page}", ""]) + else: + next_page_url = f"{path}?page={next_page}" - return json.dumps(response) + response["next_page"] = next_page_url + + return response def generate_posts(count=100): @@ -91,7 +116,9 @@ def get_page_number(qs, key="page", default=1): return int(qs.get(key, [default])[0]) -def paginate_response(request, records, page_size=10, records_key="data"): +def paginate_response( + request, records, page_size=10, records_key="data", use_absolute_url=True +): page_number = get_page_number(request.qs) total_records = len(records) total_pages = (total_records + page_size - 1) // page_size @@ -99,7 +126,12 @@ def paginate_response(request, records, page_size=10, records_key="data"): end_index = start_index + 10 records_slice = records[start_index:end_index] return serialize_page( - records_slice, page_number, total_pages, request.url, records_key + records_slice, + page_number, + total_pages, + request.url, + records_key, + use_absolute_url, ) @@ -115,6 +147,10 @@ def posts_no_key(request, context): def posts(request, context): return paginate_response(request, generate_posts()) + @router.get(r"/posts_relative_next_url(\?page=\d+)?$") + def posts_relative_next_url(request, context): + return paginate_response(request, generate_posts(), use_absolute_url=False) + @router.get(r"/posts/(\d+)/comments") def post_comments(request, context): post_id = int(request.url.split("/")[-2]) @@ -123,17 +159,17 @@ def post_comments(request, context): @router.get(r"/posts/\d+$") def post_detail(request, context): post_id = request.url.split("/")[-1] - return json.dumps({"id": post_id, "body": f"Post body {post_id}"}) + return {"id": post_id, "body": f"Post body {post_id}"} @router.get(r"/posts/\d+/some_details_404") def post_detail_404(request, context): """Return 404 for post with id > 0. 
Used to test ignoring 404 errors.""" post_id = int(request.url.split("/")[-2]) if post_id < 1: - return json.dumps({"id": post_id, "body": f"Post body {post_id}"}) + return {"id": post_id, "body": f"Post body {post_id}"} else: context.status_code = 404 - return json.dumps({"error": "Post not found"}) + return {"error": "Post not found"} @router.get(r"/posts_under_a_different_key$") def posts_with_results_key(request, context): @@ -149,7 +185,7 @@ def protected_basic_auth(request, context): if auth == f"Basic {creds_base64}": return paginate_response(request, generate_posts()) context.status_code = 401 - return json.dumps({"error": "Unauthorized"}) + return {"error": "Unauthorized"} @router.get("/protected/posts/bearer-token") def protected_bearer_token(request, context): @@ -157,7 +193,7 @@ def protected_bearer_token(request, context): if auth == "Bearer test-token": return paginate_response(request, generate_posts()) context.status_code = 401 - return json.dumps({"error": "Unauthorized"}) + return {"error": "Unauthorized"} @router.get("/protected/posts/bearer-token-plain-text-error") def protected_bearer_token_plain_text_erorr(request, context): @@ -173,31 +209,27 @@ def protected_api_key(request, context): if api_key == "test-api-key": return paginate_response(request, generate_posts()) context.status_code = 401 - return json.dumps({"error": "Unauthorized"}) + return {"error": "Unauthorized"} @router.post("/oauth/token") def oauth_token(request, context): - return json.dumps( - { - "access_token": "test-token", - "expires_in": 3600, - } - ) + return {"access_token": "test-token", "expires_in": 3600} @router.post("/auth/refresh") def refresh_token(request, context): body = request.json() if body.get("refresh_token") == "valid-refresh-token": - return json.dumps({"access_token": "new-valid-token"}) + return {"access_token": "new-valid-token"} context.status_code = 401 - return json.dumps({"error": "Invalid refresh token"}) + return {"error": "Invalid refresh token"} router.register_routes(m) yield m -def assert_pagination(pages, expected_start=0, page_size=10): +def assert_pagination(pages, expected_start=0, page_size=10, total_pages=10): + assert len(pages) == total_pages for i, page in enumerate(pages): assert page == [ {"id": i, "title": f"Post {i}"} for i in range(i * 10, (i + 1) * 10) diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 7a4c55f9a6..88653efefe 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -74,6 +74,23 @@ def test_default_paginator(self, rest_client: RESTClient): assert_pagination(pages) + def test_excplicit_paginator(self, rest_client: RESTClient): + pages_iter = rest_client.paginate( + "/posts", paginator=JSONResponsePaginator(next_url_path="next_page") + ) + pages = list(pages_iter) + + assert_pagination(pages) + + def test_excplicit_paginator_relative_next_url(self, rest_client: RESTClient): + pages_iter = rest_client.paginate( + "/posts_relative_next_url", + paginator=JSONResponsePaginator(next_url_path="next_page"), + ) + pages = list(pages_iter) + + assert_pagination(pages) + def test_paginate_with_hooks(self, rest_client: RESTClient): def response_hook(response: Response, *args: Any, **kwargs: Any) -> None: if response.status_code == 404: diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index cc4dea65dc..bd38a2e421 100644 --- 
a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -1,7 +1,8 @@ -import pytest from unittest.mock import Mock -from requests.models import Response +import pytest + +from requests.models import Response, Request from dlt.sources.helpers.rest_client.paginators import ( SinglePagePaginator, @@ -29,21 +30,131 @@ def test_update_state_without_next(self): class TestJSONResponsePaginator: - def test_update_state_with_next(self): - paginator = JSONResponsePaginator() - response = Mock( - Response, json=lambda: {"next": "http://example.com/next", "results": []} - ) + @pytest.mark.parametrize( + "test_case", + [ + # Test with empty next_url_path, e.g. auto-detect + { + "next_url_path": None, + "response_json": {"next": "http://example.com/next", "results": []}, + "expected": { + "next_reference": "http://example.com/next", + "has_next_page": True, + }, + }, + # Test with explicit next_url_path + { + "next_url_path": "next_page", + "response_json": { + "next_page": "http://example.com/next", + "results": [], + }, + "expected": { + "next_reference": "http://example.com/next", + "has_next_page": True, + }, + }, + # Test with nested next_url_path + { + "next_url_path": "next_page.url", + "response_json": { + "next_page": {"url": "http://example.com/next"}, + "results": [], + }, + "expected": { + "next_reference": "http://example.com/next", + "has_next_page": True, + }, + }, + # Test without next_page + { + "next_url_path": None, + "response_json": {"results": []}, + "expected": { + "next_reference": None, + "has_next_page": False, + }, + }, + ], + ) + def test_update_state(self, test_case): + next_url_path = test_case["next_url_path"] + + if next_url_path is None: + paginator = JSONResponsePaginator() + else: + paginator = JSONResponsePaginator(next_url_path=next_url_path) + response = Mock(Response, json=lambda: test_case["response_json"]) paginator.update_state(response) - assert paginator.next_reference == "http://example.com/next" - assert paginator.has_next_page is True + assert paginator.next_reference == test_case["expected"]["next_reference"] + assert paginator.has_next_page == test_case["expected"]["has_next_page"] - def test_update_state_without_next(self): + # Test update_request from BaseNextUrlPaginator + @pytest.mark.parametrize( + "test_case", + [ + # Test with absolute URL + { + "next_reference": "http://example.com/api/resource?page=2", + "request_url": "http://example.com/api/resource", + "expected": "http://example.com/api/resource?page=2", + }, + # Test with relative URL + { + "next_reference": "/api/resource?page=2", + "request_url": "http://example.com/api/resource", + "expected": "http://example.com/api/resource?page=2", + }, + # Test with more nested path + { + "next_reference": "/api/resource/subresource?page=3&sort=desc", + "request_url": "http://example.com/api/resource/subresource", + "expected": "http://example.com/api/resource/subresource?page=3&sort=desc", + }, + # Test with 'page' in path + { + "next_reference": "/api/page/4/items?filter=active", + "request_url": "http://example.com/api/page/3/items", + "expected": "http://example.com/api/page/4/items?filter=active", + }, + # Test with complex query parameters + { + "next_reference": "/api/resource?page=3&category=books&sort=author", + "request_url": "http://example.com/api/resource?page=2", + "expected": "http://example.com/api/resource?page=3&category=books&sort=author", + }, + # Test with URL having port number + { + "next_reference": 
"/api/resource?page=2", + "request_url": "http://example.com:8080/api/resource", + "expected": "http://example.com:8080/api/resource?page=2", + }, + # Test with HTTPS protocol + { + "next_reference": "https://secure.example.com/api/resource?page=2", + "request_url": "https://secure.example.com/api/resource", + "expected": "https://secure.example.com/api/resource?page=2", + }, + # Test with encoded characters in URL + { + "next_reference": "/api/resource?page=2&query=%E3%81%82", + "request_url": "http://example.com/api/resource", + "expected": "http://example.com/api/resource?page=2&query=%E3%81%82", + }, + # Test with missing 'page' parameter in next_reference + { + "next_reference": "/api/resource?sort=asc", + "request_url": "http://example.com/api/resource?page=1", + "expected": "http://example.com/api/resource?sort=asc", + }, + ], + ) + def test_update_request(self, test_case): paginator = JSONResponsePaginator() - response = Mock(Response, json=lambda: {"results": []}) - paginator.update_state(response) - assert paginator.next_reference is None - assert paginator.has_next_page is False + paginator.next_reference = test_case["next_reference"] + request = Mock(Request, url=test_case["request_url"]) + paginator.update_request(request) + assert request.url == test_case["expected"] class TestSinglePagePaginator: From ee33548508b58fe4723f2c93eb5ac4ab301d20fe Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 3 Apr 2024 10:39:46 +0300 Subject: [PATCH 18/19] Ensure total count is an integer in OffsetPaginator (#1172) --- dlt/sources/helpers/rest_client/paginators.py | 8 ++++++++ .../sources/helpers/rest_client/test_paginators.py | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index 48dfdf6e4f..ce414322a0 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -88,6 +88,14 @@ def update_state(self, response: Response) -> None: f"Total count not found in response for {self.__class__.__name__}" ) + try: + total = int(total) + except ValueError: + raise ValueError( + f"Total count is not an integer in response for {self.__class__.__name__}. 
" + f"Expected an integer, got {total}" + ) + self.offset += self.limit if self.offset >= total: diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index bd38a2e421..4d086f1486 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -186,6 +186,19 @@ def test_update_state(self): paginator.update_state(response) assert paginator.has_next_page is False + def test_update_state_with_string_total(self): + paginator = OffsetPaginator(0, 10) + response = Mock(Response, json=lambda: {"total": "20"}) + paginator.update_state(response) + assert paginator.offset == 10 + assert paginator.has_next_page is True + + def test_update_state_with_invalid_total(self): + paginator = OffsetPaginator(0, 10) + response = Mock(Response, json=lambda: {"total": "invalid"}) + with pytest.raises(ValueError): + paginator.update_state(response) + def test_update_state_without_total(self): paginator = OffsetPaginator(0, 10) response = Mock(Response, json=lambda: {}) From 6bf19403417e35756edbae45c0c234ebc8b74367 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 3 Apr 2024 12:35:45 +0200 Subject: [PATCH 19/19] Docs: Simpler Examples, generate examples pages from actual examples code (#1134) * start migrating examples * restore chess dbt example * bring examples into desired shape * generate example pages from existing examples using docstrings * fix one md link * post merge file delete * add some notes for test vars * move chess example back into examples folder * skip examples without proper header * separate examples testing into own make command * prepare tests for examples and run them * fix examples test setup * add postgres dependency to snippets tests * ignore some folders * add argparse plus clear flag to example test preparation make examples raise in case of failed loads * simplify example folder skipping * add a template for a new example * fix bug in deployment * update contributing --- .github/workflows/test_doc_snippets.yml | 16 +- .gitignore | 6 +- Makefile | 14 +- .../.dlt/config.toml | 0 docs/examples/CONTRIBUTING.md | 51 ++--- .../chess_production => examples}/__init__.py | 0 .../.dlt/config.toml | 0 .../.dlt/example.secrets.toml} | 0 .../code => examples/_template}/__init__.py | 0 docs/examples/_template/_template.py | 30 +++ docs/examples/chess/chess.py | 9 +- .../chess_production/.dlt/config.toml | 1 + .../{chess.py => chess_production.py} | 53 +++-- docs/examples/conftest.py | 57 ++++++ .../connector_x_arrow/connector_x_arrow.py | 72 +++++++ docs/examples/connector_x_arrow/load_arrow.py | 41 ---- .../custom_destination_bigquery.py | 20 ++ .../google_sheets/.dlt/example.secrets.toml | 24 +-- docs/examples/google_sheets/google_sheets.py | 35 +++- .../.dlt/example.secrets.toml | 2 +- .../{zendesk.py => incremental_loading.py} | 29 +++ docs/examples/nested_data/.dlt/config.toml | 0 .../nested_data/.dlt/example.secrets.toml | 2 +- docs/examples/nested_data/nested_data.py | 39 ++++ .../assets/invoices/invoice_2.txt | 0 .../assets/invoices/invoice_20230831-p1.pdf | Bin .../pdf_to_weaviate/pdf_to_weaviate.py | 68 +++++-- .../qdrant_zendesk/.dlt/example.secrets.toml | 2 +- .../{qdrant.py => qdrant_zendesk.py} | 38 ++++ docs/examples/transformers/.dlt/config.toml | 2 +- .../{pokemon.py => transformers.py} | 30 ++- docs/tools/prepare_examples_tests.py | 76 +++++++ .../website/docs/examples/_examples-header.md | 21 -- .../chess_production/code/.dlt/config.toml | 2 - 
.../chess_production/code/chess-snippets.py | 171 ---------------- .../docs/examples/chess_production/index.md | 47 ----- .../examples/connector_x_arrow/__init__.py | 0 .../connector_x_arrow/code/__init__.py | 0 .../code/load_arrow-snippets.py | 52 ----- .../docs/examples/connector_x_arrow/index.md | 44 ---- .../custom_destination_bigquery/__init__.py | 0 .../code/.dlt/config.toml | 2 - .../code/.dlt/example.secrets.toml | 10 - .../code/__init__.py | 0 .../custom_destination_bigquery-snippets.py | 81 -------- .../custom_destination_bigquery/index.md | 30 --- .../docs/examples/google_sheets/__init__.py | 0 .../google_sheets/code/.dlt/config.toml | 2 - .../code/.dlt/example.secrets.toml | 18 -- .../examples/google_sheets/code/__init__.py | 0 .../code/google_sheets-snippets.py | 88 -------- .../docs/examples/google_sheets/index.md | 42 ---- .../examples/incremental_loading/__init__.py | 0 .../incremental_loading/code/.dlt/config.toml | 2 - .../code/.dlt/example.secrets.toml | 6 - .../incremental_loading/code/__init__.py | 0 .../code/zendesk-snippets.py | 143 ------------- .../examples/incremental_loading/index.md | 42 ---- .../docs/examples/nested_data/__init__.py | 0 .../nested_data/code/.dlt/config.toml | 2 - .../code/.dlt/example.secrets.toml | 4 - .../examples/nested_data/code/__init__.py | 0 .../nested_data/code/nested_data-snippets.py | 156 -------------- .../docs/examples/nested_data/index.md | 41 ---- .../docs/examples/pdf_to_weaviate/__init__.py | 0 .../examples/pdf_to_weaviate/code/__init__.py | 0 .../code/pdf_to_weaviate-snippets.py | 67 ------ .../docs/examples/pdf_to_weaviate/index.md | 56 ----- .../docs/examples/qdrant_zendesk/__init__.py | 0 .../code/.dlt/example.secrets.toml | 10 - .../examples/qdrant_zendesk/code/__init__.py | 0 .../qdrant_zendesk/code/qdrant-snippets.py | 191 ------------------ .../docs/examples/qdrant_zendesk/index.md | 87 -------- .../docs/examples/transformers/__init__.py | 0 .../transformers/code/.dlt/config.toml | 18 -- .../examples/transformers/code/__init__.py | 0 .../transformers/code/pokemon-snippets.py | 72 ------- .../docs/examples/transformers/index.md | 37 ---- docs/website/sidebars.js | 35 +++- docs/website/tools/preprocess_docs.js | 126 +++++++++--- 80 files changed, 699 insertions(+), 1723 deletions(-) rename docs/examples/{custom_destination_bigquery => }/.dlt/config.toml (100%) rename docs/{website/docs/examples/chess_production => examples}/__init__.py (100%) rename docs/examples/{google_sheets => _template}/.dlt/config.toml (100%) rename docs/examples/{incremental_loading/.dlt/config.toml => _template/.dlt/example.secrets.toml} (100%) rename docs/{website/docs/examples/chess_production/code => examples/_template}/__init__.py (100%) create mode 100644 docs/examples/_template/_template.py rename docs/examples/chess_production/{chess.py => chess_production.py} (84%) create mode 100644 docs/examples/conftest.py create mode 100644 docs/examples/connector_x_arrow/connector_x_arrow.py delete mode 100644 docs/examples/connector_x_arrow/load_arrow.py rename docs/examples/incremental_loading/{zendesk.py => incremental_loading.py} (74%) delete mode 100644 docs/examples/nested_data/.dlt/config.toml rename docs/{website/docs/examples/pdf_to_weaviate/code => examples/pdf_to_weaviate}/assets/invoices/invoice_2.txt (100%) rename docs/{website/docs/examples/pdf_to_weaviate/code => examples/pdf_to_weaviate}/assets/invoices/invoice_20230831-p1.pdf (100%) rename docs/examples/qdrant_zendesk/{qdrant.py => qdrant_zendesk.py} (79%) rename 
docs/examples/transformers/{pokemon.py => transformers.py} (64%) create mode 100644 docs/tools/prepare_examples_tests.py delete mode 100644 docs/website/docs/examples/_examples-header.md delete mode 100644 docs/website/docs/examples/chess_production/code/.dlt/config.toml delete mode 100644 docs/website/docs/examples/chess_production/code/chess-snippets.py delete mode 100644 docs/website/docs/examples/chess_production/index.md delete mode 100644 docs/website/docs/examples/connector_x_arrow/__init__.py delete mode 100644 docs/website/docs/examples/connector_x_arrow/code/__init__.py delete mode 100644 docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py delete mode 100644 docs/website/docs/examples/connector_x_arrow/index.md delete mode 100644 docs/website/docs/examples/custom_destination_bigquery/__init__.py delete mode 100644 docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml delete mode 100644 docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml delete mode 100644 docs/website/docs/examples/custom_destination_bigquery/code/__init__.py delete mode 100644 docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py delete mode 100644 docs/website/docs/examples/custom_destination_bigquery/index.md delete mode 100644 docs/website/docs/examples/google_sheets/__init__.py delete mode 100644 docs/website/docs/examples/google_sheets/code/.dlt/config.toml delete mode 100644 docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml delete mode 100644 docs/website/docs/examples/google_sheets/code/__init__.py delete mode 100644 docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py delete mode 100644 docs/website/docs/examples/google_sheets/index.md delete mode 100644 docs/website/docs/examples/incremental_loading/__init__.py delete mode 100644 docs/website/docs/examples/incremental_loading/code/.dlt/config.toml delete mode 100644 docs/website/docs/examples/incremental_loading/code/.dlt/example.secrets.toml delete mode 100644 docs/website/docs/examples/incremental_loading/code/__init__.py delete mode 100644 docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py delete mode 100644 docs/website/docs/examples/incremental_loading/index.md delete mode 100644 docs/website/docs/examples/nested_data/__init__.py delete mode 100644 docs/website/docs/examples/nested_data/code/.dlt/config.toml delete mode 100644 docs/website/docs/examples/nested_data/code/.dlt/example.secrets.toml delete mode 100644 docs/website/docs/examples/nested_data/code/__init__.py delete mode 100644 docs/website/docs/examples/nested_data/code/nested_data-snippets.py delete mode 100644 docs/website/docs/examples/nested_data/index.md delete mode 100644 docs/website/docs/examples/pdf_to_weaviate/__init__.py delete mode 100644 docs/website/docs/examples/pdf_to_weaviate/code/__init__.py delete mode 100644 docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py delete mode 100644 docs/website/docs/examples/pdf_to_weaviate/index.md delete mode 100644 docs/website/docs/examples/qdrant_zendesk/__init__.py delete mode 100644 docs/website/docs/examples/qdrant_zendesk/code/.dlt/example.secrets.toml delete mode 100644 docs/website/docs/examples/qdrant_zendesk/code/__init__.py delete mode 100644 docs/website/docs/examples/qdrant_zendesk/code/qdrant-snippets.py delete mode 100644 docs/website/docs/examples/qdrant_zendesk/index.md delete mode 100644 
docs/website/docs/examples/transformers/__init__.py delete mode 100644 docs/website/docs/examples/transformers/code/.dlt/config.toml delete mode 100644 docs/website/docs/examples/transformers/code/__init__.py delete mode 100644 docs/website/docs/examples/transformers/code/pokemon-snippets.py delete mode 100644 docs/website/docs/examples/transformers/index.md diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index 7a862c5800..bafbf4bbee 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -58,11 +58,19 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery --with docs,sentry-sdk --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres --with docs,sentry-sdk --without airflow - - name: create secrets.toml + - name: create secrets.toml for examples + run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml + + - name: create secrets.toml for snippets run: pwd && echo "$DLT_SECRETS_TOML" > docs/website/docs/.dlt/secrets.toml - - name: Run linter and tests - run: make test-and-lint-snippets + - name: Run linter and tests on examples + run: make lint-and-test-examples + + - name: Run linter and tests on snippets + run: make lint-and-test-snippets + + diff --git a/.gitignore b/.gitignore index f26ea23d91..c88a0f5844 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,6 @@ experiments/* # !experiments/pipeline/ # !experiments/pipeline/* secrets.toml -!docs/**/secrets.toml *.session.sql *.duckdb *.wal @@ -141,4 +140,7 @@ tmp **/tmp # Qdrant embedding models cache -local_cache/ \ No newline at end of file +local_cache/ + +# test file for examples are generated and should not be committed +docs/examples/**/test*.py \ No newline at end of file diff --git a/Makefile b/Makefile index ebf633d1eb..4f7701e176 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ help: @echo " tests all components using local destinations: duckdb and postgres" @echo " test-common" @echo " tests common components" - @echo " test-and-lint-snippets" + @echo " lint-and-test-snippets" @echo " tests and lints snippets and examples in docs" @echo " build-library" @echo " makes dev and then builds dlt package for distribution" @@ -60,12 +60,22 @@ format: poetry run black dlt docs tests --exclude=".*syntax_error.py|\.venv.*|_storage/.*" # poetry run isort ./ -test-and-lint-snippets: +lint-and-test-snippets: cd docs/tools && poetry run python check_embedded_snippets.py full poetry run mypy --config-file mypy.ini docs/website docs/examples docs/tools --exclude docs/tools/lint_setup poetry run flake8 --max-line-length=200 docs/website docs/examples docs/tools cd docs/website/docs && poetry run pytest --ignore=node_modules +lint-and-test-examples: + poetry run mypy --config-file mypy.ini docs/examples + poetry run flake8 --max-line-length=200 docs/examples + cd docs/tools && poetry run python prepare_examples_tests.py + cd docs/examples && poetry run pytest + + +test-examples: + cd docs/examples && poetry run pytest + lint-security: poetry run bandit -r dlt/ -n 3 -l diff --git a/docs/examples/custom_destination_bigquery/.dlt/config.toml b/docs/examples/.dlt/config.toml similarity index 100% rename from docs/examples/custom_destination_bigquery/.dlt/config.toml rename to docs/examples/.dlt/config.toml diff --git 
a/docs/examples/CONTRIBUTING.md b/docs/examples/CONTRIBUTING.md index 3837300a2b..625a09d9c0 100644 --- a/docs/examples/CONTRIBUTING.md +++ b/docs/examples/CONTRIBUTING.md @@ -4,50 +4,27 @@ Note: All paths in this guide are relative to the `dlt` repository directory. ## Add snippet -- Go to `docs/website/docs/examples/`. -- Copy one of the examples, rename scripts. -- Modify the script in `/code/-snippets.py`: - - The whole example code should be inside of `def _snippet()` function. - - Use tags `# @@@DLT_SNIPPET_START example` and `# @@@DLT_SNIPPET_END example` to indicate which part of the code will be auto-generated in the final script `docs/examples//.py`. - - Use additional tags as `# @@@DLT_SNIPPET_START smal_part_of_code` to indicate which part of the code will be auto-inserted into a text document `docs/website/docs/examples//index.md` in the form of a code snippet. -- Modify .`dlt/secrets.toml` and `configs.toml` if needed. -- Modify `/index.md`: - - In the section `
`.
- List what users will learn from this example. Use bullet points and link corresponding documentation pages.
- Use tags `` to insert example code snippets. Do not write them manually!
-
-## Add tests
-
-- Do not forget to add tests to `/code/-snippets.py`.
-- They could be short asserts, code should work.
-- Use `# @@@DLT_REMOVE` to remove test code from final code example.
-- Test your snippets locally first with command:
-
-  `cd docs/website/docs/examples//code && pytest --ignore=node_modules -s -v`.
-- Add `@skipifgithubfork` decorator to your main snippet function, look [example](https://github.com/dlt-hub/dlt/blob/master/docs/website/docs/examples/chess_production/code/chess-snippets.py#L1-L4).
-
-## Run npm start
+- Go to `docs/examples/`.
+- Copy the template in `./_template/..`.
+- Make sure the folder and your example script have the same name.
+- Update the doc string, which will comprise the generated markdown file; check the other examples to see how it is done.
+- If your example requires any secrets, add the vars to the example.secrets.toml but do not enter the values.
+- Add your example code and make sure you have an `if __name__ == "__main__":` clause in which you run the example script; this will be used for testing.
+- You should add one or two assertions after running your example and maybe also `load_info.raise_on_failed_jobs()`; this will help greatly with testing.
+
+## Testing
+- You can test your example simply by running your example script from your example folder. On CI a test will be automatically generated (a sketch of such a generated test is shown below).
+
+## Checking your generated markdown

The command `npm start` starts a local development server and opens up a browser window.

- To install npm read [README](../website/README.md).
-- This command will generate a clean example script in `docs/examples/` folder based on `docs/website/docs/examples//code/-snippets.py`.
-- Also, this command automatically inserts code snippets to `docs/website/docs/examples//index.md`.
+- You should see your example automatically added to the examples section in the local version of the docs. Check the rendered output and see whether it looks the way you intended.

## Add ENV variables

-If you use any secrets for the code snippets, e.g. Zendesk requires credentials. You need to add them to GitHub Actions in ENV style:
-
-- First, add the variables to `.github/workflows/test_doc_snippets.yml`:
-
-  Example:
-
-  ```yaml
-  # zendesk vars for example
-  SOURCES__ZENDESK__CREDENTIALS: ${{ secrets.ZENDESK__CREDENTIALS }}
-  ```
-
-- Ask dlt team to add them to the GitHub Secrets.
+If your example needs any secrets (e.g. Zendesk requires credentials), please talk to us. We will add them to our google secrets vault.
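For orientation, the file that CI generates from an example's `__main__` block looks roughly like the sketch below. This is only an illustration: the example name `my_example` is made up, and the real file is produced by `docs/tools/prepare_examples_tests.py`, which copies the example's own imports and body verbatim.

```py
# test_my_example.py -- sketch of an auto-generated example test (hypothetical example name)
import dlt

from tests.utils import skipifgithubfork


@skipifgithubfork
def test_my_example():
    # body copied from the example's `if __name__ == "__main__":` block
    pipeline = dlt.pipeline(
        pipeline_name="example_pipeline", destination="duckdb", dataset_name="example_data"
    )
    load_info = pipeline.run([1, 2, 3], table_name="player")
    print(load_info)

    # make sure nothing failed
    load_info.raise_on_failed_jobs()
```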
## Add dependencies diff --git a/docs/website/docs/examples/chess_production/__init__.py b/docs/examples/__init__.py similarity index 100% rename from docs/website/docs/examples/chess_production/__init__.py rename to docs/examples/__init__.py diff --git a/docs/examples/google_sheets/.dlt/config.toml b/docs/examples/_template/.dlt/config.toml similarity index 100% rename from docs/examples/google_sheets/.dlt/config.toml rename to docs/examples/_template/.dlt/config.toml diff --git a/docs/examples/incremental_loading/.dlt/config.toml b/docs/examples/_template/.dlt/example.secrets.toml similarity index 100% rename from docs/examples/incremental_loading/.dlt/config.toml rename to docs/examples/_template/.dlt/example.secrets.toml diff --git a/docs/website/docs/examples/chess_production/code/__init__.py b/docs/examples/_template/__init__.py similarity index 100% rename from docs/website/docs/examples/chess_production/code/__init__.py rename to docs/examples/_template/__init__.py diff --git a/docs/examples/_template/_template.py b/docs/examples/_template/_template.py new file mode 100644 index 0000000000..cdd38f8204 --- /dev/null +++ b/docs/examples/_template/_template.py @@ -0,0 +1,30 @@ +""" +--- +title: Example Template +description: Add desciption here +keywords: [example] +--- + +This is a template for a new example. This text will show up in the docs. + +With this example you will learn to: + +* One +* two +* Three + +""" + +import dlt + +if __name__ == "__main__": + # run a pipeline + pipeline = dlt.pipeline( + pipeline_name="example_pipeline", destination="duckdb", dataset_name="example_data" + ) + # Extract, normalize, and load the data + load_info = pipeline.run([1, 2, 3], table_name="player") + print(load_info) + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/chess/chess.py b/docs/examples/chess/chess.py index 84fbf3cb07..df1fb18845 100644 --- a/docs/examples/chess/chess.py +++ b/docs/examples/chess/chess.py @@ -1,4 +1,3 @@ -import os import threading from typing import Any, Iterator @@ -49,12 +48,14 @@ def players_games(username: Any) -> Iterator[TDataItems]: if __name__ == "__main__": print("You must run this from the docs/examples/chess folder") - assert os.getcwd().endswith("chess") # chess_url in config.toml, credentials for postgres in secrets.toml, credentials always under credentials key # look for parallel run configuration in `config.toml`! 
# mind the full_refresh: it makes the pipeline to load to a distinct dataset each time it is run and always is resetting the schema and state - info = dlt.pipeline( + load_info = dlt.pipeline( pipeline_name="chess_games", destination="postgres", dataset_name="chess", full_refresh=True ).run(chess(max_players=5, month=9)) # display where the data went - print(info) + print(load_info) + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/chess_production/.dlt/config.toml b/docs/examples/chess_production/.dlt/config.toml index e69de29bb2..898e4d9042 100644 --- a/docs/examples/chess_production/.dlt/config.toml +++ b/docs/examples/chess_production/.dlt/config.toml @@ -0,0 +1 @@ +chess_url="https://api.chess.com/pub/" diff --git a/docs/examples/chess_production/chess.py b/docs/examples/chess_production/chess_production.py similarity index 84% rename from docs/examples/chess_production/chess.py rename to docs/examples/chess_production/chess_production.py index e2d0b9c10d..c0f11203c8 100644 --- a/docs/examples/chess_production/chess.py +++ b/docs/examples/chess_production/chess_production.py @@ -1,10 +1,38 @@ +""" +--- +title: Run chess pipeline in production +description: Learn how run chess pipeline in production +keywords: [incremental loading, example] +--- + +In this example, you'll find a Python script that interacts with the Chess API to extract players and game data. + +We'll learn how to: + +- Inspecting packages after they have been loaded. +- Loading back load information, schema updates, and traces. +- Triggering notifications in case of schema evolution. +- Using context managers to independently retry pipeline stages. +- Run basic tests utilizing `sql_client` and `normalize_info`. + +""" + import threading from typing import Any, Iterator +from tenacity import ( + Retrying, + retry_if_exception, + stop_after_attempt, + wait_exponential, +) + import dlt -from dlt.common import sleep +from dlt.common import sleep, logger from dlt.common.typing import StrAny, TDataItems from dlt.sources.helpers.requests import client +from dlt.pipeline.helpers import retry_load +from dlt.common.runtime.slack import send_slack_message @dlt.source @@ -44,17 +72,6 @@ def players_games(username: Any) -> Iterator[TDataItems]: return players(), players_profiles, players_games -from tenacity import ( - Retrying, - retry_if_exception, - stop_after_attempt, - wait_exponential, -) - -from dlt.common import logger -from dlt.common.runtime.slack import send_slack_message -from dlt.pipeline.helpers import retry_load - MAX_PLAYERS = 5 @@ -107,6 +124,7 @@ def load_data_with_retry(pipeline, data): logger.info("Warning: No data in players table") else: logger.info(f"Players table contains {count} rows") + assert count == MAX_PLAYERS # To run simple tests with `normalize_info`, such as checking table counts and # warning if there is no data, you can use the `row_counts` attribute. 
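# To make the two kinds of checks mentioned above concrete, here is a minimal sketch, assuming the
# `players` table from this example and a pipeline that has already run. The first check queries
# the destination through `sql_client`; the second only reads the row counts collected by the
# normalize step of the last run.
import dlt


def check_players_table(pipeline: dlt.Pipeline) -> None:
    # query the destination directly through the pipeline's sql_client
    with pipeline.sql_client() as client:
        rows = client.execute_sql("SELECT COUNT(1) FROM players")
        print(f"players table contains {rows[0][0]} rows")

    # the same information without querying the destination:
    # row counts gathered during the normalize stage of the last run
    row_counts = pipeline.last_trace.last_normalize_info.row_counts
    print(f"normalize step reported {row_counts.get('players')} player rows")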
@@ -116,13 +134,16 @@ def load_data_with_retry(pipeline, data): logger.info("Warning: No data in players table") else: logger.info(f"Players table contains {count} rows") + assert count == MAX_PLAYERS # we reuse the pipeline instance below and load to the same dataset as data logger.info("Saving the load info in the destination") pipeline.run([load_info], table_name="_load_info") + assert "_load_info" in pipeline.last_trace.last_normalize_info.row_counts # save trace to destination, sensitive data will be removed logger.info("Saving the trace in the destination") pipeline.run([pipeline.last_trace], table_name="_trace") + assert "_trace" in pipeline.last_trace.last_normalize_info.row_counts # print all the new tables/columns in for package in load_info.load_packages: @@ -134,6 +155,7 @@ def load_data_with_retry(pipeline, data): # save the new tables and column schemas to the destination: table_updates = [p.asdict()["tables"] for p in load_info.load_packages] pipeline.run(table_updates, table_name="_new_tables") + assert "_new_tables" in pipeline.last_trace.last_normalize_info.row_counts return load_info @@ -146,5 +168,8 @@ def load_data_with_retry(pipeline, data): dataset_name="chess_data", ) # get data for a few famous players - data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS) - load_data_with_retry(pipeline, data) + data = chess(max_players=MAX_PLAYERS) + load_info = load_data_with_retry(pipeline, data) + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py new file mode 100644 index 0000000000..87ccffe53b --- /dev/null +++ b/docs/examples/conftest.py @@ -0,0 +1,57 @@ +import os +import pytest +from unittest.mock import patch + +from dlt.common.configuration.container import Container +from dlt.common.configuration.providers import ( + ConfigTomlProvider, + EnvironProvider, + SecretsTomlProvider, + StringTomlProvider, +) +from dlt.common.configuration.specs.config_providers_context import ( + ConfigProvidersContext, +) +from dlt.common.utils import set_working_dir + +from tests.utils import ( + patch_home_dir, + autouse_test_storage, + preserve_environ, + duckdb_pipeline_location, + wipe_pipeline, +) + + +@pytest.fixture(autouse=True) +def setup_secret_providers(request): + """Creates set of config providers where tomls are loaded from tests/.dlt""" + secret_dir = "./.dlt" + dname = os.path.dirname(request.module.__file__) + config_dir = dname + "/.dlt" + + # inject provider context so the original providers are restored at the end + def _initial_providers(): + return [ + EnvironProvider(), + SecretsTomlProvider(project_dir=secret_dir, add_global_config=False), + ConfigTomlProvider(project_dir=config_dir, add_global_config=False), + ] + + glob_ctx = ConfigProvidersContext() + glob_ctx.providers = _initial_providers() + + with set_working_dir(dname), Container().injectable_context(glob_ctx), patch( + "dlt.common.configuration.specs.config_providers_context.ConfigProvidersContext.initial_providers", + _initial_providers, + ): + # extras work when container updated + glob_ctx.add_extras() + yield + + +def pytest_configure(config): + # push sentry to ci + os.environ["RUNTIME__SENTRY_DSN"] = ( + "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + ) diff --git a/docs/examples/connector_x_arrow/connector_x_arrow.py b/docs/examples/connector_x_arrow/connector_x_arrow.py new file mode 100644 index 0000000000..9603fb2ba0 --- /dev/null +++ 
b/docs/examples/connector_x_arrow/connector_x_arrow.py @@ -0,0 +1,72 @@ +""" +--- +title: Load mysql table with ConnectorX & Arrow +description: Load data from sql queries fast with connector x and arrow tables +keywords: [connector x, pyarrow, zero copy] +--- + +The example script below takes genome data from public **mysql** instance and then loads it into **duckdb**. Mind that your destination +must support loading of parquet files as this is the format that `dlt` uses to save arrow tables. [Connector X](https://github.com/sfu-db/connector-x) allows to +get data from several popular databases and creates in memory Arrow table which `dlt` then saves to load package and loads to the destination. +:::tip +You can yield several tables if your data is large and you need to partition your load. +::: + +We'll learn: + +- How to get arrow tables from [connector X](https://github.com/sfu-db/connector-x) and yield them. +- That merge and incremental loads work with arrow tables. +- How to enable [incremental loading](../general-usage/incremental-loading) for efficient data extraction. +- How to use build in ConnectionString credentials. + +""" + +import connectorx as cx + +import dlt +from dlt.sources.credentials import ConnectionStringCredentials + + +def read_sql_x( + conn_str: ConnectionStringCredentials = dlt.secrets.value, + query: str = dlt.config.value, +): + yield cx.read_sql( + conn_str.to_native_representation(), + query, + return_type="arrow2", + protocol="binary", + ) + + +def genome_resource(): + # create genome resource with merge on `upid` primary key + genome = dlt.resource( + name="genome", + write_disposition="merge", + primary_key="upid", + standalone=True, + )(read_sql_x)( + "mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type] + "SELECT * FROM genome ORDER BY created LIMIT 1000", + ) + # add incremental on created at + genome.apply_hints(incremental=dlt.sources.incremental("created")) + return genome + + +if __name__ == "__main__": + pipeline = dlt.pipeline(destination="duckdb") + genome = genome_resource() + + load_info = pipeline.run(genome) + print(load_info) + print(pipeline.last_trace.last_normalize_info) + # NOTE: run pipeline again to see that no more records got loaded thanks to incremental loading + + # check that stuff was loaded + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["genome"] == 1000 + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/connector_x_arrow/load_arrow.py b/docs/examples/connector_x_arrow/load_arrow.py deleted file mode 100644 index b3c654cef9..0000000000 --- a/docs/examples/connector_x_arrow/load_arrow.py +++ /dev/null @@ -1,41 +0,0 @@ -import connectorx as cx - -import dlt -from dlt.sources.credentials import ConnectionStringCredentials - - -def read_sql_x( - conn_str: ConnectionStringCredentials = dlt.secrets.value, - query: str = dlt.config.value, -): - yield cx.read_sql( - conn_str.to_native_representation(), - query, - return_type="arrow2", - protocol="binary", - ) - - -def genome_resource(): - # create genome resource with merge on `upid` primary key - genome = dlt.resource( - name="genome", - write_disposition="merge", - primary_key="upid", - standalone=True, - )(read_sql_x)( - "mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type] - "SELECT * FROM genome ORDER BY created LIMIT 1000", - ) - # add incremental on created at - genome.apply_hints(incremental=dlt.sources.incremental("created")) - return genome - 
- -if __name__ == "__main__": - pipeline = dlt.pipeline(destination="duckdb") - genome = genome_resource() - - print(pipeline.run(genome)) - print(pipeline.last_trace.last_normalize_info) - # NOTE: run pipeline again to see that no more records got loaded thanks to incremental loading diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index 624888f70a..ea60b9b00d 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -1,3 +1,20 @@ +""" +--- +title: Custom destination with BigQuery +description: Learn how use the custom destination to load to bigquery and use credentials +keywords: [destination, credentials, example, bigquery, custom destination] +--- + +In this example, you'll find a Python script that demonstrates how to load to bigquey with the custom destination. + +We'll learn how to: +- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials) +- use the [custom destination](../dlt-ecosystem/destinations/destination.md) +- Use pyarrow tables to create complex column types on bigquery +- Use bigquery `autodetect=True` for schema inference from parquet files + +""" + import dlt import pandas as pd import pyarrow as pa @@ -72,3 +89,6 @@ def bigquery_insert( load_info = pipeline.run(resource(url=OWID_DISASTERS_URL)) print(load_info) + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/google_sheets/.dlt/example.secrets.toml b/docs/examples/google_sheets/.dlt/example.secrets.toml index 42feceddfc..d86bcc9e21 100644 --- a/docs/examples/google_sheets/.dlt/example.secrets.toml +++ b/docs/examples/google_sheets/.dlt/example.secrets.toml @@ -2,15 +2,15 @@ [sources.google_sheets] credentials=''' { -"type": "set me up!", -"project_id": "set me up!", -"private_key_id": "set me up!", -"private_key": "set me up!", -"client_email": "set me up!", -"client_id": "set me up!", -"auth_uri": "https://accounts.google.com/o/oauth2/auth", -"token_uri": "https://oauth2.googleapis.com/token", -"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", -"client_x509_cert_url": "set me up!" -} -''' \ No newline at end of file + "type": "set me up!", + "project_id": "set me up!", + "private_key_id": "set me up!", + "private_key": "set me up!", + "client_email": "set me up!", + "client_id": "set me up!", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "set me up!" + } +''' diff --git a/docs/examples/google_sheets/google_sheets.py b/docs/examples/google_sheets/google_sheets.py index 1ba330e4ca..1a3923136c 100644 --- a/docs/examples/google_sheets/google_sheets.py +++ b/docs/examples/google_sheets/google_sheets.py @@ -1,3 +1,26 @@ +""" +--- +title: Google Sheets minimal example +description: Learn how work with Google services +keywords: [google sheets, credentials, example] +--- + +In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the `dlt` library. 
+ +We'll learn how to: +- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials); +- use [union of credentials](../general-usage/credentials/config_specs#working-with-alternatives-of-credentials-union-types); +- create [dynamically generated resources](../general-usage/source#create-resources-dynamically). + +:::tip +This example is for educational purposes. For best practices, we recommend using [Google Sheets verified source](../dlt-ecosystem/verified-sources/google_sheets.md). +::: + +""" + +# NOTE: this line is only for dlt CI purposes, you may delete it if you are using this example +__source_name__ = "google_sheets" + from typing import Any, Iterator, Sequence, Union, cast from googleapiclient.discovery import build @@ -64,10 +87,18 @@ def get_sheet(sheet_name: str) -> Iterator[DictStrAny]: sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" range_names = ["hidden_columns_merged_cells", "Blank Columns"] # "2022-05", "model_metadata" - info = pipeline.run( + load_info = pipeline.run( google_spreadsheet( spreadsheet_id=sheet_id, sheet_names=range_names, ) ) - print(info) + print(load_info) + + row_counts = pipeline.last_trace.last_normalize_info.row_counts + print(row_counts.keys()) + assert row_counts["hidden_columns_merged_cells"] == 7 + assert row_counts["blank_columns"] == 21 + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/incremental_loading/.dlt/example.secrets.toml b/docs/examples/incremental_loading/.dlt/example.secrets.toml index 4dec919c06..7468b6d9b4 100644 --- a/docs/examples/incremental_loading/.dlt/example.secrets.toml +++ b/docs/examples/incremental_loading/.dlt/example.secrets.toml @@ -1,4 +1,4 @@ [sources.zendesk.credentials] password = "" subdomain = "" -email = "" \ No newline at end of file +email = "" diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/incremental_loading.py similarity index 74% rename from docs/examples/incremental_loading/zendesk.py rename to docs/examples/incremental_loading/incremental_loading.py index 6113f98793..f1de4eecfe 100644 --- a/docs/examples/incremental_loading/zendesk.py +++ b/docs/examples/incremental_loading/incremental_loading.py @@ -1,3 +1,25 @@ +""" +--- +title: Load Zendesk tickets incrementally +description: Learn how do incremental loading in consecutive runs +keywords: [incremental loading, example] +--- + +In this example, you'll find a Python script that interacts with the Zendesk Support API to extract ticket events data. + +We'll learn: + +- How to pass [credentials](../general-usage/credentials) as dict and how to type the `@dlt.source` function arguments. +- How to set [the nesting level](../general-usage/source#reduce-the-nesting-level-of-generated-tables). +- How to enable [incremental loading](../general-usage/incremental-loading) for efficient data extraction. +- How to specify [the start and end dates](../general-usage/incremental-loading#using-dltsourcesincremental-for-backfill) for the data loading and how to [opt-in to Airflow scheduler](../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) by setting `allow_external_schedulers` to `True`. +- How to work with timestamps, specifically converting them to Unix timestamps for incremental data extraction. +- How to use the `start_time` parameter in API requests to retrieve data starting from a specific timestamp. 
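Stripped of the Zendesk specifics, the incremental pattern can be sketched in a few lines. The `fetch_events` helper and the `updated_at` cursor field below are made-up placeholders for illustration; the example itself applies the same idea to ticket events with explicit start and end dates:

```py
import dlt


@dlt.resource(primary_key="id", write_disposition="append")
def events(
    updated_at=dlt.sources.incremental("updated_at", initial_value="2023-01-01T00:00:00Z")
):
    # `updated_at.last_value` is the highest cursor value seen in previous runs,
    # so each run only requests records that are newer than that
    yield from fetch_events(since=updated_at.last_value)  # hypothetical API helper
```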
+""" + +# NOTE: this line is only for dlt CI purposes, you may delete it if you are using this example +__source_name__ = "zendesk" + from typing import Optional, Dict, Any, Tuple import dlt @@ -121,3 +143,10 @@ def get_pages( load_info = pipeline.run(zendesk_support()) print(load_info) + + # check that stuff was loaded + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["ticket_events"] == 17 + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/nested_data/.dlt/config.toml b/docs/examples/nested_data/.dlt/config.toml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/examples/nested_data/.dlt/example.secrets.toml b/docs/examples/nested_data/.dlt/example.secrets.toml index d014b4389a..5fba089a18 100644 --- a/docs/examples/nested_data/.dlt/example.secrets.toml +++ b/docs/examples/nested_data/.dlt/example.secrets.toml @@ -1,2 +1,2 @@ [sources.mongodb] -connection_url="" \ No newline at end of file +connection_url="" diff --git a/docs/examples/nested_data/nested_data.py b/docs/examples/nested_data/nested_data.py index 7f85f0522e..afda16a51a 100644 --- a/docs/examples/nested_data/nested_data.py +++ b/docs/examples/nested_data/nested_data.py @@ -1,3 +1,24 @@ +""" +--- +title: Control nested MongoDB data +description: Learn how control nested data +keywords: [incremental loading, example] +--- + +In this example, you'll find a Python script that demonstrates how to control nested data using the `dlt` library. + +We'll learn how to: +- [Adjust maximum nesting level in three ways:](../general-usage/source#reduce-the-nesting-level-of-generated-tables) + - Limit nesting levels with dlt decorator. + - Dynamic nesting level adjustment. + - Apply data type hints. +- Work with [MongoDB](../dlt-ecosystem/verified-sources/mongodb) in Python and `dlt`. +- Enable [incremental loading](../general-usage/incremental-loading) for efficient data extraction. +""" + +# NOTE: this line is only for dlt CI purposes, you may delete it if you are using this example +__source_name__ = "mongodb" + from itertools import islice from typing import Any, Dict, Iterator, Optional @@ -103,6 +124,12 @@ def convert_mongo_objs(value: Any) -> Any: source_data = mongodb_collection(collection="movies", write_disposition="replace") load_info = pipeline.run(source_data) print(load_info) + tables = pipeline.last_trace.last_normalize_info.row_counts + tables.pop("_dlt_pipeline_state") + assert len(tables) == 7, pipeline.last_trace.last_normalize_info + + # make sure nothing failed + load_info.raise_on_failed_jobs() # The second method involves setting the max_table_nesting attribute directly # on the source data object. @@ -118,6 +145,12 @@ def convert_mongo_objs(value: Any) -> Any: source_data.max_table_nesting = 0 load_info = pipeline.run(source_data) print(load_info) + tables = pipeline.last_trace.last_normalize_info.row_counts + tables.pop("_dlt_pipeline_state") + assert len(tables) == 1, pipeline.last_trace.last_normalize_info + + # make sure nothing failed + load_info.raise_on_failed_jobs() # The third method involves applying data type hints to specific columns in the data. 
# In this case, we tell dlt that column 'cast' (containing a list of actors) @@ -132,3 +165,9 @@ def convert_mongo_objs(value: Any) -> Any: source_data.movies.apply_hints(columns={"cast": {"data_type": "complex"}}) load_info = pipeline.run(source_data) print(load_info) + tables = pipeline.last_trace.last_normalize_info.row_counts + tables.pop("_dlt_pipeline_state") + assert len(tables) == 6, pipeline.last_trace.last_normalize_info + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_2.txt b/docs/examples/pdf_to_weaviate/assets/invoices/invoice_2.txt similarity index 100% rename from docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_2.txt rename to docs/examples/pdf_to_weaviate/assets/invoices/invoice_2.txt diff --git a/docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_20230831-p1.pdf b/docs/examples/pdf_to_weaviate/assets/invoices/invoice_20230831-p1.pdf similarity index 100% rename from docs/website/docs/examples/pdf_to_weaviate/code/assets/invoices/invoice_20230831-p1.pdf rename to docs/examples/pdf_to_weaviate/assets/invoices/invoice_20230831-p1.pdf diff --git a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py index e7f57853ed..809a6cfbd6 100644 --- a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py +++ b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py @@ -1,5 +1,29 @@ -import os +""" +--- +title: Load PDFs to Weaviate +description: Extract text from PDF and load it into a vector database +keywords: [pdf, weaviate, vector store, vector database, ] +--- + +We'll use PyPDF2 to extract text from PDFs. Make sure you have it installed: + +```sh +pip install PyPDF2 +``` + +We start with a simple resource that lists files in specified folder. To that we add a **filter** function that removes all files that are not pdfs. + +To parse PDFs we use [PyPDF](https://pypdf2.readthedocs.io/en/3.0.0/user/extract-text.html) and return each page from a given PDF as separate data item. +Parsing happens in `@dlt.transformer` which receives data from `list_files` resource. It splits PDF into pages, extracts text and yields pages separately +so each PDF will correspond to many items in Weaviate `InvoiceText` class. We set the primary key and use merge disposition so if the same PDF comes twice +we'll just update the vectors, and not duplicate. + +Look how we pipe data from `list_files` resource (note that resource is deselected so we do not load raw file items to destination) into `pdf_to_text` using **|** operator. 
+ +""" + +import os import dlt from dlt.destinations.impl.weaviate import weaviate_adapter from PyPDF2 import PdfReader @@ -31,27 +55,31 @@ def pdf_to_text(file_item, separate_pages: bool = False): yield page_item -pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") +if __name__ == "__main__": + pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") + + # this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" + # (3) sends them to pdf_to_text transformer with pipe (|) operator + pdf_pipeline = list_files("assets/invoices").add_filter( + lambda item: item["file_name"].endswith(".pdf") + ) | pdf_to_text(separate_pages=True) -# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" -# (3) sends them to pdf_to_text transformer with pipe (|) operator -pdf_pipeline = list_files("assets/invoices").add_filter( - lambda item: item["file_name"].endswith(".pdf") -) | pdf_to_text(separate_pages=True) + # set the name of the destination table to receive pages + # NOTE: Weaviate, dlt's tables are mapped to classes + pdf_pipeline.table_name = "InvoiceText" -# set the name of the destination table to receive pages -# NOTE: Weaviate, dlt's tables are mapped to classes -pdf_pipeline.table_name = "InvoiceText" + # use weaviate_adapter to tell destination to vectorize "text" column + load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) + row_counts = pipeline.last_trace.last_normalize_info + print(row_counts) + print("------") + print(load_info) -# use weaviate_adapter to tell destination to vectorize "text" column -load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) -row_counts = pipeline.last_trace.last_normalize_info -print(row_counts) -print("------") -print(load_info) + import weaviate -import weaviate + client = weaviate.Client("http://localhost:8080") + # get text of all the invoices in InvoiceText class we just created above + print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) -client = weaviate.Client("http://localhost:8080") -# get text of all the invoices in InvoiceText class we just created above -print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/qdrant_zendesk/.dlt/example.secrets.toml b/docs/examples/qdrant_zendesk/.dlt/example.secrets.toml index 623033d4d0..c34d519aa5 100644 --- a/docs/examples/qdrant_zendesk/.dlt/example.secrets.toml +++ b/docs/examples/qdrant_zendesk/.dlt/example.secrets.toml @@ -5,4 +5,4 @@ api_key = "" [sources.zendesk.credentials] password = "" subdomain = "" -email = "" \ No newline at end of file +email = "" diff --git a/docs/examples/qdrant_zendesk/qdrant.py b/docs/examples/qdrant_zendesk/qdrant_zendesk.py similarity index 79% rename from docs/examples/qdrant_zendesk/qdrant.py rename to docs/examples/qdrant_zendesk/qdrant_zendesk.py index bd0cbafc99..65f399104a 100644 --- a/docs/examples/qdrant_zendesk/qdrant.py +++ b/docs/examples/qdrant_zendesk/qdrant_zendesk.py @@ -1,3 +1,33 @@ +""" +--- +title: Similarity Searching with Qdrant +description: Learn how to use the dlt source, Zendesk and dlt destination, Qdrant to conduct a similarity search on your tickets data. 
+keywords: [similarity search, example] +--- + +This article outlines a system to map vectorized ticket data from Zendesk to Qdrant, similar to our guide on the topic concerning [Weaviate](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant). In this example, we will: +- Connect to our [Zendesk source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk). +- Extract tickets data from our Zendesk source. +- [Create a dlt pipeline](https://dlthub.com/docs/walkthroughs/create-a-pipeline) with Qdrant as destination. +- Vectorize/embed the tickets data from Zendesk. +- Pass the vectorized data to be stored in Qdrant via the dlt pipeline. +- Query data that we stored in Qdrant. +- Explore the similarity search results. + +First, configure the destination credentials for [Qdrant](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant#setup-guide) and [Zendesk](https://dlthub.com/docs/walkthroughs/zendesk-weaviate#configuration) in `.dlt/secrets.toml`. + +Next, make sure you have the following dependencies installed: + +```sh +pip install qdrant-client>=1.6.9 +pip install fastembed>=0.1.1 +``` + +""" + +# NOTE: this line is only for dlt CI purposes, you may delete it if you are using this example +__source_name__ = "zendesk" + from typing import Optional, Dict, Any, Tuple import dlt @@ -148,6 +178,9 @@ def get_pages( print(load_info) + # make sure nothing failed + load_info.raise_on_failed_jobs() + # running the Qdrant client to connect to your Qdrant database @with_config(sections=("destination", "qdrant", "credentials")) @@ -169,3 +202,8 @@ def get_qdrant_client(location=dlt.secrets.value, api_key=dlt.secrets.value): query_text=["cancel", "cancel subscription"], # prompt to search limit=3, # limit the number of results to the nearest 3 embeddings ) + + assert len(response) <= 3 and len(response) > 0 + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/examples/transformers/.dlt/config.toml b/docs/examples/transformers/.dlt/config.toml index a366f34edf..251808e8ef 100644 --- a/docs/examples/transformers/.dlt/config.toml +++ b/docs/examples/transformers/.dlt/config.toml @@ -13,4 +13,4 @@ workers=3 [load] # have 50 concurrent load jobs -workers=50 \ No newline at end of file +workers=50 diff --git a/docs/examples/transformers/pokemon.py b/docs/examples/transformers/transformers.py similarity index 64% rename from docs/examples/transformers/pokemon.py rename to docs/examples/transformers/transformers.py index ca32c570ef..14d23de12d 100644 --- a/docs/examples/transformers/pokemon.py +++ b/docs/examples/transformers/transformers.py @@ -1,11 +1,28 @@ +""" +--- +title: Pokemon details in parallel using transformers +description: Learn how to use dlt transformers and how to speed up your loads with parallelism +keywords: [transformers, parallelism, example] +--- + +For this example, we will be loading Pokemon data from the [PokeAPI](https://pokeapi.co/) with the help of transformers to load +Pokemon details in parallel. 
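+
+As a rough, self-contained sketch of that parallelism pattern (the resource and transformer names below are made up for illustration, and `dlt[duckdb]` is assumed to be installed), it could look like this:
+
+```py
+import dlt
+from dlt.sources.helpers import requests
+
+# deselected: we only want the transformer output, not the raw URLs
+@dlt.resource(selected=False)
+def pokemon_urls():
+    yield "https://pokeapi.co/api/v2/pokemon/1"
+    yield "https://pokeapi.co/api/v2/pokemon/2"
+
+@dlt.transformer
+@dlt.defer
+def pokemon_details(url):
+    # each deferred call is submitted to a thread pool, so the HTTP requests run in parallel
+    return requests.get(url).json()
+
+if __name__ == "__main__":
+    pipeline = dlt.pipeline(pipeline_name="parallel_sketch", destination="duckdb")
+    # only the `pokemon_details` rows are loaded, because the feeding resource is deselected
+    print(pipeline.run(pokemon_urls() | pokemon_details))
+```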
+ +We'll learn how to: +- create 2 [transformers](../general-usage/resource.md#feeding-data-from-one-resource-into-another) and connect them to a resource with the pipe operator `|`; +- [load these transformers in parallel](../reference/performance.md#parallelism) using the `@dlt.defer` decorator; +- [configure parallelism](../reference/performance.md#parallel-pipeline-config-example) in the `config.toml` file; +- deselect the main resource, so it will not be loaded into the database; +- importing and using a pre-configured `requests` library with automatic retries (`from dlt.sources.helpers import requests`). + +""" + import dlt from dlt.sources.helpers import requests @dlt.source(max_table_nesting=2) def source(pokemon_api_url: str): - """""" - # note that we deselect `pokemon_list` - we do not want it to be loaded @dlt.resource(write_disposition="replace", selected=False) def pokemon_list(): @@ -55,3 +72,12 @@ def species(pokemon_details): # the pokemon_list resource does not need to be loaded load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) print(load_info) + + # verify that all went well + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["pokemon"] == 20 + assert row_counts["species"] == 20 + assert "pokemon_list" not in row_counts + + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/tools/prepare_examples_tests.py b/docs/tools/prepare_examples_tests.py new file mode 100644 index 0000000000..c34c7dffd6 --- /dev/null +++ b/docs/tools/prepare_examples_tests.py @@ -0,0 +1,76 @@ +""" +Creates the pytest files for our examples tests. These will not be committed +""" +import os +import argparse + +import dlt.cli.echo as fmt + +EXAMPLES_DIR = "../examples" + +# settings +SKIP_FOLDERS = ["archive", ".", "_", "local_cache"] + +# the entry point for the script +MAIN_CLAUSE = 'if __name__ == "__main__":' + +# some stuff to insert for setting up and tearing down fixtures +TEST_HEADER = """ +from tests.utils import skipifgithubfork + +""" + + +if __name__ == "__main__": + # setup cli + parser = argparse.ArgumentParser( + description="Prepares examples in docs/examples for testing.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-c", "--clear", help="Remove all generated test files", action="store_true" + ) + + # get args + args = parser.parse_args() + + count = 0 + for example in next(os.walk(EXAMPLES_DIR))[1]: + # skip some + if any(map(lambda skip: example.startswith(skip), SKIP_FOLDERS)): + continue + + count += 1 + example_file = f"{EXAMPLES_DIR}/{example}/{example}.py" + test_example_file = f"{EXAMPLES_DIR}/{example}/test_{example}.py" + + if args.clear: + os.unlink(test_example_file) + continue + + with open(example_file, "r", encoding="utf-8") as f: + lines = f.read().split("\n") + + processed_lines = TEST_HEADER.split("\n") + main_clause_found = False + + for line in lines: + # convert the main clause to a test function + if line.startswith(MAIN_CLAUSE): + main_clause_found = True + processed_lines.append("@skipifgithubfork") + processed_lines.append(f"def test_{example}():") + else: + processed_lines.append(line) + + if not main_clause_found: + fmt.error(f"No main clause defined for example {example}") + exit(1) + + with open(test_example_file, "w", encoding="utf-8") as f: + f.write("\n".join(processed_lines)) + + if args.clear: + fmt.note("Cleared generated test files.") + else: + fmt.note(f"Prepared {count} examples for testing.") diff --git 
a/docs/website/docs/examples/_examples-header.md b/docs/website/docs/examples/_examples-header.md deleted file mode 100644 index 5632f57b35..0000000000 --- a/docs/website/docs/examples/_examples-header.md +++ /dev/null @@ -1,21 +0,0 @@ -import Admonition from "@theme/Admonition"; -import CodeBlock from '@theme/CodeBlock'; - - - The source code for this example can be found in our repository at: {"https://github.com/dlt-hub/dlt/tree/devel/docs/examples/" + props.slug}. - - -## TLDR -
{props.intro}
- -## Setup: Running this example on your machine - -{`# clone the dlt repository -git clone git@github.com:dlt-hub/dlt.git -# go to example directory -cd ./dlt/docs/examples/${props.slug} -# install dlt with ${props.destination} -pip install "dlt[${props.destination}]" -# run the example script -python ${props.run_file}.py`} - diff --git a/docs/website/docs/examples/chess_production/code/.dlt/config.toml b/docs/website/docs/examples/chess_production/code/.dlt/config.toml deleted file mode 100644 index be627e6c11..0000000000 --- a/docs/website/docs/examples/chess_production/code/.dlt/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -# @@@DLT_SNIPPET_START example -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/chess_production/code/chess-snippets.py b/docs/website/docs/examples/chess_production/code/chess-snippets.py deleted file mode 100644 index 39ddf14836..0000000000 --- a/docs/website/docs/examples/chess_production/code/chess-snippets.py +++ /dev/null @@ -1,171 +0,0 @@ -from tests.utils import skipifgithubfork - - -@skipifgithubfork -def incremental_snippet() -> None: - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START markdown_source - import threading - from typing import Any, Iterator - - import dlt - from dlt.common import sleep - from dlt.common.typing import StrAny, TDataItems - from dlt.sources.helpers.requests import client - - @dlt.source - def chess( - chess_url: str = dlt.config.value, - title: str = "GM", - max_players: int = 2, - year: int = 2022, - month: int = 10, - ) -> Any: - def _get_data_with_retry(path: str) -> StrAny: - r = client.get(f"{chess_url}{path}") - return r.json() # type: ignore - - @dlt.resource(write_disposition="replace") - def players() -> Iterator[TDataItems]: - # return players one by one, you could also return a list - # that would be faster but we want to pass players item by item to the transformer - yield from _get_data_with_retry(f"titled/{title}")["players"][:max_players] - - # this resource takes data from players and returns profiles - # it uses `paralellized` flag to enable parallel run in thread pool. 
- @dlt.transformer(data_from=players, write_disposition="replace", parallelized=True) - def players_profiles(username: Any) -> TDataItems: - print(f"getting {username} profile via thread {threading.current_thread().name}") - sleep(1) # add some latency to show parallel runs - return _get_data_with_retry(f"player/{username}") - - # this resource takes data from players and returns games for the last month - # if not specified otherwise - @dlt.transformer(data_from=players, write_disposition="append") - def players_games(username: Any) -> Iterator[TDataItems]: - # https://api.chess.com/pub/player/{username}/games/{YYYY}/{MM} - path = f"player/{username}/games/{year:04d}/{month:02d}" - yield _get_data_with_retry(path)["games"] - - return players(), players_profiles, players_games - - # @@@DLT_SNIPPET_END markdown_source - - # @@@DLT_SNIPPET_START markdown_retry_cm - from tenacity import ( - Retrying, - retry_if_exception, - stop_after_attempt, - wait_exponential, - ) - - from dlt.common import logger - from dlt.common.runtime.slack import send_slack_message - from dlt.pipeline.helpers import retry_load - - MAX_PLAYERS = 5 - - def load_data_with_retry(pipeline, data): - try: - for attempt in Retrying( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1.5, min=4, max=10), - retry=retry_if_exception(retry_load(())), - reraise=True, - ): - with attempt: - logger.info( - f"Running the pipeline, attempt={attempt.retry_state.attempt_number}" - ) - load_info = pipeline.run(data) - logger.info(str(load_info)) - - # raise on failed jobs - load_info.raise_on_failed_jobs() - # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" - ) - except Exception: - # we get here after all the failed retries - # send notification - send_slack_message(pipeline.runtime_config.slack_incoming_hook, "Something went wrong!") - raise - - # we get here after a successful attempt - # see when load was started - logger.info(f"Pipeline was started: {load_info.started_at}") - # print the information on the first load package and all jobs inside - logger.info(f"First load package info: {load_info.load_packages[0]}") - # print the information on the first completed job in first load package - logger.info( - f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}" - ) - - # check for schema updates: - schema_updates = [p.schema_update for p in load_info.load_packages] - # send notifications if there are schema updates - if schema_updates: - # send notification - send_slack_message(pipeline.runtime_config.slack_incoming_hook, "Schema was updated!") - - # To run simple tests with `sql_client`, such as checking table counts and - # warning if there is no data, you can use the `execute_query` method - with pipeline.sql_client() as client: - with client.execute_query("SELECT COUNT(*) FROM players") as cursor: - count = cursor.fetchone()[0] - if count == 0: - logger.info("Warning: No data in players table") - else: - logger.info(f"Players table contains {count} rows") - assert count == MAX_PLAYERS # @@@DLT_REMOVE - - # To run simple tests with `normalize_info`, such as checking table counts and - # warning if there is no data, you can use the `row_counts` attribute. 
- normalize_info = pipeline.last_trace.last_normalize_info - count = normalize_info.row_counts.get("players", 0) - if count == 0: - logger.info("Warning: No data in players table") - else: - logger.info(f"Players table contains {count} rows") - assert count == MAX_PLAYERS # @@@DLT_REMOVE - - # we reuse the pipeline instance below and load to the same dataset as data - logger.info("Saving the load info in the destination") - pipeline.run([load_info], table_name="_load_info") - assert "_load_info" in pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - # save trace to destination, sensitive data will be removed - logger.info("Saving the trace in the destination") - pipeline.run([pipeline.last_trace], table_name="_trace") - assert "_trace" in pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - - # print all the new tables/columns in - for package in load_info.load_packages: - for table_name, table in package.schema_update.items(): - logger.info(f"Table {table_name}: {table.get('description')}") - for column_name, column in table["columns"].items(): - logger.info(f"\tcolumn {column_name}: {column['data_type']}") - - # save the new tables and column schemas to the destination: - table_updates = [p.asdict()["tables"] for p in load_info.load_packages] - pipeline.run(table_updates, table_name="_new_tables") - assert "_new_tables" in pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - - return load_info - - # @@@DLT_SNIPPET_END markdown_retry_cm - - # @@@DLT_SNIPPET_START markdown_pipeline - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - # create dlt pipeline - pipeline = dlt.pipeline( - pipeline_name="chess_pipeline", - destination="duckdb", - dataset_name="chess_data", - ) - # get data for a few famous players - data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS) - load_data_with_retry(pipeline, data) - # @@@DLT_SNIPPET_END markdown_pipeline - # @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md deleted file mode 100644 index 704ea4012f..0000000000 --- a/docs/website/docs/examples/chess_production/index.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: Run chess pipeline in production -description: Learn how run chess pipeline in production -keywords: [incremental loading, example] ---- - -import Header from '../_examples-header.md'; - -
- -## Run chess pipeline in production - -In this example, you'll find a Python script that interacts with the Chess API to extract players and game data. - -We'll learn how to: - -- Inspecting packages after they have been loaded. -- Loading back load information, schema updates, and traces. -- Triggering notifications in case of schema evolution. -- Using context managers to independently retry pipeline stages. -- Run basic tests utilizing `sql_client` and `normalize_info`. - -### Init chess source - - - -### Using context managers to retry pipeline stages separately - - - -:::warning -To run this example you need to provide Slack incoming hook in `.dlt/secrets.toml`: -```py -[runtime] -slack_incoming_hook="https://hooks.slack.com/services/***" -``` -Read [Using Slack to send messages.](https://dlthub.com/docs/running-in-production/running#using-slack-to-send-messages) -::: - -### Run the pipeline - - - diff --git a/docs/website/docs/examples/connector_x_arrow/__init__.py b/docs/website/docs/examples/connector_x_arrow/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/connector_x_arrow/code/__init__.py b/docs/website/docs/examples/connector_x_arrow/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py b/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py deleted file mode 100644 index db96efab86..0000000000 --- a/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py +++ /dev/null @@ -1,52 +0,0 @@ -def connector_x_snippet() -> None: - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START markdown_source - import connectorx as cx - - import dlt - from dlt.sources.credentials import ConnectionStringCredentials - - def read_sql_x( - conn_str: ConnectionStringCredentials = dlt.secrets.value, - query: str = dlt.config.value, - ): - yield cx.read_sql( - conn_str.to_native_representation(), - query, - return_type="arrow2", - protocol="binary", - ) - - def genome_resource(): - # create genome resource with merge on `upid` primary key - genome = dlt.resource( - name="genome", - write_disposition="merge", - primary_key="upid", - standalone=True, - )(read_sql_x)( - "mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type] - "SELECT * FROM genome ORDER BY created LIMIT 1000", - ) - # add incremental on created at - genome.apply_hints(incremental=dlt.sources.incremental("created")) - return genome - - # @@@DLT_SNIPPET_END markdown_source - - # @@@DLT_SNIPPET_START markdown_pipeline - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - pipeline = dlt.pipeline(destination="duckdb") - genome = genome_resource() - - print(pipeline.run(genome)) - print(pipeline.last_trace.last_normalize_info) - # NOTE: run pipeline again to see that no more records got loaded thanks to incremental loading - # @@@DLT_SNIPPET_END markdown_pipeline - - # check that stuff was loaded # @@@DLT_REMOVE - row_counts = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - assert row_counts["genome"] == 1000 # @@@DLT_REMOVE - - # @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/connector_x_arrow/index.md b/docs/website/docs/examples/connector_x_arrow/index.md deleted file mode 100644 index 95775962d8..0000000000 --- a/docs/website/docs/examples/connector_x_arrow/index.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: Load mysql table with ConnectorX & Arrow 
-description: Load data from sql queries fast with connector x and arrow tables -keywords: [connector x, pyarrow, zero copy] ---- - -import Header from '../_examples-header.md'; - -
- -## Load mysql table with ConnectorX and Arrow - -Example script below takes genome data from public **mysql** instance and then loads it into **duckdb**. Mind that your destination -must support loading of parquet files as this is the format that `dlt` uses to save arrow tables. [Connector X](https://github.com/sfu-db/connector-x) allows to -get data from several popular databases and creates in memory Arrow table which `dlt` then saves to load package and loads to the destination. -:::tip -You can yield several tables if your data is large and you need to partition your load. -::: - -We'll learn: - -- How to get arrow tables from [connector X](https://github.com/sfu-db/connector-x) and yield them. -- That merge and incremental loads work with arrow tables. -- How to enable [incremental loading](../../general-usage/incremental-loading) for efficient data extraction. -- How to use build in ConnectionString credentials. - - - -### Loading code - - - - -Run the pipeline: - - - diff --git a/docs/website/docs/examples/custom_destination_bigquery/__init__.py b/docs/website/docs/examples/custom_destination_bigquery/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml b/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml deleted file mode 100644 index be627e6c11..0000000000 --- a/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -# @@@DLT_SNIPPET_START example -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml b/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml deleted file mode 100644 index 71f41f9878..0000000000 --- a/docs/website/docs/examples/custom_destination_bigquery/code/.dlt/example.secrets.toml +++ /dev/null @@ -1,10 +0,0 @@ -# @@@DLT_SNIPPET_START example -[destination.bigquery.credentials] -client_email = "" -private_key = "" -project_id = "" -token_uri = "" -refresh_token = "" -client_id = "" -client_secret = "" -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/__init__.py b/docs/website/docs/examples/custom_destination_bigquery/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py b/docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py deleted file mode 100644 index 16ff9c22b8..0000000000 --- a/docs/website/docs/examples/custom_destination_bigquery/code/custom_destination_bigquery-snippets.py +++ /dev/null @@ -1,81 +0,0 @@ -from tests.utils import skipifgithubfork -from tests.pipeline.utils import assert_load_info - - -@skipifgithubfork -def custom_destination_biquery_snippet() -> None: - # @@@DLT_SNIPPET_START example - import dlt - import pandas as pd - import pyarrow as pa - from google.cloud import bigquery - - from dlt.common.configuration.specs import GcpServiceAccountCredentials - - # constants - OWID_DISASTERS_URL = ( - "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/" - "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/" - "Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" - ) - # this table needs to be manually created in your gc account - # format: "your-project.your_dataset.your_table" - 
BIGQUERY_TABLE_ID = "chat-analytics-rasa-ci.ci_streaming_insert.natural-disasters" - - # dlt sources - @dlt.resource(name="natural_disasters") - def resource(url: str): - # load pyarrow table with pandas - table = pa.Table.from_pandas(pd.read_csv(url)) - # we add a list type column to demontrate bigquery lists - table = table.append_column( - "tags", - pa.array( - [["disasters", "earthquakes", "floods", "tsunamis"]] * len(table), - pa.list_(pa.string()), - ), - ) - # we add a struct type column to demonstrate bigquery structs - table = table.append_column( - "meta", - pa.array( - [{"loaded_by": "dlt"}] * len(table), - pa.struct([("loaded_by", pa.string())]), - ), - ) - yield table - - # dlt biquery custom destination - # we can use the dlt provided credentials class - # to retrieve the gcp credentials from the secrets - @dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) - def bigquery_insert( - items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value - ) -> None: - client = bigquery.Client( - credentials.project_id, credentials.to_native_credentials(), location="US" - ) - job_config = bigquery.LoadJobConfig( - autodetect=True, - source_format=bigquery.SourceFormat.PARQUET, - schema_update_options=bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, - ) - # since we have set the batch_size to 0, we get a filepath and can load the file directly - with open(items, "rb") as f: - load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) - load_job.result() # Waits for the job to complete. - - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - # run the pipeline and print load results - pipeline = dlt.pipeline( - pipeline_name="csv_to_bigquery_insert", - destination=bigquery_insert, - dataset_name="mydata", - full_refresh=True, - ) - load_info = pipeline.run(resource(url=OWID_DISASTERS_URL)) - - print(load_info) - # @@@DLT_SNIPPET_END example - assert_load_info(load_info) diff --git a/docs/website/docs/examples/custom_destination_bigquery/index.md b/docs/website/docs/examples/custom_destination_bigquery/index.md deleted file mode 100644 index 9161e3e0d1..0000000000 --- a/docs/website/docs/examples/custom_destination_bigquery/index.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: Custom destination with BigQuery -description: Learn how use the custom destination to load to bigquery and use credentials -keywords: [destination, credentials, example, bigquery, custom destination] ---- - -import Header from '../_examples-header.md'; - -
- -## Custom destination BigQuery pipeline - -In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the `dlt` library. - -We'll learn how to: -- use [built-in credentials](../../general-usage/credentials/config_specs#gcp-credentials) -- use the [custom destination](../../dlt-ecosystem/destinations/destination.md) -- Use pyarrow tables to create complex column types on bigquery -- Use bigquery autodetect=True for schema inference from parquet files - -### Your bigquery credentials in secrets.toml - - -### Pipeline code - - diff --git a/docs/website/docs/examples/google_sheets/__init__.py b/docs/website/docs/examples/google_sheets/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/google_sheets/code/.dlt/config.toml b/docs/website/docs/examples/google_sheets/code/.dlt/config.toml deleted file mode 100644 index be627e6c11..0000000000 --- a/docs/website/docs/examples/google_sheets/code/.dlt/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -# @@@DLT_SNIPPET_START example -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml b/docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml deleted file mode 100644 index cae98dc492..0000000000 --- a/docs/website/docs/examples/google_sheets/code/.dlt/example.secrets.toml +++ /dev/null @@ -1,18 +0,0 @@ -# @@@DLT_SNIPPET_START example -# you can just paste services.json as credentials -[sources.google_sheets] -credentials=''' -{ - "type": "set me up!", - "project_id": "set me up!", - "private_key_id": "set me up!", - "private_key": "set me up!", - "client_email": "set me up!", - "client_id": "set me up!", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url": "set me up!" - } -''' -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/google_sheets/code/__init__.py b/docs/website/docs/examples/google_sheets/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py b/docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py deleted file mode 100644 index f56861e9e9..0000000000 --- a/docs/website/docs/examples/google_sheets/code/google_sheets-snippets.py +++ /dev/null @@ -1,88 +0,0 @@ -from tests.utils import skipifgithubfork - -__source_name__ = "google_sheets" - - -@skipifgithubfork -def google_sheets_snippet() -> None: - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START google_sheets - from typing import Any, Iterator, Sequence, Union, cast - - from googleapiclient.discovery import build - - import dlt - from dlt.common.configuration.specs import ( - GcpOAuthCredentials, - GcpServiceAccountCredentials, - ) - from dlt.common.typing import DictStrAny, StrAny - - def _initialize_sheets( - credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials] - ) -> Any: - # Build the service object. 
- service = build("sheets", "v4", credentials=credentials.to_native_credentials()) - return service - - @dlt.source - def google_spreadsheet( - spreadsheet_id: str, - sheet_names: Sequence[str], - credentials: Union[ - GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny - ] = dlt.secrets.value, - ) -> Any: - sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials)) - - def get_sheet(sheet_name: str) -> Iterator[DictStrAny]: - # get list of list of typed values - result = ( - sheets.spreadsheets() - .values() - .get( - spreadsheetId=spreadsheet_id, - range=sheet_name, - # unformatted returns typed values - valueRenderOption="UNFORMATTED_VALUE", - # will return formatted dates - dateTimeRenderOption="FORMATTED_STRING", - ) - .execute() - ) - - # pprint.pprint(result) - values = result.get("values") - - # yield dicts assuming row 0 contains headers and following rows values and all rows have identical length - for v in values[1:]: - yield {h: v for h, v in zip(values[0], v)} - - # create resources from supplied sheet names - return [ - dlt.resource(get_sheet(name), name=name, write_disposition="replace") - for name in sheet_names - ] - - # @@@DLT_SNIPPET_END google_sheets - # @@@DLT_SNIPPET_START google_sheets_run - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - pipeline = dlt.pipeline(destination="duckdb") - # see example.secrets.toml to where to put credentials - sheet_id = "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580" - range_names = ["hidden_columns_merged_cells", "Blank Columns"] - # "2022-05", "model_metadata" - info = pipeline.run( - google_spreadsheet( - spreadsheet_id=sheet_id, - sheet_names=range_names, - ) - ) - print(info) - # @@@DLT_SNIPPET_END google_sheets_run - # @@@DLT_SNIPPET_END example - row_counts = pipeline.last_trace.last_normalize_info.row_counts - print(row_counts.keys()) - assert row_counts["hidden_columns_merged_cells"] == 7 - assert row_counts["blank_columns"] == 21 diff --git a/docs/website/docs/examples/google_sheets/index.md b/docs/website/docs/examples/google_sheets/index.md deleted file mode 100644 index d14636a508..0000000000 --- a/docs/website/docs/examples/google_sheets/index.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Google Sheets minimal example -description: Learn how work with Google services -keywords: [google sheets, credentials, example] ---- - -import Header from '../_examples-header.md'; - -
- -## Google Sheets data pipeline - -In this example, you'll find a Python script that demonstrates how to load Google Sheets data using the `dlt` library. - -We'll learn how to: -- use [built-in credentials](../../general-usage/credentials/config_specs#gcp-credentials); -- use [union of credentials](../../general-usage/credentials/config_specs#working-with-alternatives-of-credentials-union-types); -- create [dynamically generated resources](../../general-usage/source#create-resources-dynamically). - -:::tip -This example is for educational purposes. For best practices, we recommend using [Google Sheets verified source](../../dlt-ecosystem/verified-sources/google_sheets.md). -::: - -### Install Google client library - -```sh - pip install google-api-python-client -``` - -### Loading code - - - - -### Run the pipeline - - - diff --git a/docs/website/docs/examples/incremental_loading/__init__.py b/docs/website/docs/examples/incremental_loading/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/incremental_loading/code/.dlt/config.toml b/docs/website/docs/examples/incremental_loading/code/.dlt/config.toml deleted file mode 100644 index be627e6c11..0000000000 --- a/docs/website/docs/examples/incremental_loading/code/.dlt/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -# @@@DLT_SNIPPET_START example -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/incremental_loading/code/.dlt/example.secrets.toml b/docs/website/docs/examples/incremental_loading/code/.dlt/example.secrets.toml deleted file mode 100644 index caf8d523c4..0000000000 --- a/docs/website/docs/examples/incremental_loading/code/.dlt/example.secrets.toml +++ /dev/null @@ -1,6 +0,0 @@ -# @@@DLT_SNIPPET_START example -[sources.zendesk.credentials] -password = "" -subdomain = "" -email = "" -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/incremental_loading/code/__init__.py b/docs/website/docs/examples/incremental_loading/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py deleted file mode 100644 index 05ea18cb9e..0000000000 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ /dev/null @@ -1,143 +0,0 @@ -from tests.utils import skipifgithubfork - -# because the example below uses credentials and it is copied to the module zendesk.py -# we force the same config section name -__source_name__ = "zendesk" - - -@skipifgithubfork -def incremental_snippet() -> None: - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START markdown_source - from typing import Optional, Dict, Any, Tuple - - import dlt - from dlt.common import pendulum - from dlt.common.time import ensure_pendulum_datetime - from dlt.common.typing import TAnyDateTime - from dlt.sources.helpers.requests import client - - @dlt.source(max_table_nesting=2) - def zendesk_support( - credentials: Dict[str, str] = dlt.secrets.value, - start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008 - year=2000, month=1, day=1 - ), - end_date: Optional[TAnyDateTime] = None, - ): - """ - Retrieves data from Zendesk Support for tickets events. - - Args: - credentials: Zendesk credentials (default: dlt.secrets.value) - start_date: Start date for data extraction (default: 2000-01-01) - end_date: End date for data extraction (default: None). 
- If end time is not provided, the incremental loading will be - enabled, and after the initial run, only new data will be retrieved. - - Returns: - DltResource. - """ - # Convert start_date and end_date to Pendulum datetime objects - start_date_obj = ensure_pendulum_datetime(start_date) - end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None - - # Convert Pendulum datetime objects to Unix timestamps - start_date_ts = start_date_obj.int_timestamp - end_date_ts: Optional[int] = None - if end_date_obj: - end_date_ts = end_date_obj.int_timestamp - - # Extract credentials from secrets dictionary - auth = (credentials["email"], credentials["password"]) - subdomain = credentials["subdomain"] - url = f"https://{subdomain}.zendesk.com" - - # we use `append` write disposition, because objects in ticket_events endpoint are never updated - # so we do not need to merge - # we set primary_key so allow deduplication of events by the `incremental` below in the rare case - # when two events have the same timestamp - @dlt.resource(primary_key="id", write_disposition="append") - def ticket_events( - timestamp: dlt.sources.incremental[int] = dlt.sources.incremental( - "timestamp", - initial_value=start_date_ts, - end_value=end_date_ts, - allow_external_schedulers=True, - ), - ): - # URL For ticket events - # 'https://d3v-dlthub.zendesk.com/api/v2/incremental/ticket_events.json?start_time=946684800' - event_pages = get_pages( - url=url, - endpoint="/api/v2/incremental/ticket_events.json", - auth=auth, - data_point_name="ticket_events", - params={"start_time": timestamp.last_value}, - ) - for page in event_pages: - yield page - # stop loading when using end_value and end is reached. - # unfortunately, Zendesk API does not have the "end_time" parameter, so we stop iterating ourselves - if timestamp.end_out_of_range: - return - - return ticket_events - - # @@@DLT_SNIPPET_END markdown_source - - def get_pages( - url: str, - endpoint: str, - auth: Tuple[str, str], - data_point_name: str, - params: Optional[Dict[str, Any]] = None, - ): - """ - Makes a request to a paginated endpoint and returns a generator of data items per page. - - Args: - url: The base URL. - endpoint: The url to the endpoint, e.g. /api/v2/calls - auth: Credentials for authentication. - data_point_name: The key which data items are nested under in the response object (e.g. calls) - params: Optional dict of query params to include in the request. - - Returns: - Generator of pages, each page is a list of dict data items. 
- """ - # update the page size to enable cursor pagination - params = params or {} - params["per_page"] = 1000 - headers = None - - # make request and keep looping until there is no next page - get_url = f"{url}{endpoint}" - while get_url: - response = client.get(get_url, headers=headers, auth=auth, params=params) - response.raise_for_status() - response_json = response.json() - result = response_json[data_point_name] - yield result - - get_url = None - # See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format - if not response_json["end_of_stream"]: - get_url = response_json["next_page"] - - # @@@DLT_SNIPPET_START markdown_pipeline - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - # create dlt pipeline - pipeline = dlt.pipeline( - pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data" - ) - - load_info = pipeline.run(zendesk_support()) - print(load_info) - # @@@DLT_SNIPPET_END markdown_pipeline - # @@@DLT_SNIPPET_END example - - # check that stuff was loaded - row_counts = pipeline.last_trace.last_normalize_info.row_counts - assert row_counts["ticket_events"] == 17 diff --git a/docs/website/docs/examples/incremental_loading/index.md b/docs/website/docs/examples/incremental_loading/index.md deleted file mode 100644 index c4f6278471..0000000000 --- a/docs/website/docs/examples/incremental_loading/index.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Load Zendesk tickets incrementally -description: Learn how do incremental loading in consecutive runs -keywords: [incremental loading, example] ---- - -import Header from '../_examples-header.md'; - -
- -## Incremental loading with the Zendesk API - -In this example, you'll find a Python script that interacts with the Zendesk Support API to extract ticket events data. - -We'll learn: - -- How to pass [credentials](../../general-usage/credentials) as dict and how to type the `@dlt.source` function arguments. -- How to set [the nesting level](../../general-usage/source#reduce-the-nesting-level-of-generated-tables). -- How to enable [incremental loading](../../general-usage/incremental-loading) for efficient data extraction. -- How to specify [the start and end dates](../../general-usage/incremental-loading#using-dltsourcesincremental-for-backfill) for the data loading and how to [opt-in to Airflow scheduler](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) by setting `allow_external_schedulers` to `True`. -- How to work with timestamps, specifically converting them to Unix timestamps for incremental data extraction. -- How to use the `start_time` parameter in API requests to retrieve data starting from a specific timestamp. - - -### Loading code - - - - -Run the pipeline: - - - - - diff --git a/docs/website/docs/examples/nested_data/__init__.py b/docs/website/docs/examples/nested_data/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/nested_data/code/.dlt/config.toml b/docs/website/docs/examples/nested_data/code/.dlt/config.toml deleted file mode 100644 index be627e6c11..0000000000 --- a/docs/website/docs/examples/nested_data/code/.dlt/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -# @@@DLT_SNIPPET_START example -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/nested_data/code/.dlt/example.secrets.toml b/docs/website/docs/examples/nested_data/code/.dlt/example.secrets.toml deleted file mode 100644 index d754142392..0000000000 --- a/docs/website/docs/examples/nested_data/code/.dlt/example.secrets.toml +++ /dev/null @@ -1,4 +0,0 @@ -# @@@DLT_SNIPPET_START example -[sources.mongodb] -connection_url="" -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/nested_data/code/__init__.py b/docs/website/docs/examples/nested_data/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/nested_data/code/nested_data-snippets.py b/docs/website/docs/examples/nested_data/code/nested_data-snippets.py deleted file mode 100644 index e67cf33cd0..0000000000 --- a/docs/website/docs/examples/nested_data/code/nested_data-snippets.py +++ /dev/null @@ -1,156 +0,0 @@ -from tests.utils import skipifgithubfork - -__source_name__ = "mongodb" - - -@skipifgithubfork -def nested_data_snippet() -> None: - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START nested_data - from itertools import islice - from typing import Any, Dict, Iterator, Optional - - from bson.decimal128 import Decimal128 - from bson.objectid import ObjectId - from pendulum import _datetime - from pymongo import MongoClient - - import dlt - from dlt.common.time import ensure_pendulum_datetime - from dlt.common.typing import TDataItem - from dlt.common.utils import map_nested_in_place - - CHUNK_SIZE = 10000 - - # You can limit how deep dlt goes when generating child tables. - # By default, the library will descend and generate child tables - # for all nested lists, without a limit. - # In this example, we specify that we only want to generate child tables up to level 2, - # so there will be only one level of child tables within child tables. 
- @dlt.source(max_table_nesting=2) - def mongodb_collection( - connection_url: str = dlt.secrets.value, - database: Optional[str] = dlt.config.value, - collection: str = dlt.config.value, - incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg] - write_disposition: Optional[str] = dlt.config.value, - ) -> Any: - # set up mongo client - client: Any = MongoClient(connection_url, uuidRepresentation="standard", tz_aware=True) - mongo_database = client.get_default_database() if not database else client[database] - collection_obj = mongo_database[collection] - - def collection_documents( - client: Any, - collection: Any, - incremental: Optional[dlt.sources.incremental[Any]] = None, - ) -> Iterator[TDataItem]: - LoaderClass = CollectionLoader - - loader = LoaderClass(client, collection, incremental=incremental) - yield from loader.load_documents() - - return dlt.resource( # type: ignore - collection_documents, - name=collection_obj.name, - primary_key="_id", - write_disposition=write_disposition, - )(client, collection_obj, incremental=incremental) - - # @@@DLT_SNIPPET_END nested_data - - class CollectionLoader: - def __init__( - self, - client: Any, - collection: Any, - incremental: Optional[dlt.sources.incremental[Any]] = None, - ) -> None: - self.client = client - self.collection = collection - self.incremental = incremental - if incremental: - self.cursor_field = incremental.cursor_path - self.last_value = incremental.last_value - else: - self.cursor_column = None - self.last_value = None - - @property - def _filter_op(self) -> Dict[str, Any]: - if not self.incremental or not self.last_value: - return {} - if self.incremental.last_value_func is max: - return {self.cursor_field: {"$gte": self.last_value}} - elif self.incremental.last_value_func is min: - return {self.cursor_field: {"$lt": self.last_value}} - return {} - - def load_documents(self) -> Iterator[TDataItem]: - cursor = self.collection.find(self._filter_op) - while docs_slice := list(islice(cursor, CHUNK_SIZE)): - yield map_nested_in_place(convert_mongo_objs, docs_slice) - - def convert_mongo_objs(value: Any) -> Any: - if isinstance(value, (ObjectId, Decimal128)): - return str(value) - if isinstance(value, _datetime.datetime): - return ensure_pendulum_datetime(value) - return value - - # @@@DLT_SNIPPET_START nested_data_run - - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - # When we created the source, we set max_table_nesting to 2. - # This ensures that the generated tables do not have more than two - # levels of nesting, even if the original data structure is more deeply nested. - pipeline = dlt.pipeline( - pipeline_name="mongodb_pipeline", - destination="duckdb", - dataset_name="unpacked_data", - ) - source_data = mongodb_collection(collection="movies", write_disposition="replace") - load_info = pipeline.run(source_data) - print(load_info) - tables = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - tables.pop("_dlt_pipeline_state") # @@@DLT_REMOVE - assert len(tables) == 7, pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE - - # The second method involves setting the max_table_nesting attribute directly - # on the source data object. - # This allows for dynamic control over the maximum nesting - # level for a specific data source. - # Here the nesting level is adjusted before running the pipeline. 
- pipeline = dlt.pipeline( - pipeline_name="mongodb_pipeline", - destination="duckdb", - dataset_name="not_unpacked_data", - ) - source_data = mongodb_collection(collection="movies", write_disposition="replace") - source_data.max_table_nesting = 0 - load_info = pipeline.run(source_data) - print(load_info) - tables = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - tables.pop("_dlt_pipeline_state") # @@@DLT_REMOVE - assert len(tables) == 1, pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE - - # The third method involves applying data type hints to specific columns in the data. - # In this case, we tell dlt that column 'cast' (containing a list of actors) - # in 'movies' table should have type complex which means - # that it will be loaded as JSON/struct and not as child table. - pipeline = dlt.pipeline( - pipeline_name="mongodb_pipeline", - destination="duckdb", - dataset_name="unpacked_data_without_cast", - ) - source_data = mongodb_collection(collection="movies", write_disposition="replace") - source_data.movies.apply_hints(columns={"cast": {"data_type": "complex"}}) - load_info = pipeline.run(source_data) - print(load_info) - tables = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE - tables.pop("_dlt_pipeline_state") # @@@DLT_REMOVE - assert len(tables) == 6, pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE - - # @@@DLT_SNIPPET_END nested_data_run - # @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/nested_data/index.md b/docs/website/docs/examples/nested_data/index.md deleted file mode 100644 index 0a4c83badb..0000000000 --- a/docs/website/docs/examples/nested_data/index.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: Control nested MongoDB data -description: Learn how control nested data -keywords: [incremental loading, example] ---- - -import Header from '../_examples-header.md'; - -
- -## Control nested data - -In this example, you'll find a Python script that demonstrates how to control nested data using the `dlt` library. - -We'll learn how to: -- [Adjust maximum nesting level in three ways:](../../general-usage/source#reduce-the-nesting-level-of-generated-tables) - - Limit nesting levels with dlt decorator. - - Dynamic nesting level adjustment. - - Apply data type hints. -- Work with [MongoDB](../../dlt-ecosystem/verified-sources/mongodb) in Python and `dlt`. -- Enable [incremental loading](../../general-usage/incremental-loading) for efficient data extraction. - -### Install pymongo - -```sh - pip install pymongo>=4.3.3 -``` - -### Loading code - - - - -### Run the pipeline - - - diff --git a/docs/website/docs/examples/pdf_to_weaviate/__init__.py b/docs/website/docs/examples/pdf_to_weaviate/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/pdf_to_weaviate/code/__init__.py b/docs/website/docs/examples/pdf_to_weaviate/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py b/docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py deleted file mode 100644 index ae61af3746..0000000000 --- a/docs/website/docs/examples/pdf_to_weaviate/code/pdf_to_weaviate-snippets.py +++ /dev/null @@ -1,67 +0,0 @@ -from tests.pipeline.utils import assert_load_info -from tests.utils import skipifgithubfork - - -@skipifgithubfork -def pdf_to_weaviate_snippet() -> None: - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START pdf_to_weaviate - import os - - import dlt - from dlt.destinations.impl.weaviate import weaviate_adapter - from PyPDF2 import PdfReader - - @dlt.resource(selected=False) - def list_files(folder_path: str): - folder_path = os.path.abspath(folder_path) - for filename in os.listdir(folder_path): - file_path = os.path.join(folder_path, filename) - yield { - "file_name": filename, - "file_path": file_path, - "mtime": os.path.getmtime(file_path), - } - - @dlt.transformer(primary_key="page_id", write_disposition="merge") - def pdf_to_text(file_item, separate_pages: bool = False): - if not separate_pages: - raise NotImplementedError() - # extract data from PDF page by page - reader = PdfReader(file_item["file_path"]) - for page_no in range(len(reader.pages)): - # add page content to file item - page_item = dict(file_item) - page_item["text"] = reader.pages[page_no].extract_text() - page_item["page_id"] = file_item["file_name"] + "_" + str(page_no) - yield page_item - - pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") - - # this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" - # (3) sends them to pdf_to_text transformer with pipe (|) operator - pdf_pipeline = list_files("assets/invoices").add_filter( - lambda item: item["file_name"].endswith(".pdf") - ) | pdf_to_text(separate_pages=True) - - # set the name of the destination table to receive pages - # NOTE: Weaviate, dlt's tables are mapped to classes - pdf_pipeline.table_name = "InvoiceText" - - # use weaviate_adapter to tell destination to vectorize "text" column - load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) - row_counts = pipeline.last_trace.last_normalize_info - print(row_counts) - print("------") - print(load_info) - # @@@DLT_SNIPPET_END pdf_to_weaviate - - # @@@DLT_SNIPPET_START pdf_to_weaviate_read - import weaviate 
- - client = weaviate.Client("http://localhost:8080") - # get text of all the invoices in InvoiceText class we just created above - print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) - # @@@DLT_SNIPPET_END pdf_to_weaviate_read - # @@@DLT_SNIPPET_END example - assert_load_info(load_info) diff --git a/docs/website/docs/examples/pdf_to_weaviate/index.md b/docs/website/docs/examples/pdf_to_weaviate/index.md deleted file mode 100644 index 4adb56359d..0000000000 --- a/docs/website/docs/examples/pdf_to_weaviate/index.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -title: Load PDFs to Weaviate -description: Extract text from PDF and load it into a vector database -keywords: [pdf, weaviate, vector store, vector database, ] ---- - -import Header from '../_examples-header.md'; - -
- -Additionally we'll use PyPDF2 to extract text from PDFs. Make sure you have it installed: - -```sh -pip install PyPDF2 -``` - -## Example code - - - - -We start with a simple resource that lists files in specified folder. To that we add a **filter** function that removes all files that are not pdfs. - -To parse PDFs we use [PyPDF](https://pypdf2.readthedocs.io/en/3.0.0/user/extract-text.html) and return each page from a given PDF as separate data item. - -Parsing happens in `@dlt.transformer` which receives data from `list_files` resource. It splits PDF into pages, extracts text and yields pages separately -so each PDF will correspond to many items in Weaviate `InvoiceText` class. We set the primary key and use merge disposition so if the same PDF comes twice -we'll just update the vectors, and not duplicate. - -Look how we pipe data from `list_files` resource (note that resource is deselected so we do not load raw file items to destination) into `pdf_to_text` using **|** operator. - -Just before load, the `weaviate_adapter` is used to tell `weaviate` destination which fields to vectorize. - -Now it is time to query our documents. - - - -Above we provide URL to local cluster. We also use `contextionary` to vectorize data. You may find information on our setup in links below. - -:::tip - -Change the destination to `duckdb` if you do not have access to Weaviate cluster or not able to run it locally. - -::: - -Learn more: - -- [Setup Weaviate destination - local or cluster](dlt-ecosystem/destinations/weaviate.md). -- [Connect the transformers to the resources](general-usage/resource#feeding-data-from-one-resource-into-another) -to load additional data or enrich it. -- [Transform your data before loading](general-usage/resource#customize-resources) and see some - [examples of customizations like column renames and anonymization](general-usage/customising-pipelines/renaming_columns). 
diff --git a/docs/website/docs/examples/qdrant_zendesk/__init__.py b/docs/website/docs/examples/qdrant_zendesk/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/qdrant_zendesk/code/.dlt/example.secrets.toml b/docs/website/docs/examples/qdrant_zendesk/code/.dlt/example.secrets.toml deleted file mode 100644 index 2fc55cc0de..0000000000 --- a/docs/website/docs/examples/qdrant_zendesk/code/.dlt/example.secrets.toml +++ /dev/null @@ -1,10 +0,0 @@ -# @@@DLT_SNIPPET_START example -[destination.qdrant.credentials] -location = "" -api_key = "" - -[sources.zendesk.credentials] -password = "" -subdomain = "" -email = "" -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/qdrant_zendesk/code/__init__.py b/docs/website/docs/examples/qdrant_zendesk/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/qdrant_zendesk/code/qdrant-snippets.py b/docs/website/docs/examples/qdrant_zendesk/code/qdrant-snippets.py deleted file mode 100644 index 07b44c6638..0000000000 --- a/docs/website/docs/examples/qdrant_zendesk/code/qdrant-snippets.py +++ /dev/null @@ -1,191 +0,0 @@ -from tests.utils import skipifgithubfork - -__source_name__ = "zendesk" - - -@skipifgithubfork -def qdrant_snippet(): - # @@@DLT_SNIPPET_START example - # @@@DLT_SNIPPET_START zendesk_conn - from typing import Optional, Dict, Any, Tuple - - import dlt - from dlt.common import pendulum - from dlt.common.time import ensure_pendulum_datetime - from dlt.common.typing import TAnyDateTime - from dlt.sources.helpers.requests import client - from dlt.destinations.adapters import qdrant_adapter - from qdrant_client import QdrantClient - - from dlt.common.configuration.inject import with_config - - # function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk - @dlt.source(max_table_nesting=2) - def zendesk_support( - credentials: Dict[str, str] = dlt.secrets.value, - start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008 - year=2000, month=1, day=1 - ), - end_date: Optional[TAnyDateTime] = None, - ): - """ - Retrieves data from Zendesk Support for tickets events. - - Args: - credentials: Zendesk credentials (default: dlt.secrets.value) - start_date: Start date for data extraction (default: 2000-01-01) - end_date: End date for data extraction (default: None). - If end time is not provided, the incremental loading will be - enabled, and after the initial run, only new data will be retrieved. - - Returns: - DltResource. 
- """ - # Convert start_date and end_date to Pendulum datetime objects - start_date_obj = ensure_pendulum_datetime(start_date) - end_date_obj = ensure_pendulum_datetime(end_date) if end_date else None - - # Extract credentials from secrets dictionary - auth = (credentials["email"], credentials["password"]) - subdomain = credentials["subdomain"] - url = f"https://{subdomain}.zendesk.com" - - # we use `append` write disposition, because objects in tickets_data endpoint are never updated - # so we do not need to merge - # we set primary_key so allow deduplication of events by the `incremental` below in the rare case - # when two events have the same timestamp - @dlt.resource(primary_key="id", write_disposition="append") - def tickets_data( - updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental( - "updated_at", - initial_value=start_date_obj, - end_value=end_date_obj, - allow_external_schedulers=True, - ) - ): - # URL For ticket events - # 'https://d3v-dlthub.zendesk.com/api/v2/incremental/tickets_data.json?start_time=946684800' - event_pages = get_pages( - url=url, - endpoint="/api/v2/incremental/tickets", - auth=auth, - data_point_name="tickets", - params={"start_time": updated_at.last_value.int_timestamp}, - ) - for page in event_pages: - yield ([_fix_date(ticket) for ticket in page]) - - # stop loading when using end_value and end is reached. - # unfortunately, Zendesk API does not have the "end_time" parameter, so we stop iterating ourselves - if updated_at.end_out_of_range: - return - - return tickets_data - - # @@@DLT_SNIPPET_END zendesk_conn - - # helper function to fix the datetime format - def _parse_date_or_none(value: Optional[str]) -> Optional[pendulum.DateTime]: - if not value: - return None - return ensure_pendulum_datetime(value) - - # modify dates to return datetime objects instead - def _fix_date(ticket): - ticket["updated_at"] = _parse_date_or_none(ticket["updated_at"]) - ticket["created_at"] = _parse_date_or_none(ticket["created_at"]) - ticket["due_at"] = _parse_date_or_none(ticket["due_at"]) - return ticket - - # function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk - def get_pages( - url: str, - endpoint: str, - auth: Tuple[str, str], - data_point_name: str, - params: Optional[Dict[str, Any]] = None, - ): - """ - Makes a request to a paginated endpoint and returns a generator of data items per page. - - Args: - url: The base URL. - endpoint: The url to the endpoint, e.g. /api/v2/calls - auth: Credentials for authentication. - data_point_name: The key which data items are nested under in the response object (e.g. calls) - params: Optional dict of query params to include in the request. - - Returns: - Generator of pages, each page is a list of dict data items. 
- """ - # update the page size to enable cursor pagination - params = params or {} - params["per_page"] = 1000 - headers = None - - # make request and keep looping until there is no next page - get_url = f"{url}{endpoint}" - while get_url: - response = client.get(get_url, headers=headers, auth=auth, params=params) - response.raise_for_status() - response_json = response.json() - result = response_json[data_point_name] - yield result - - get_url = None - # See https://developer.zendesk.com/api-reference/ticketing/ticket-management/incremental_exports/#json-format - if not response_json["end_of_stream"]: - get_url = response_json["next_page"] - - # @@@DLT_SNIPPET_START main_code - __name__ = "__main__" # @@@DLT_REMOVE - if __name__ == "__main__": - # create a pipeline with an appropriate name - pipeline = dlt.pipeline( - pipeline_name="qdrant_zendesk_pipeline", - destination="qdrant", - dataset_name="zendesk_data", - ) - - # run the dlt pipeline and save info about the load process - load_info = pipeline.run( - # here we use a special function to tell Qdrant which fields to embed - qdrant_adapter( - zendesk_support(), # retrieve tickets data - embed=["subject", "description"], - ) - ) - - print(load_info) - - # @@@DLT_SNIPPET_END main_code - - # @@@DLT_SNIPPET_START declare_qdrant_client - # running the Qdrant client to connect to your Qdrant database - - @with_config(sections=("destination", "qdrant", "credentials")) - def get_qdrant_client(location=dlt.secrets.value, api_key=dlt.secrets.value): - return QdrantClient( - url=location, - api_key=api_key, - ) - - # running the Qdrant client to connect to your Qdrant database - qdrant_client = get_qdrant_client() - - # view Qdrant collections you'll find your dataset here: - print(qdrant_client.get_collections()) - # @@@DLT_SNIPPET_END declare_qdrant_client - - # @@@DLT_SNIPPET_START get_response - # query Qdrant with prompt: getting tickets info close to "cancellation" - response = qdrant_client.query( - "zendesk_data_content", # collection/dataset name with the 'content' suffix -> tickets content table - query_text=["cancel", "cancel subscription"], # prompt to search - limit=3, # limit the number of results to the nearest 3 embeddings - ) - # @@@DLT_SNIPPET_END get_response - - assert len(response) <= 3 and len(response) > 0 # @@@DLT_REMOVE - - # @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/qdrant_zendesk/index.md b/docs/website/docs/examples/qdrant_zendesk/index.md deleted file mode 100644 index a4bdaca4ea..0000000000 --- a/docs/website/docs/examples/qdrant_zendesk/index.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -title: Similarity Searching with Qdrant -description: Learn how to use the dlt source, Zendesk and dlt destination, Qdrant to conduct a similarity search on your tickets data. -keywords: [similarity search, example] ---- -import Header from '../_examples-header.md'; - -
- -This article outlines a system to map vectorized ticket data from Zendesk to Qdrant, similar to our guide on the topic concerning [Weaviate](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant). In this example, we will: -- Connect to our [Zendesk source](https://dlthub.com/docs/dlt-ecosystem/verified-sources/zendesk). -- Extract tickets data from our Zendesk source. -- [Create a dlt pipeline](https://dlthub.com/docs/walkthroughs/create-a-pipeline) with Qdrant as destination. -- Vectorize/embed the tickets data from Zendesk. -- Pass the vectorized data to be stored in Qdrant via the dlt pipeline. -- Query data that we stored in Qdrant. -- Explore the similarity search results. - -First, configure the destination credentials for [Qdrant](https://dlthub.com/docs/dlt-ecosystem/destinations/qdrant#setup-guide) and [Zendesk](https://dlthub.com/docs/walkthroughs/zendesk-weaviate#configuration) in `.dlt/secrets.toml`. - -Next, make sure you have the following dependencies installed: - -```sh -pip install qdrant-client>=1.6.9 -pip install fastembed>=0.1.1 -``` - -## Connect to Zendesk and load tickets data - - - - -## Inititating a pipeline with Qdrant - - -## Querying the data - - - - - - - -The query above gives stores the following results in the `response` variable: -```py -[QueryResponse(id='6aeacd21-b3d0-5174-97ef-5aaa59486414', embedding=None, metadata={'_dlt_id': 'Nx3wBiL29xTgaQ', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-09-01T11:19:25+00:00', 'custom_status_id': 12765028278545, 'description': 'I have been trying to cancel my subscription but the system won’t let me do it. Can you please help?', 'from_messaging_channel': False, 'generated_timestamp': 1693567167, 'group_id': 12765036328465, 'has_incidents': False, 'id': 12, 'is_public': True, 'organization_id': 12765041119505, 'raw_subject': 'Unable to Cancel Subscription', 'requester_id': 12765072569105, 'status': 'open', 'subject': 'Unable to Cancel Subscription', 'submitter_id': 12765072569105, 'tags': ['test1'], 'test_field': 'test1', 'ticket_form_id': 12765054772497, 'updated_at': '2023-09-01T11:19:25+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/12.json', 'via__channel': 'web'}, document='', score=0.89545774), - QueryResponse(id='a22189c1-70ab-5421-938b-1caae3e7d6d8', embedding=None, metadata={'_dlt_id': 'bc/xloksL89EUg', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 12765073054225, 'created_at': '2023-07-18T17:23:42+00:00', 'custom_status_id': 12765028278545, 'description': 'ABCDEF', 'from_messaging_channel': False, 'generated_timestamp': 1689701023, 'group_id': 12765036328465, 'has_incidents': False, 'id': 4, 'is_public': True, 'organization_id': 12765041119505, 'raw_subject': 'What is this ticket', 'requester_id': 12765072569105, 'status': 'open', 'subject': 'What is this ticket', 'submitter_id': 12765072569105, 'tags': ['test1'], 'test_field': 'test1', 'ticket_form_id': 12765054772497, 'updated_at': '2023-07-18T17:23:42+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/4.json', 'via__channel': 'web'}, document='', score=0.8643349), - QueryResponse(id='ce2f1c5c-41c3-56c3-a31d-2399a7a9239d', embedding=None, metadata={'_dlt_id': 'ZMuFJZo0AJxV4A', '_dlt_load_id': '1700130284.002391', 'allow_attachments': True, 'allow_channelback': False, 'assignee_id': 12765072569105, 'brand_id': 
12765073054225, 'created_at': '2023-03-14T10:52:28+00:00', 'custom_status_id': 12765028278545, 'description': 'X', 'from_messaging_channel': False, 'generated_timestamp': 1696163084, 'group_id': 12765036328465, 'has_incidents': False, 'id': 2, 'is_public': True, 'priority': 'high', 'raw_subject': 'SCRUBBED', 'requester_id': 13726460510097, 'status': 'deleted', 'subject': 'SCRUBBED', 'submitter_id': 12765072569105, 'tags': [], 'ticket_form_id': 13726337882769, 'type': 'question', 'updated_at': '2023-09-01T12:10:35+00:00', 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/2.json', 'via__channel': 'web'}, document='', score=0.8467072)] -``` -To get a closer look at what the Zendesk ticket was, and how dlt dealt with it, we can index into the metadata of the first `QueryResponse` object: -```py -{'_dlt_id': 'Nx3wBiL29xTgaQ', - '_dlt_load_id': '1700130284.002391', - 'allow_attachments': True, - 'allow_channelback': False, - 'assignee_id': 12765072569105, - 'brand_id': 12765073054225, - 'created_at': '2023-09-01T11:19:25+00:00', - 'custom_status_id': 12765028278545, - 'description': 'I have been trying to cancel my subscription but the system won’t let me do it. Can you please help?', - 'from_messaging_channel': False, - 'generated_timestamp': 1693567167, - 'group_id': 12765036328465, - 'has_incidents': False, - 'id': 12, - 'is_public': True, - 'organization_id': 12765041119505, - 'raw_subject': 'Unable to Cancel Subscription', - 'requester_id': 12765072569105, - 'status': 'open', - 'subject': 'Unable to Cancel Subscription', - 'submitter_id': 12765072569105, - 'tags': ['test1'], - 'test_field': 'test1', - 'ticket_form_id': 12765054772497, - 'updated_at': '2023-09-01T11:19:25+00:00', - 'url': 'https://d3v-dlthub.zendesk.com/api/v2/tickets/12.json', - 'via__channel': 'web'} -``` \ No newline at end of file diff --git a/docs/website/docs/examples/transformers/__init__.py b/docs/website/docs/examples/transformers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/transformers/code/.dlt/config.toml b/docs/website/docs/examples/transformers/code/.dlt/config.toml deleted file mode 100644 index 07b2c84ad4..0000000000 --- a/docs/website/docs/examples/transformers/code/.dlt/config.toml +++ /dev/null @@ -1,18 +0,0 @@ -# @@@DLT_SNIPPET_START example -[runtime] -log_level="WARNING" - -[extract] -# use 2 workers to extract sources in parallel -worker=2 -# allow 10 async items to be processed in parallel -max_parallel_items=10 - -[normalize] -# use 3 worker processes to process 3 files in parallel -workers=3 - -[load] -# have 50 concurrent load jobs -workers=50 -# @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/transformers/code/__init__.py b/docs/website/docs/examples/transformers/code/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/website/docs/examples/transformers/code/pokemon-snippets.py b/docs/website/docs/examples/transformers/code/pokemon-snippets.py deleted file mode 100644 index ff8757b94e..0000000000 --- a/docs/website/docs/examples/transformers/code/pokemon-snippets.py +++ /dev/null @@ -1,72 +0,0 @@ -def transformers_snippet() -> None: - # @@@DLT_SNIPPET_START example - import dlt - from dlt.sources.helpers import requests - - @dlt.source(max_table_nesting=2) - def source(pokemon_api_url: str): - """""" - - # note that we deselect `pokemon_list` - we do not want it to be loaded - @dlt.resource(write_disposition="replace", selected=False) - def pokemon_list(): - """Retrieve a first page 
of Pokemons and yield it. We do not retrieve all the pages in this example""" - yield requests.get(pokemon_api_url).json()["results"] - - # transformer that retrieves a list of objects in parallel - @dlt.transformer - def pokemon(pokemons): - """Yields details for a list of `pokemons`""" - - # @dlt.defer marks a function to be executed in parallel - # in a thread pool - @dlt.defer - def _get_pokemon(_pokemon): - return requests.get(_pokemon["url"]).json() - - # call and yield the function result normally, the @dlt.defer takes care of parallelism - for _pokemon in pokemons: - yield _get_pokemon(_pokemon) - - # a special case where just one item is retrieved in transformer - # a whole transformer may be marked for parallel execution - @dlt.transformer(parallelized=True) - def species(pokemon_details): - """Yields species details for a pokemon""" - species_data = requests.get(pokemon_details["species"]["url"]).json() - # link back to pokemon so we have a relation in loaded data - species_data["pokemon_id"] = pokemon_details["id"] - # You can return the result instead of yield since the transformer only generates one result - return species_data - - # create two simple pipelines with | operator - # 1. send list of pokemons into `pokemon` transformer to get pokemon details - # 2. send pokemon details into `species` transformer to get species details - # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once - - return (pokemon_list | pokemon, pokemon_list | pokemon | species) - - if __name__ == "__main__": - # build duck db pipeline - pipeline = dlt.pipeline( - pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" - ) - - # the pokemon_list resource does not need to be loaded - load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) - print(load_info) - # @@@DLT_SNIPPET_END example - - # Run without __main__ - pipeline = dlt.pipeline( - pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" - ) - - # the pokemon_list resource does not need to be loaded - load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) - - # test assertions - row_counts = pipeline.last_trace.last_normalize_info.row_counts - assert row_counts["pokemon"] == 20 - assert row_counts["species"] == 20 - assert "pokemon_list" not in row_counts diff --git a/docs/website/docs/examples/transformers/index.md b/docs/website/docs/examples/transformers/index.md deleted file mode 100644 index d0b397c5b7..0000000000 --- a/docs/website/docs/examples/transformers/index.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: Pokemon details in parallel using transformers -description: Learn how to use dlt transformers and how to speed up your loads with parallelism -keywords: [transformers, parallelism, example] ---- - -import Header from '../_examples-header.md'; - -
- - -## Using transformers with the Pokemon API - -For this example, we will be loading Pokemon data from the [PokeAPI](https://pokeapi.co/) with the help of transformers to load -Pokemon details in parallel. - -We'll learn how to: -- create 2 [transformers](../../general-usage/resource.md#feeding-data-from-one-resource-into-another) and connect them to a resource with the pipe operator `|`; -- [load these transformers in parallel](../../reference/performance.md#parallelism) using the `@dlt.defer` decorator; -- [configure parallelism](../../reference/performance.md#parallel-pipeline-config-example) in the `config.toml` file; -- deselect the main resource, so it will not be loaded into the database; -- importing and using a pre-configured `requests` library with automatic retries (`from dlt.sources.helpers import requests`). - -### Loading code - - - - - -### config.toml with examples how to configure parallelism - - diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 15c9c27512..bc8d16d05a 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -11,6 +11,19 @@ // @ts-check const fs = require('fs'); +const path = require('path'); + + +function *walkSync(dir) { + const files = fs.readdirSync(dir, { withFileTypes: true }); + for (const file of files) { + if (file.isDirectory()) { + yield* walkSync(path.join(dir, file.name)); + } else { + yield path.join(dir, file.name); + } + } +} /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ const sidebars = { @@ -273,15 +286,6 @@ const sidebars = { keywords: ['examples'], }, items: [ - 'examples/transformers/index', - 'examples/incremental_loading/index', - 'examples/connector_x_arrow/index', - 'examples/chess_production/index', - 'examples/nested_data/index', - 'examples/qdrant_zendesk/index', - 'examples/google_sheets/index', - 'examples/pdf_to_weaviate/index', - 'examples/custom_destination_bigquery/index' ], }, { @@ -309,6 +313,19 @@ const sidebars = { ] }; + +// insert examples +for (const item of sidebars.tutorialSidebar) { + if (item.label === 'Code examples') { + for (let examplePath of walkSync("./docs_processed/examples")) { + examplePath = examplePath.replace("docs_processed/", ""); + examplePath = examplePath.replace(".md", ""); + item.items.push(examplePath); + } + } +} + + // inject api reference if it exists if (fs.existsSync('./docs_processed/api_reference/sidebar.json')) { for (const item of sidebars.tutorialSidebar) { diff --git a/docs/website/tools/preprocess_docs.js b/docs/website/tools/preprocess_docs.js index 426ed295ac..edc3d5c021 100644 --- a/docs/website/tools/preprocess_docs.js +++ b/docs/website/tools/preprocess_docs.js @@ -15,10 +15,9 @@ const DOCS_EXTENSIONS = [".md", ".mdx"]; const SNIPPETS_FILE_SUFFIX = "-snippets.py" // examples settings -const EXAMPLES_SOURCE_DIR = "./docs/examples/"; -const EXAMPLES_DESTINATION_DIR = "../examples/"; -const EXAMPLES_MAIN_SNIPPET_NAME = "example"; -const EXAMPLES_CODE_SUBDIR = "/code"; +const EXAMPLES_DESTINATION_DIR = `./${MD_TARGET_DIR}examples/`; +const EXAMPLES_SOURCE_DIR = "../examples/"; +const EXAMPLES_EXCLUSIONS = [".", "_", "archive", "local_cache"] // markers const DLT_MARKER = "@@@DLT"; @@ -244,35 +243,110 @@ function preprocess_docs() { } +function trimArray(lines) { + if (lines.length == 0) { + return lines; + } + while (!lines[0].trim()) { + lines.shift(); + } + while (!lines[lines.length-1].trim()) { + lines.pop(); + } + return lines; +} + /** - * Sync examples into examples folder + * Sync examples into docs */ 
function syncExamples() { + + let count = 0; for (const exampleDir of listDirsSync(EXAMPLES_SOURCE_DIR)) { - const exampleName = exampleDir.split("/").slice(-1)[0]; - const exampleDestinationDir = EXAMPLES_DESTINATION_DIR + exampleName; - - // clear example destination dir - fs.rmSync(exampleDestinationDir, { recursive: true, force: true }); - // create __init__.py - fs.mkdirSync(exampleDestinationDir, { recursive: true }); - fs.writeFileSync(exampleDestinationDir + "/__init__.py", ""); - - // walk all files of example and copy to example destination - const exampleCodeDir = exampleDir + EXAMPLES_CODE_SUBDIR; - for (const fileName of walkSync(exampleCodeDir)) { - let lines = getSnippetFromFile(fileName, EXAMPLES_MAIN_SNIPPET_NAME); - if (!lines) { - continue; - } - lines = removeRemainingMarkers(lines); - // write file - const destinationFileName = exampleDestinationDir + fileName.replace(exampleCodeDir, "").replace("-snippets", ""); - fs.mkdirSync(path.dirname(destinationFileName), { recursive: true }); - fs.writeFileSync(destinationFileName, lines.join("\n")); + const exampleName = exampleDir.split("/").slice(-1)[0]; + + // exclude some folders + if (EXAMPLES_EXCLUSIONS.some(ex => exampleName.startsWith(ex))) { + continue; + } + + const exampleFile = `${EXAMPLES_SOURCE_DIR}${exampleName}/${exampleName}.py`; + const targetFileName = `${EXAMPLES_DESTINATION_DIR}/${exampleName}.md`; + const lines = fs.readFileSync(exampleFile, 'utf8').split(/\r?\n/); + + let commentCount = 0; + let headerCount = 0; + + // separate file content + const header = [] + const markdown = [] + const code = [] + + for (const line of lines) { + + // find file docstring boundaries + if (line.startsWith(`"""`)) { + commentCount += 1 + if (commentCount > 2) { + throw new Error(); + } + continue; } + + // find header boundaries + if (line.startsWith(`---`)) { + headerCount += 1; + if (headerCount > 2) { + throw new Error(); + } + continue; + } + + if (headerCount == 1) { + header.push(line); + } + else if (commentCount == 1) { + markdown.push(line) + } + else if (commentCount == 2) { + code.push(line); + } + + } + + // if there is no header, do not generate a page + if (headerCount == 0 ) { + continue; + } + + let output = []; + + output.push("---") + output = output.concat(header); + output.push("---") + + // add tip + output.push(":::info") + const url = `https://github.com/dlt-hub/dlt/tree/devel/docs/examples/${exampleName}` + output.push(`The source code for this example can be found in our repository at: `) + output.push(url); + output.push(":::") + + output.push("## About this Example") + output = output.concat(trimArray(markdown)); + + output.push("### Full source code") + output.push("```py"); + output = output.concat(trimArray(code)); + output.push("```"); + + fs.mkdirSync(path.dirname(targetFileName), { recursive: true }); + fs.writeFileSync(targetFileName, output.join("\n")); + + count += 1; } + console.log(`Synced ${count} examples`) } syncExamples();
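
For readers reviewing the new `syncExamples()` above: the parser reads `docs/examples/<name>/<name>.py`, skips folders whose names start with `.`, `_`, `archive`, or `local_cache`, splits the module docstring on `---` markers into frontmatter and markdown, and treats everything after the docstring as code. A minimal sketch of the layout an example script would need under this convention is shown below; the title, description, keywords, and pipeline body are hypothetical placeholders and are not part of this patch.

```py
"""
---
title: Minimal example
description: Hypothetical example script illustrating the layout the docs sync expects
keywords: [example]
---
Everything between the closing `---` and the closing docstring quotes ends up
under "About this Example" on the generated page; everything after the
docstring ends up under "Full source code".
"""
import dlt

# hypothetical pipeline body -- any runnable dlt snippet would do here
pipeline = dlt.pipeline(
    pipeline_name="minimal_example",
    destination="duckdb",
    dataset_name="minimal_example_data",
)
load_info = pipeline.run([{"id": 1}, {"id": 2}], table_name="items")
print(load_info)
```

Applied to such a file, the logic above would emit a page containing the frontmatter, an `:::info` admonition linking to `https://github.com/dlt-hub/dlt/tree/devel/docs/examples/<name>`, the docstring body under `## About this Example`, and the script itself under `### Full source code`; scripts without a `---` header are skipped entirely.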