examples for docs #616

Merged — 37 commits merged on Oct 4, 2023

Commits (37)
34dcc91
remove snipsync (#613)
sh-rp Sep 9, 2023
31d966f
remove snipsync
sh-rp Sep 11, 2023
9ac7ee9
Merge branch 'devel' into rfix/remove-snipsync
sh-rp Sep 11, 2023
180de6e
migrate performance snippets
sh-rp Sep 11, 2023
35a3c7d
add missing init files
sh-rp Sep 12, 2023
4201946
small changes
sh-rp Sep 12, 2023
88d6912
pr fixes
sh-rp Sep 14, 2023
5259d9d
refine snippet watcher
sh-rp Sep 17, 2023
11af788
docs examples
sh-rp Sep 11, 2023
1377cd1
pr fixes
sh-rp Sep 17, 2023
f3a8019
Merge branch 'devel' into d#/docs_examples
rudolfix Sep 20, 2023
919d488
fixes chess example
rudolfix Sep 20, 2023
780410a
fixes DLT -> dlt
rudolfix Sep 20, 2023
5ff9649
Merge branch 'devel' into d#/docs_examples
sh-rp Sep 27, 2023
4301dc0
some work on the transformer example
sh-rp Sep 27, 2023
78577ea
more work on transformers example
sh-rp Sep 27, 2023
f56dc6d
make header smaller
sh-rp Sep 27, 2023
1243b46
example for zendesk incremental loading
AstrakhantsevaAA Sep 27, 2023
a99fa4c
move incremental loading example to right location
sh-rp Sep 27, 2023
ed7ede0
added text and output example to incremental zendesk
AstrakhantsevaAA Sep 27, 2023
eeafd62
force build
sh-rp Sep 28, 2023
84a83c1
allow secrets files in examples
sh-rp Sep 28, 2023
eb7b780
clean up examples
sh-rp Sep 28, 2023
f1b7c93
add zendesk credentials
sh-rp Sep 28, 2023
f6642c7
fix zendesk example secrets pt 2
sh-rp Sep 28, 2023
98c3ea7
correct text and code snippets for zendesk example
AstrakhantsevaAA Sep 28, 2023
caea641
Merge branch 'devel' into d#/docs_examples
sh-rp Sep 28, 2023
4bb868a
some PR fixes
sh-rp Oct 2, 2023
ceb3fa1
Merge branch 'devel' into d#/docs_examples
sh-rp Oct 2, 2023
c6a503c
add main clause
sh-rp Oct 2, 2023
32ce681
add config example
sh-rp Oct 2, 2023
07e8466
Merge branch 'devel' into d#/docs_examples
rudolfix Oct 3, 2023
5a8fe67
pytest marker to skip tests running in PRs from forks on github
rudolfix Oct 3, 2023
47f885a
improvements in pokemon example
rudolfix Oct 3, 2023
47d0465
fixes mypy
rudolfix Oct 3, 2023
5a92beb
removes more typings and adds comments to zendesk example
rudolfix Oct 4, 2023
0fcada4
shortens example titles
rudolfix Oct 4, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test_doc_snippets.yml
@@ -19,7 +19,7 @@ env:
DESTINATION__WEAVIATE__MODULE_CONFIG: "{\"text2vec-contextionary\": {\"vectorizeClassName\": false, \"vectorizePropertyName\": true}}"

# zendesk vars for example
ZENDESK__CREDENTIALS: ${{ secrets.ZENDESK__CREDENTIALS }}
SOURCES__ZENDESK__CREDENTIALS: ${{ secrets.ZENDESK__CREDENTIALS }}

jobs:

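The rename above prefixes the secret with `SOURCES__`, which (by dlt's environment-variable naming convention, as I understand it: config path segments uppercased and joined with double underscores) routes the value into the `[sources.zendesk]` config section. A minimal sketch of that mapping — the `env_key` helper is hypothetical, not part of dlt:

```python
# Hypothetical helper illustrating dlt-style env-var naming:
# config path segments are uppercased and joined with "__".
def env_key(*path: str) -> str:
    """Build an environment-variable key from a config path."""
    return "__".join(segment.upper() for segment in path)

# The PR renames the secret so it resolves into [sources.zendesk]:
print(env_key("sources", "zendesk", "credentials"))
# SOURCES__ZENDESK__CREDENTIALS
```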
2 changes: 2 additions & 0 deletions Makefile
@@ -27,6 +27,8 @@ help:
	@echo "		tests all components using local destinations: duckdb and postgres"
@echo " test-common"
@echo " tests common components"
@echo " test-and-lint-snippets"
@echo " tests and lints snippets and examples in docs"
@echo " build-library"
@echo " makes dev and then builds dlt package for distribution"
@echo " publish-library"
26 changes: 26 additions & 0 deletions docs/examples/.dlt/secrets.toml
@@ -0,0 +1,26 @@
# here is a file with the secrets for all the example pipelines in `examples` folder
Review comment (Collaborator Author):
@rudolfix should this be here?


[sources]
# redshift password for query tables example
query_table.credentials.password="8P5gyDPNo9zo582rQG6a"
query_sql.credentials.password="8P5gyDPNo9zo582rQG6a"

# google sheets example
[sources.google_spreadsheet.credentials]
project_id="chat-analytics-317513"
client_email="[email protected]"
private_key="-----BEGIN PRIVATE KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n7inCg8CdRKEMZ/q7Rv5uUiTyUMjQLNXySOPRSSJBSXBPpLJPbcmfxCYgOPWadA3F\noa54WJFR3Uxd+SjAC848dGz5+JEL5u2rHcjzL1IbjDd5oH9rap/QxYm/R9Q5eSdC\nlGiiFh4zH+U9nhWWUovn+ofixbQkhrMFOfgHt+Jvdh/2m7Sdz47itjWFC258R1Ki\nH9vPVtHxw0LrcUZN7HACV3NICRUkvX8U2+JC25esmPxc/qqmxoFlQ7ono/NeMjQa\nq2TyTyNSh4pDtf30PAW4cU2AUbtaSPhIdRloDuwzW9d88VUUbVelqHHeJRgGjkSX\nQz2MCBuFAgMBAAECgf8zlepWYEgm8xtdim2ydB3mdhR/zoZmZM/Q1NthJ8u/IbdO\nb2HPEXxGbDKIIJzhAA17Un98leBLwYKuLZhOpdB+igyJlTG8XlCRF88XiUosJWR3\niHmiuMkndHA7WyTXDc0n3GpUFYWkGGng2cKLx7V/OFmpMwhC9LEKMNOrBKnf9U6Z\n/9nanIerFZU4m5mWbNW/ZRc+qvd+1zGw/JYM6ntdkKLo/TwNOmOS5FS01yLvx7Xw\nm12f9I3VceGXWyrYEh+UCWk0gsEb8xnGGZKy3op5I6trsXzH8I3HCXvapkeWSaFe\n/gmT3CHZIK9hang6f4yMG+niuNtZE2/METgvcjkCgYEAwTg1SZAYSaL6LoFV92Kq\nyHV0DP8KivDIKrByOym9oikPK/2ZMNi9NivVmSALuR54wj7pFxFmyEj6UTklSeLb\nRvOjcPnZEMbFspRHIzkySfsnfScoHZXOeakjOub1K5FehYsLXQIfe7iwRg/mcd/2\noFVyJrm2aNXcvNuug4scEE0CgYEAuuaRmGY5mKv+viuZ/zzOky7IjDnp4w2BMJt0\noMXznKuLHJpexnQ9A/ZHxpAp6Bi6Glk0XLi2uaI+ggXlEUfNa3DHMQu7xg1RaCqN\n957WGRO0ETtIWdp1BHhWPtT5kdOrjSZy9vRSZ0vh2WnZe5SgKRVCqQsV7ExcEltz\nUc9WlBkCgYA9MaQOzEgk6iz6FZQ4aVNVcX1zsEKShneerYtAGZQpi392mzatNbeX\nNILNoEyWMIRmYK5J1AUNYa+FkeexYtu3uOoGmdqZaZqrWDK/gRngPF7hUElwNUXT\nWjICMatsRPn+qW7L4iQ+dtu9FMQTRK9DUEx6305aHYFvftPibWhR8QKBgQCAd3GG\nNmXKihaMsr2kUjCPvG1+7WPVfHfbaE9PHyFnBAaXv4f7kvRJn+QQGRGlBjINYFl8\njj6S9HFQwCqGqTsKabeQ/8auyIK3PeDdXqE9FW0FFyGRGXarfueRQqTU1pCpcc89\n7gwiEmeIIJiruCoqcwGh3gvQo1/6AkAO8JxLKQKBgF0T8P0hRctXFejcFf/4EikS\n2+WA/gNSQITC1m+8nWNnU+bDmRax+pIkzlvjkG5kyNfWvB7i2A5Y5OnCo92y5aDQ\nzbGHLwZj0HXqLFXhbAv/0xZPXlZ71NWpi2BpCJRnzU65ftsjePfydfvN6g4mPQ28\nkHQsYKUZk5HPC8FlPvQe\n-----END PRIVATE KEY-----\n"

[destination]
# all postgres destinations for all examples
postgres.credentials = "postgres://loader:loader@localhost:5432/dlt_data"
# all redshift destinations for all examples
redshift.credentials = "postgres://loader:8P5gyDPNo9zo582rQG6a@chat-analytics.czwteevq7bpe.eu-central-1.redshift.amazonaws.com:5439/chat_analytics_rasa"

# all the bigquery destinations
[destination.bigquery.credentials]
project_id="chat-analytics-317513"
client_email="[email protected]"
private_key="-----BEGIN PRIVATE KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n7inCg8CdRKEMZ/q7Rv5uUiTyUMjQLNXySOPRSSJBSXBPpLJPbcmfxCYgOPWadA3F\noa54WJFR3Uxd+SjAC848dGz5+JEL5u2rHcjzL1IbjDd5oH9rap/QxYm/R9Q5eSdC\nlGiiFh4zH+U9nhWWUovn+ofixbQkhrMFOfgHt+Jvdh/2m7Sdz47itjWFC258R1Ki\nH9vPVtHxw0LrcUZN7HACV3NICRUkvX8U2+JC25esmPxc/qqmxoFlQ7ono/NeMjQa\nq2TyTyNSh4pDtf30PAW4cU2AUbtaSPhIdRloDuwzW9d88VUUbVelqHHeJRgGjkSX\nQz2MCBuFAgMBAAECgf8zlepWYEgm8xtdim2ydB3mdhR/zoZmZM/Q1NthJ8u/IbdO\nb2HPEXxGbDKIIJzhAA17Un98leBLwYKuLZhOpdB+igyJlTG8XlCRF88XiUosJWR3\niHmiuMkndHA7WyTXDc0n3GpUFYWkGGng2cKLx7V/OFmpMwhC9LEKMNOrBKnf9U6Z\n/9nanIerFZU4m5mWbNW/ZRc+qvd+1zGw/JYM6ntdkKLo/TwNOmOS5FS01yLvx7Xw\nm12f9I3VceGXWyrYEh+UCWk0gsEb8xnGGZKy3op5I6trsXzH8I3HCXvapkeWSaFe\n/gmT3CHZIK9hang6f4yMG+niuNtZE2/METgvcjkCgYEAwTg1SZAYSaL6LoFV92Kq\nyHV0DP8KivDIKrByOym9oikPK/2ZMNi9NivVmSALuR54wj7pFxFmyEj6UTklSeLb\nRvOjcPnZEMbFspRHIzkySfsnfScoHZXOeakjOub1K5FehYsLXQIfe7iwRg/mcd/2\noFVyJrm2aNXcvNuug4scEE0CgYEAuuaRmGY5mKv+viuZ/zzOky7IjDnp4w2BMJt0\noMXznKuLHJpexnQ9A/ZHxpAp6Bi6Glk0XLi2uaI+ggXlEUfNa3DHMQu7xg1RaCqN\n957WGRO0ETtIWdp1BHhWPtT5kdOrjSZy9vRSZ0vh2WnZe5SgKRVCqQsV7ExcEltz\nUc9WlBkCgYA9MaQOzEgk6iz6FZQ4aVNVcX1zsEKShneerYtAGZQpi392mzatNbeX\nNILNoEyWMIRmYK5J1AUNYa+FkeexYtu3uOoGmdqZaZqrWDK/gRngPF7hUElwNUXT\nWjICMatsRPn+qW7L4iQ+dtu9FMQTRK9DUEx6305aHYFvftPibWhR8QKBgQCAd3GG\nNmXKihaMsr2kUjCPvG1+7WPVfHfbaE9PHyFnBAaXv4f7kvRJn+QQGRGlBjINYFl8\njj6S9HFQwCqGqTsKabeQ/8auyIK3PeDdXqE9FW0FFyGRGXarfueRQqTU1pCpcc89\n7gwiEmeIIJiruCoqcwGh3gvQo1/6AkAO8JxLKQKBgF0T8P0hRctXFejcFf/4EikS\n2+WA/gNSQITC1m+8nWNnU+bDmRax+pIkzlvjkG5kyNfWvB7i2A5Y5OnCo92y5aDQ\nzbGHLwZj0HXqLFXhbAv/0xZPXlZ71NWpi2BpCJRnzU65ftsjePfydfvN6g4mPQ28\nkHQsYKUZk5HPC8FlPvQe\n-----END PRIVATE KEY-----\n"


2 changes: 1 addition & 1 deletion docs/examples/incremental_loading/.dlt/secrets.toml
@@ -1,4 +1,4 @@
[sources.zendesk_support.credentials]
[sources.zendesk.credentials]
password = ""
subdomain = ""
token = ""
100 changes: 49 additions & 51 deletions docs/examples/transformers/pokemon.py
@@ -1,56 +1,54 @@
from typing import Sequence, Iterable
import dlt
from dlt.common.typing import TDataItem
from dlt.extract.source import DltResource
from dlt.sources.helpers import requests

# constants
POKEMON_URL = "https://pokeapi.co/api/v2/pokemon"

# retrieve pokemon list
@dlt.resource(write_disposition="replace", selected=False)
def pokemon_list() -> Iterable[TDataItem]:
"""
Returns an iterator of pokemon
Yields:
dict: The pokemon list data.
"""
yield from requests.get(POKEMON_URL).json()["results"]

# asynchronously retrieve details for each pokemon in the list
@dlt.transformer()
@dlt.defer
def pokemon(pokemon: TDataItem):
"""
Returns an iterator of pokemon deatils
Yields:
dict: The pokemon full data.
"""
# just return the results, if you yield,
# generator will be evaluated in main thread
return requests.get(pokemon["url"]).json()


# asynchronously retrieve details for the species of each pokemon
@dlt.transformer()
@dlt.defer
def species(pokemon: TDataItem):
"""
Returns an iterator of species details for each pokemon
Yields:
dict: The species full data.
"""
# just return the results, if you yield,
# generator will be evaluated in main thread
species_data = requests.get(pokemon["species"]["url"]).json()
# optionally add pokemon_id to result json, to later be able
# to join tables
species_data["pokemon_id"] = pokemon["id"]
return species_data

@dlt.source
def source():
return [pokemon_list | pokemon, pokemon_list | pokemon | species]

@dlt.source(max_table_nesting=2)
def source(pokemon_api_url: str):
""""""

# note that we deselect `pokemon_list` - we do not want it to be loaded
@dlt.resource(write_disposition="replace", selected=False)
def pokemon_list():
"""Retrieve a first page of Pokemons and yield it. We do not retrieve all the pages in this example"""
yield requests.get(pokemon_api_url).json()["results"]

# transformer that retrieves a list of objects in parallel
@dlt.transformer
def pokemon(pokemons):
"""Yields details for a list of `pokemons`"""

# @dlt.defer marks a function to be executed in parallel
# in a thread pool
@dlt.defer
def _get_pokemon(_pokemon):
return requests.get(_pokemon["url"]).json()

# call and yield the function result normally, the @dlt.defer takes care of parallelism
for _pokemon in pokemons:
yield _get_pokemon(_pokemon)

# a special case where just one item is retrieved in transformer
# a whole transformer may be marked for parallel execution
@dlt.transformer
@dlt.defer
def species(pokemon_details):
"""Yields species details for a pokemon"""
species_data = requests.get(pokemon_details["species"]["url"]).json()
# link back to pokemon so we have a relation in loaded data
species_data["pokemon_id"] = pokemon_details["id"]
# just return the results, if you yield,
# generator will be evaluated in main thread
return species_data

# create two simple pipelines with | operator
# 1. send list of pokemons into `pokemon` transformer to get pokemon details
# 2. send pokemon details into `species` transformer to get species details
# NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once

return (
pokemon_list | pokemon,
pokemon_list | pokemon | species
)

if __name__ == "__main__":
# build duck db pipeline
@@ -59,5 +57,5 @@ def source():
)

# the pokemon_list resource does not need to be loaded
load_info = pipeline.run(source())
load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon"))
print(load_info)
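The comments in the rewritten example note that `@dlt.defer` marks a function for execution in a thread pool, while a plain `yield` would be evaluated in the main thread. A rough stdlib stand-in for that pattern — an illustration of the idea, not dlt's actual implementation:

```python
# Rough stand-in for the @dlt.defer pattern: submit the wrapped call to
# a thread pool and resolve the futures when results are consumed.
# This is an illustration only, not dlt's real decorator.
from concurrent.futures import ThreadPoolExecutor, Future
from functools import wraps

_POOL = ThreadPoolExecutor(max_workers=5)

def defer(func):
    @wraps(func)
    def wrapper(*args, **kwargs) -> Future:
        # calling the function returns immediately with a Future
        return _POOL.submit(func, *args, **kwargs)
    return wrapper

@defer
def get_detail(item: str) -> str:
    # stands in for requests.get(_pokemon["url"]).json()
    return f"details for {item}"

# calls fan out to the pool; results are awaited when consumed,
# so the requests overlap instead of running one after another
futures = [get_detail(name) for name in ("bulbasaur", "ivysaur")]
results = [f.result() for f in futures]
print(results)
# → ['details for bulbasaur', 'details for ivysaur']
```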
Empty file.
@@ -1,5 +1,5 @@
# @@@DLT_SNIPPET_START example
[sources.zendesk_support.credentials]
[sources.zendesk.credentials]
password = ""
subdomain = ""
token = ""
@@ -1,5 +1,11 @@
from tests.utils import skipifgithubfork

# because the example below uses credentials and it is copied to the module zendesk.py
# we force the same config section name
__source_name__ = "zendesk"


@skipifgithubfork
def incremental_snippet() -> None:

# @@@DLT_SNIPPET_START example
2 changes: 1 addition & 1 deletion docs/website/docs/examples/incremental_loading/index.md
@@ -19,7 +19,7 @@ In this example, you'll find a Python script that interacts with the Zendesk Sup

We'll learn:

- How to pass [credentials](../../general-usage/credentials) as dict.
- How to pass [credentials](../../general-usage/credentials) as dict
- How to set [the nesting level](../../general-usage/source#reduce-the-nesting-level-of-generated-tables).
- How to enable [incremental loading](../../general-usage/incremental-loading) for efficient data extraction.
- How to specify [the start and end dates](../../general-usage/incremental-loading#using-dltsourcesincremental-for-backfill) for the data loading and how to [opt-in to Airflow scheduler](../../general-usage/incremental-loading#using-airflow-schedule-for-backfill-and-incremental-loading) by setting `allow_external_schedulers` to `True`.
92 changes: 45 additions & 47 deletions docs/website/docs/examples/transformers/code/pokemon-snippets.py
@@ -2,59 +2,57 @@
def transformers_snippet() -> None:

# @@@DLT_SNIPPET_START example
from typing import Sequence, Iterable
import dlt
from dlt.common.typing import TDataItem
from dlt.extract.source import DltResource
from dlt.sources.helpers import requests

# constants
POKEMON_URL = "https://pokeapi.co/api/v2/pokemon"

# retrieve pokemon list
@dlt.resource(write_disposition="replace", selected=False)
def pokemon_list() -> Iterable[TDataItem]:
"""
Returns an iterator of pokemon
Yields:
dict: The pokemon list data.
"""
yield from requests.get(POKEMON_URL).json()["results"]
@dlt.source(max_table_nesting=2)
def source(pokemon_api_url: str):
""""""

# asynchronously retrieve details for each pokemon in the list
@dlt.transformer()
@dlt.defer
def pokemon(pokemon: TDataItem):
"""
Returns an iterator of pokemon deatils
Yields:
dict: The pokemon full data.
"""
# just return the results, if you yield,
# generator will be evaluated in main thread
return requests.get(pokemon["url"]).json()
# note that we deselect `pokemon_list` - we do not want it to be loaded
@dlt.resource(write_disposition="replace", selected=False)
def pokemon_list():
"""Retrieve a first page of Pokemons and yield it. We do not retrieve all the pages in this example"""
yield requests.get(pokemon_api_url).json()["results"]

# transformer that retrieves a list of objects in parallel
@dlt.transformer
def pokemon(pokemons):
"""Yields details for a list of `pokemons`"""

# asynchronously retrieve details for the species of each pokemon
@dlt.transformer()
@dlt.defer
def species(pokemon: TDataItem):
"""
Returns an iterator of species details for each pokemon
Yields:
dict: The species full data.
"""
# just return the results, if you yield,
# generator will be evaluated in main thread
species_data = requests.get(pokemon["species"]["url"]).json()
# optionally add pokemon_id to result json, to later be able
# to join tables
species_data["pokemon_id"] = pokemon["id"]
return species_data

@dlt.source
def source():
return [pokemon_list | pokemon, pokemon_list | pokemon | species]
# @dlt.defer marks a function to be executed in parallel
# in a thread pool
@dlt.defer
def _get_pokemon(_pokemon):
return requests.get(_pokemon["url"]).json()

# call and yield the function result normally, the @dlt.defer takes care of parallelism
for _pokemon in pokemons:
yield _get_pokemon(_pokemon)

# a special case where just one item is retrieved in transformer
# a whole transformer may be marked for parallel execution
@dlt.transformer
@dlt.defer
def species(pokemon_details):
"""Yields species details for a pokemon"""
species_data = requests.get(pokemon_details["species"]["url"]).json()
# link back to pokemon so we have a relation in loaded data
species_data["pokemon_id"] = pokemon_details["id"]
# just return the results, if you yield,
# generator will be evaluated in main thread
return species_data

# create two simple pipelines with | operator
# 1. send list of pokemons into `pokemon` transformer to get pokemon details
# 2. send pokemon details into `species` transformer to get species details
# NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once

return (
pokemon_list | pokemon,
pokemon_list | pokemon | species
)

__name__ = "__main__" # @@@DLT_REMOVE
if __name__ == "__main__":
@@ -64,7 +62,7 @@ def source():
)

# the pokemon_list resource does not need to be loaded
load_info = pipeline.run(source())
load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon"))
print(load_info)
# @@@DLT_SNIPPET_END example
