From 4301dc0f2a23dd3c3b601f2aaa728dae8f9540bd Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 27 Sep 2023 13:48:06 +0200 Subject: [PATCH] some work on the transformer example --- docs/examples/transformers/.dlt/config.toml | 3 + .../__init__.py | 0 docs/examples/transformers/run.py | 56 ++++++++++++++ .../.dlt/config.toml | 5 -- .../transformers_and_parallelism/run.py | 46 ----------- .../__init__.py | 0 .../code/.dlt/config.toml | 2 - .../code/__init__.py | 0 .../transformers/code/run-snippets.py | 67 ++++++++++++++++ .../docs/examples/transformers/index.md | 76 +++++++++++++++++++ .../code/run-snippets.py | 55 -------------- .../transformers_and_parallelism/index.md | 57 -------------- docs/website/docusaurus.config.js | 7 -- docs/website/sidebars.js | 2 +- 14 files changed, 203 insertions(+), 173 deletions(-) create mode 100644 docs/examples/transformers/.dlt/config.toml rename docs/examples/{transformers_and_parallelism => transformers}/__init__.py (100%) create mode 100644 docs/examples/transformers/run.py delete mode 100644 docs/examples/transformers_and_parallelism/.dlt/config.toml delete mode 100644 docs/examples/transformers_and_parallelism/run.py rename docs/website/docs/examples/{transformers_and_parallelism => transformers}/__init__.py (100%) rename docs/website/docs/examples/{transformers_and_parallelism => transformers}/code/.dlt/config.toml (61%) rename docs/website/docs/examples/{transformers_and_parallelism => transformers}/code/__init__.py (100%) create mode 100644 docs/website/docs/examples/transformers/code/run-snippets.py create mode 100644 docs/website/docs/examples/transformers/index.md delete mode 100644 docs/website/docs/examples/transformers_and_parallelism/code/run-snippets.py delete mode 100644 docs/website/docs/examples/transformers_and_parallelism/index.md diff --git a/docs/examples/transformers/.dlt/config.toml b/docs/examples/transformers/.dlt/config.toml new file mode 100644 index 0000000000..9e39d8e277 --- /dev/null +++ 
b/docs/examples/transformers/.dlt/config.toml @@ -0,0 +1,3 @@ + +[runtime] +log_level="WARNING" \ No newline at end of file diff --git a/docs/examples/transformers_and_parallelism/__init__.py b/docs/examples/transformers/__init__.py similarity index 100% rename from docs/examples/transformers_and_parallelism/__init__.py rename to docs/examples/transformers/__init__.py diff --git a/docs/examples/transformers/run.py b/docs/examples/transformers/run.py new file mode 100644 index 0000000000..3e8125d326 --- /dev/null +++ b/docs/examples/transformers/run.py @@ -0,0 +1,56 @@ +from typing import Sequence, Iterable +import dlt +from dlt.common.typing import TDataItem +from dlt.extract.source import DltResource +from dlt.sources.helpers import requests + +# constants +POKEMON_URL = "https://pokeapi.co/api/v2/pokemon" + +# retrieve pokemon list +@dlt.resource(write_disposition="replace") +def pokemon_list() -> Iterable[TDataItem]: + """ + Returns an iterator of pokemon + Yields: + dict: The pokemon list data. + """ + yield from requests.get(POKEMON_URL).json()["results"] + +# asynchronously retrieve details for each pokemon in the list +@dlt.transformer(data_from=pokemon_list) +async def pokemon(pokemon: TDataItem): + """ + Returns an iterator of pokemon details + Yields: + dict: The pokemon full data. + """ + # just return the results, if you yield, + # generator will be evaluated in main thread + return requests.get(pokemon["url"]).json() + + +# asynchronously retrieve details for the species of each pokemon +@dlt.transformer(data_from=pokemon) +async def species(pokemon: TDataItem): + """ + Returns an iterator of species details for each pokemon + Yields: + dict: The species full data. 
+ """ + # just return the results, if you yield, + # generator will be evaluated in main thread + species_data = requests.get(pokemon["species"]["url"]).json() + # optionally add pokemon_id to result json + species_data["pokemon_id"] = pokemon["id"] + return species_data + + +# build duck db pipeline +pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" +) + +# the pokemon_list resource does not need to be loaded +load_info = pipeline.run([pokemon(), species()]) +print(load_info) \ No newline at end of file diff --git a/docs/examples/transformers_and_parallelism/.dlt/config.toml b/docs/examples/transformers_and_parallelism/.dlt/config.toml deleted file mode 100644 index 922a8e065a..0000000000 --- a/docs/examples/transformers_and_parallelism/.dlt/config.toml +++ /dev/null @@ -1,5 +0,0 @@ -pokemon_url="https://pokeapi.co/api/v2/pokemon" -berry_url="https://pokeapi.co/api/v2/berry" - -[runtime] -log_level="WARNING" \ No newline at end of file diff --git a/docs/examples/transformers_and_parallelism/run.py b/docs/examples/transformers_and_parallelism/run.py deleted file mode 100644 index 679e216b3e..0000000000 --- a/docs/examples/transformers_and_parallelism/run.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -This source provides data extraction from an example source as a starting point for new pipelines. -Available resources: [berries, pokemon] -""" - -from typing import Sequence, Iterable -import dlt -from dlt.common.typing import TDataItem -from dlt.extract.source import DltResource -from dlt.sources.helpers import requests - - -@dlt.resource(write_disposition="replace") -def berries(berry_url: str) -> Iterable[TDataItem]: - """ - Returns a list of berries. - Yields: - dict: The berries data. - """ - yield requests.get(berry_url).json()["results"] - - -@dlt.resource(write_disposition="replace") -def pokemon(pokemon_url: str) -> Iterable[TDataItem]: - """ - Returns a list of pokemon. - Yields: - dict: The pokemon data. 
- """ - yield requests.get(pokemon_url).json()["results"] - - -@dlt.source -def source(pokemon_url: str = dlt.config.value, berry_url: str = dlt.config.value) -> Sequence[DltResource]: - """ - The source function that returns all availble resources. - Returns: - Sequence[DltResource]: A sequence of DltResource objects containing the fetched data. - """ - return [berries(berry_url), pokemon(pokemon_url)] - -pipeline = dlt.pipeline( - pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" -) -load_info = pipeline.run(source()) -print(load_info) \ No newline at end of file diff --git a/docs/website/docs/examples/transformers_and_parallelism/__init__.py b/docs/website/docs/examples/transformers/__init__.py similarity index 100% rename from docs/website/docs/examples/transformers_and_parallelism/__init__.py rename to docs/website/docs/examples/transformers/__init__.py diff --git a/docs/website/docs/examples/transformers_and_parallelism/code/.dlt/config.toml b/docs/website/docs/examples/transformers/code/.dlt/config.toml similarity index 61% rename from docs/website/docs/examples/transformers_and_parallelism/code/.dlt/config.toml rename to docs/website/docs/examples/transformers/code/.dlt/config.toml index 0864af1376..07a6d670af 100644 --- a/docs/website/docs/examples/transformers_and_parallelism/code/.dlt/config.toml +++ b/docs/website/docs/examples/transformers/code/.dlt/config.toml @@ -1,7 +1,5 @@ # @@@DLT_SNIPPET_START example # @@@DLT_SNIPPET_START toml -pokemon_url="https://pokeapi.co/api/v2/pokemon" -berry_url="https://pokeapi.co/api/v2/berry" # @@@DLT_SNIPPET_END toml [runtime] diff --git a/docs/website/docs/examples/transformers_and_parallelism/code/__init__.py b/docs/website/docs/examples/transformers/code/__init__.py similarity index 100% rename from docs/website/docs/examples/transformers_and_parallelism/code/__init__.py rename to docs/website/docs/examples/transformers/code/__init__.py diff --git 
a/docs/website/docs/examples/transformers/code/run-snippets.py b/docs/website/docs/examples/transformers/code/run-snippets.py new file mode 100644 index 0000000000..d2f5370224 --- /dev/null +++ b/docs/website/docs/examples/transformers/code/run-snippets.py @@ -0,0 +1,67 @@ + +def transformers_snippet() -> None: + + # @@@DLT_SNIPPET_START example + from typing import Sequence, Iterable + import dlt + from dlt.common.typing import TDataItem + from dlt.extract.source import DltResource + from dlt.sources.helpers import requests + + # constants + POKEMON_URL = "https://pokeapi.co/api/v2/pokemon" + + # retrieve pokemon list + @dlt.resource(write_disposition="replace") + def pokemon_list() -> Iterable[TDataItem]: + """ + Returns an iterator of pokemon + Yields: + dict: The pokemon list data. + """ + yield from requests.get(POKEMON_URL).json()["results"] + + # asynchronously retrieve details for each pokemon in the list + @dlt.transformer(data_from=pokemon_list) + async def pokemon(pokemon: TDataItem): + """ + Returns an iterator of pokemon details + Yields: + dict: The pokemon full data. + """ + # just return the results, if you yield, + # generator will be evaluated in main thread + return requests.get(pokemon["url"]).json() + + + # asynchronously retrieve details for the species of each pokemon + @dlt.transformer(data_from=pokemon) + async def species(pokemon: TDataItem): + """ + Returns an iterator of species details for each pokemon + Yields: + dict: The species full data. 
+ """ + # just return the results, if you yield, + # generator will be evaluated in main thread + species_data = requests.get(pokemon["species"]["url"]).json() + # optionally add pokemon_id to result json + species_data["pokemon_id"] = pokemon["id"] + return species_data + + + # build duck db pipeline + pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" + ) + + # the pokemon_list resource does not need to be loaded + load_info = pipeline.run([pokemon(), species()]) + print(load_info) + # @@@DLT_SNIPPET_END example + + # test assertions + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["pokemon"] == 20 + assert row_counts["species"] == 20 + assert "pokemon_list" not in row_counts diff --git a/docs/website/docs/examples/transformers/index.md b/docs/website/docs/examples/transformers/index.md new file mode 100644 index 0000000000..c947eeac27 --- /dev/null +++ b/docs/website/docs/examples/transformers/index.md @@ -0,0 +1,76 @@ +--- +title: Enriching loaded data with transformers +description: Learn how to use dlt transformers and how to speed up your loads with parallelism +keywords: [transformers, parallelism, example] +--- + +import Header from '../_examples-header.md'; + +
+ + +## Use transformers + +```py +from typing import Sequence, Iterable +import dlt +from dlt.common.typing import TDataItem +from dlt.extract.source import DltResource +from dlt.sources.helpers import requests + +# constants +POKEMON_URL = "https://pokeapi.co/api/v2/pokemon" + +# retrieve pokemon list +@dlt.resource(write_disposition="replace") +def pokemon_list() -> Iterable[TDataItem]: + """ + Returns an iterator of pokemon + Yields: + dict: The pokemon list data. + """ + yield from requests.get(POKEMON_URL).json()["results"] + +# asynchronously retrieve details for each pokemon in the list +@dlt.transformer(data_from=pokemon_list) +async def pokemon(pokemon: TDataItem): + """ + Returns an iterator of pokemon details + Yields: + dict: The pokemon full data. + """ + # just return the results, if you yield, + # generator will be evaluated in main thread + return requests.get(pokemon["url"]).json() + + +# asynchronously retrieve details for the species of each pokemon +@dlt.transformer(data_from=pokemon) +async def species(pokemon: TDataItem): + """ + Returns an iterator of species details for each pokemon + Yields: + dict: The species full data. 
+ """ + # just return the results, if you yield, + # generator will be evaluated in main thread + species_data = requests.get(pokemon["species"]["url"]).json() + # optionally add pokemon_id to result json + species_data["pokemon_id"] = pokemon["id"] + return species_data + + +# build duck db pipeline +pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" +) + +# the pokemon_list resource does not need to be loaded +load_info = pipeline.run([pokemon(), species()]) +print(load_info) +``` + diff --git a/docs/website/docs/examples/transformers_and_parallelism/code/run-snippets.py b/docs/website/docs/examples/transformers_and_parallelism/code/run-snippets.py deleted file mode 100644 index f1bfdc2456..0000000000 --- a/docs/website/docs/examples/transformers_and_parallelism/code/run-snippets.py +++ /dev/null @@ -1,55 +0,0 @@ - -def transformers_snippet() -> None: - # @@@DLT_SNIPPET_START example - - """ - This source provides data extraction from an example source as a starting point for new pipelines. - Available resources: [berries, pokemon] - """ - - from typing import Sequence, Iterable - import dlt - from dlt.common.typing import TDataItem - from dlt.extract.source import DltResource - from dlt.sources.helpers import requests - - - # @@@DLT_SNIPPET_START snippet1 - @dlt.resource(write_disposition="replace") - def berries(berry_url: str) -> Iterable[TDataItem]: - """ - Returns a list of berries. - Yields: - dict: The berries data. - """ - yield requests.get(berry_url).json()["results"] - # @@@DLT_SNIPPET_END snippet1 - - - # @@@DLT_SNIPPET_START snippet2 - @dlt.resource(write_disposition="replace") - def pokemon(pokemon_url: str) -> Iterable[TDataItem]: - """ - Returns a list of pokemon. - Yields: - dict: The pokemon data. 
- """ - yield requests.get(pokemon_url).json()["results"] - # @@@DLT_SNIPPET_END snippet2 - - - @dlt.source - def source(pokemon_url: str = dlt.config.value, berry_url: str = dlt.config.value) -> Sequence[DltResource]: - """ - The source function that returns all availble resources. - Returns: - Sequence[DltResource]: A sequence of DltResource objects containing the fetched data. - """ - return [berries(berry_url), pokemon(pokemon_url)] - - pipeline = dlt.pipeline( - pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" - ) - load_info = pipeline.run(source()) - print(load_info) - # @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/transformers_and_parallelism/index.md b/docs/website/docs/examples/transformers_and_parallelism/index.md deleted file mode 100644 index 1eee4a944d..0000000000 --- a/docs/website/docs/examples/transformers_and_parallelism/index.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: Transformers and Parallelism -description: Learn how to use dlt transformers and how to speed up your loads with parallelism -keywords: [transformers, parallelism, example] ---- - -import Header from '../_examples-header.md'; - -
- - -## Use transformers - -Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. - -### Toml File -Labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. - -```toml -pokemon_url="https://pokeapi.co/api/v2/pokemon" -berry_url="https://pokeapi.co/api/v2/berry" -``` - - - -### Python files -Do this to use transformers - -```py -@dlt.resource(write_disposition="replace") -def berries(berry_url: str) -> Iterable[TDataItem]: - """ - Returns a list of berries. - Yields: - dict: The berries data. - """ - yield requests.get(berry_url).json()["results"] -``` - - -Do this to enable async loading - -```py -@dlt.resource(write_disposition="replace") -def pokemon(pokemon_url: str) -> Iterable[TDataItem]: - """ - Returns a list of pokemon. - Yields: - dict: The pokemon data. 
- """ - yield requests.get(pokemon_url).json()["results"] -``` - \ No newline at end of file diff --git a/docs/website/docusaurus.config.js b/docs/website/docusaurus.config.js index c497d98261..31f91d6feb 100644 --- a/docs/website/docusaurus.config.js +++ b/docs/website/docusaurus.config.js @@ -84,13 +84,6 @@ const config = { label: 'Docs', }, { to: 'blog', label: 'Blog', position: 'left' }, - { - type: 'doc', - docId: 'examples', - label: 'Code Examples', - position:'right', - className: 'examples-link', - }, { href: 'https://join.slack.com/t/dlthub-community/shared_invite/zt-1slox199h-HAE7EQoXmstkP_bTqal65g', label: '.', diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index ef4bbf7e10..90096669ec 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -230,7 +230,7 @@ const sidebars = { keywords: ['examples'], }, items: [ - 'examples/transformers_and_parallelism/index', + 'examples/transformers/index', 'examples/incremental_loading/index', ], },