-
Notifications
You must be signed in to change notification settings - Fork 186
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #619 from dlt-hub/rfix/updates-performance-docs
updates performance docs
- Loading branch information
Showing
19 changed files
with
474 additions
and
52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# [extract] | ||
# max_parallel_items=1 | ||
|
||
[sources.extract] | ||
max_parallel_items=1 | ||
|
||
# [sources.performance_parallel_extract.extract] | ||
# workers=2 | ||
|
||
[sources.performance_parallel_extract.get_details.extract] | ||
workers=2 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# [sources.data_writer] | ||
# file_max_items=20000 | ||
|
||
# pipeline name is default source name when loading resources | ||
[sources.parallel_load.data_writer] | ||
file_max_items=100000 | ||
|
||
[normalize] | ||
workers=3 | ||
|
||
[normalize.data_writer] | ||
disable_compression=false | ||
file_max_items=100000 | ||
|
||
[load] | ||
workers=11 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import os | ||
os.environ["DLT_PROJECT_DIR"] = os.path.dirname(__file__) | ||
# @@@SNIPSTART parallel_config_example | ||
import os | ||
import dlt | ||
from itertools import islice | ||
from dlt.common import pendulum | ||
|
||
@dlt.resource(name="table")
def read_table(limit):
    """Yield up to `limit` synthetic rows in chunks of 1000.

    Each row is a dict with its integer id, a human-readable description and
    the chunk's timestamp. Chunked yielding lets the pipeline buffer and
    rotate files efficiently instead of processing row by row.
    """
    rows = iter(range(limit))
    while item_slice := list(islice(rows, 1000)):
        now = pendulum.now().isoformat()
        # BUG FIX: the description string was missing the `f` prefix, so every
        # row carried the literal text "{_id}" instead of its actual id.
        yield [
            {"row": _id, "description": f"this is row with id {_id}", "timestamp": now}
            for _id in item_slice
        ]
|
||
|
||
# run only in the main process (or under pytest) so that process-pool
# workers importing this module do not execute the pipeline setup again
if __name__ == "__main__" or "PYTEST_CURRENT_TEST" in os.environ:
    pipeline = dlt.pipeline("parallel_load", destination="duckdb", full_refresh=True)
    pipeline.extract(read_table(1000000))
    # expect 11 extracted files: 10 pieces for `table` plus 1 for the state
    resources = pipeline.list_extracted_resources()
    print(resources)
    # normalize and show the row counts
    print(pipeline.normalize(loader_file_format="jsonl"))
    # show the jobs in the load package (10 + 1, as above)
    package_id = pipeline.list_normalized_load_packages()[0]
    print(pipeline.get_load_package_info(package_id))
    print(pipeline.load())
# @@@SNIPEND

assert len(resources) == 11
package = pipeline.get_load_package_info(package_id)
assert len(package.jobs["completed_jobs"]) == 11
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import os | ||
os.environ["DLT_PROJECT_DIR"] = os.path.dirname(__file__) | ||
# @@@SNIPSTART performance_chunking | ||
import dlt | ||
|
||
def get_rows(limit):
    """Generate `limit` row dicts of the form ``{"row": n}`` for n = 0..limit-1."""
    for n in range(limit):
        yield {"row": n}
|
||
@dlt.resource
def database_cursor():
    """Resource that emits every database row as an individual item."""
    for row in get_rows(10000):
        yield row
# @@@SNIPEND | ||
# @@@SNIPSTART performance_chunking_chunk | ||
from itertools import islice | ||
|
||
@dlt.resource
def database_cursor_chunked():
    """Resource that emits rows in chunks of 1000 instead of one at a time."""
    rows = get_rows(10000)
    chunk = list(islice(rows, 1000))
    while chunk:
        print(f"got chunk of length {len(chunk)}")
        yield chunk
        chunk = list(islice(rows, 1000))
# @@@SNIPEND | ||
|
||
assert len(list(database_cursor())) == 10000 | ||
assert len(list(database_cursor_chunked())) == 10000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import os | ||
os.environ["DLT_PROJECT_DIR"] = os.path.dirname(__file__) | ||
# @@@SNIPSTART parallel_extract_callables | ||
from time import sleep
# current_thread is the supported spelling; currentThread is a deprecated
# alias (kept imported for any remaining callers)
from threading import currentThread, current_thread

import dlt
|
||
@dlt.resource
def list_items(start, limit):
    """Yield consecutive integer item ids from `start` up to `start + limit` (exclusive)."""
    for item_id in range(start, start + limit):
        yield item_id
|
||
@dlt.transformer
@dlt.defer
def get_details(item_id):
    """Simulate a slow REST API (0.3 s per item) fetching details for one id.

    Note: the result is *returned*, not yielded — a returned value is computed
    in the worker thread, while yielding would force the generator to be
    evaluated in the main thread.
    """
    sleep(0.3)
    # FIX: threading.currentThread() is a deprecated alias (since Python 3.10);
    # use threading.current_thread() instead — same behavior, no warning.
    print(f"item_id {item_id} in thread {current_thread().name}")
    return {"row": item_id}
|
||
|
||
# evaluate the pipeline and print all the items | ||
# resources are iterators and they are evaluated in the same way in the pipeline.run | ||
print(list(list_items(0, 10) | get_details)) | ||
# @@@SNIPEND | ||
# @@@SNIPSTART parallel_extract_awaitables | ||
import asyncio | ||
|
||
@dlt.transformer
async def a_get_details(item_id):
    """Async variant: await 0.3 s to simulate a slow REST API call per item.

    Returning (rather than yielding) lets the pipeline evaluate the
    awaitables concurrently instead of sequentially in the main thread.
    """
    await asyncio.sleep(0.3)
    # FIX: threading.currentThread() is a deprecated alias (since Python 3.10);
    # use threading.current_thread() instead — same behavior, no warning.
    print(f"item_id {item_id} in thread {current_thread().name}")
    return {"row": item_id}
|
||
|
||
print(list(list_items(0, 10) | a_get_details)) | ||
# @@@SNIPEND |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import os
import pytest

from tests.pipeline.utils import assert_load_info
from docs.snippets.utils import run_snippet, list_snippets

# github-events snippets are deliberately excluded from the run list:
# executing them consumes too many free github API credits
RUN_SNIPPETS = [*list_snippets("reference"), "parallel_load/parallel_load.py"]


# @pytest.mark.parametrize("snippet_name", RUN_SNIPPETS)
# def test_snippet(snippet_name: str) -> None:
#     run_snippet(os.path.join("reference", snippet_name))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.