From 4a5370c56fc40bdda582bb03df977f019fffdd09 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 8 Feb 2024 19:50:17 -0500 Subject: [PATCH 01/27] Refactoring of pipe thread pool and reduce polling --- dlt/extract/concurrency.py | 219 ++++++++++++++++++++++++++++ dlt/extract/pipe.py | 289 +++++++++++++------------------------ dlt/extract/typing.py | 8 + 3 files changed, 325 insertions(+), 191 deletions(-) create mode 100644 dlt/extract/concurrency.py diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py new file mode 100644 index 0000000000..a71b2179d8 --- /dev/null +++ b/dlt/extract/concurrency.py @@ -0,0 +1,219 @@ +import asyncio +from uuid import uuid4 +from asyncio import Future +from concurrent.futures import ( + ThreadPoolExecutor, + as_completed, + wait as wait_for_futures, + FIRST_COMPLETED, +) +from threading import Thread +from typing import List, Awaitable, Callable, Any, Dict, Set, Optional + +from dlt.common.exceptions import PipelineException +from dlt.common.configuration.container import Container +from dlt.common.runtime.signals import sleep +from dlt.extract.typing import DataItemWithMeta, TItemFuture +from dlt.extract.items import ResolvablePipeItem, FuturePipeItem + +from dlt.extract.exceptions import ( + DltSourceException, + ExtractorException, + PipeException, + ResourceExtractionError, +) + + +class WorkerPool: + """Worker pool for pipe items that can be resolved asynchronously. + + Items can be either asyncio coroutines or regular callables which will be executed in a thread pool. 
+ """ + + def __init__( + self, workers: int = 5, poll_interval: float = 0.01, max_parallel_items: int = 20 + ) -> None: + self.futures: Dict[TItemFuture, FuturePipeItem] = {} + self._thread_pool: ThreadPoolExecutor = None + self._async_pool: asyncio.AbstractEventLoop = None + self._async_pool_thread: Thread = None + self.workers = workers + self.poll_interval = poll_interval + self.max_parallel_items = max_parallel_items + self.used_slots: int = 0 + + def __len__(self) -> int: + return len(self.futures) + + @property + def free_slots(self) -> int: + # Done futures don't count as slots, so we can still add futures + return self.max_parallel_items - self.used_slots + + @property + def empty(self) -> bool: + return len(self.futures) == 0 + + def _ensure_thread_pool(self) -> ThreadPoolExecutor: + # lazily start or return thread pool + if self._thread_pool: + return self._thread_pool + + self._thread_pool = ThreadPoolExecutor( + self.workers, thread_name_prefix=Container.thread_pool_prefix() + "threads" + ) + return self._thread_pool + + def _ensure_async_pool(self) -> asyncio.AbstractEventLoop: + # lazily create async pool is separate thread + if self._async_pool: + return self._async_pool + + def start_background_loop(loop: asyncio.AbstractEventLoop) -> None: + asyncio.set_event_loop(loop) + loop.run_forever() + + self._async_pool = asyncio.new_event_loop() + self._async_pool_thread = Thread( + target=start_background_loop, + args=(self._async_pool,), + daemon=True, + name=Container.thread_pool_prefix() + "futures", + ) + self._async_pool_thread.start() + + # start or return async pool + return self._async_pool + + def _vacate_slot(self, _: TItemFuture) -> None: + # Used as callback to free up slot when future is done + self.used_slots -= 1 + + def submit(self, pipe_item: ResolvablePipeItem) -> Optional[TItemFuture]: + """Submit an item to the pool. + + Args: + pipe_item: The item to submit. 
+ + Returns: + The future if the item was successfully submitted, otherwise None. + """ + if self.free_slots == 0: + return None + + if self.free_slots < 0: # TODO: Sanity check during dev, should never happen + raise RuntimeError("Used more slots than max_parallel_items") + + # submit to thread pool or async pool + item = pipe_item.item + if isinstance(item, Awaitable): + future = asyncio.run_coroutine_threadsafe(item, self._ensure_async_pool()) + elif callable(item): + future = self._ensure_thread_pool().submit(item) + else: + raise ValueError(f"Unsupported item type: {type(item)}") + + # Future is not removed from self.futures until it's been consumed by the + # pipe iterator. But we always want to vacate a slot so new jobs can be submitted + future.add_done_callback(self._vacate_slot) + self.used_slots += 1 + + self.futures[future] = FuturePipeItem( + future, pipe_item.step, pipe_item.pipe, pipe_item.meta + ) + return future + + def poll(self) -> None: + sleep(self.poll_interval) + + def _resolve_future(self, future: TItemFuture) -> Optional[ResolvablePipeItem]: + future, step, pipe, meta = self.futures.pop(future) + + if ex := future.exception(): + if isinstance(ex, StopAsyncIteration): + return None + # Raise if any future fails + if isinstance( + ex, (PipelineException, ExtractorException, DltSourceException, PipeException) + ): + raise ex + raise ResourceExtractionError(pipe.name, future, str(ex), "future") from ex + + item = future.result() + + if item is None: + return None + elif isinstance(item, DataItemWithMeta): + return ResolvablePipeItem(item.data, step, pipe, item.meta) + else: + return ResolvablePipeItem(item, step, pipe, meta) + + def resolve_next_future(self) -> Optional[ResolvablePipeItem]: + """Wait for the next future to be done. 
Returns None if no futures done.""" + if not self.futures: + return None + + if (item := self.resolve_next_future_no_wait()) is not None: + return item + for future in as_completed(self.futures): + if future.cancelled(): + # Get the next not-cancelled future + continue + + return self._resolve_future(future) + + return None + + def resolve_next_future_no_wait(self) -> Optional[ResolvablePipeItem]: + """Resolve the first done future in the pool. + This does not block and returns None if no future is done. + """ + # Get next done future + future = next((fut for fut in self.futures if fut.done() and not fut.cancelled()), None) + if not future: + return None + + return self._resolve_future(future) + + def wait_for_free_slot(self) -> None: + """Wait until any future in the pool is completed to ensure there's a free slot.""" + if self.free_slots >= 1: + self.poll() # Just sleep a bit to give other threads a chance + return + + for future in as_completed(self.futures): + if future.cancelled(): + # Get the next not-cancelled future + continue + if self.free_slots == 0: + # Future was already completed so slot was not freed + continue + return # Return when first future completes + + def close(self) -> None: + # Cancel all futures + for f in self.futures: + if not f.done(): + f.cancel() + + def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: + loop.stop() + + if self._async_pool: + # wait for all async generators to be closed + future = asyncio.run_coroutine_threadsafe( + self._async_pool.shutdown_asyncgens(), self._ensure_async_pool() + ) + + wait_for_futures([future]) + self._async_pool.call_soon_threadsafe(stop_background_loop, self._async_pool) + + self._async_pool_thread.join() + self._async_pool = None + self._async_pool_thread = None + + if self._thread_pool: + self._thread_pool.shutdown(wait=True) + self._thread_pool = None + + self.futures.clear() diff --git a/dlt/extract/pipe.py b/dlt/extract/pipe.py index 3062ed083d..67e6ea3cc5 100644 --- 
a/dlt/extract/pipe.py +++ b/dlt/extract/pipe.py @@ -24,6 +24,7 @@ TYPE_CHECKING, Literal, ) +from weakref import WeakKeyDictionary from dlt.common import sleep from dlt.common.configuration import configspec @@ -49,7 +50,13 @@ PipeNotBoundToData, ResourceExtractionError, ) -from dlt.extract.typing import DataItemWithMeta, ItemTransform, SupportsPipe, TPipedDataItems +from dlt.extract.typing import ( + DataItemWithMeta, + ItemTransform, + SupportsPipe, + TPipedDataItems, + TItemFuture, +) from dlt.extract.utils import ( check_compat_transformer, simulate_func_call, @@ -57,41 +64,8 @@ wrap_resource_gen, wrap_async_iterator, ) - -if TYPE_CHECKING: - TItemFuture = Future[Union[TDataItems, DataItemWithMeta]] -else: - TItemFuture = Future - - -class PipeItem(NamedTuple): - item: TDataItems - step: int - pipe: "Pipe" - meta: Any - - -class ResolvablePipeItem(NamedTuple): - # mypy unable to handle recursive types, ResolvablePipeItem should take itself in "item" - item: Union[TPipedDataItems, Iterator[TPipedDataItems]] - step: int - pipe: "Pipe" - meta: Any - - -class FuturePipeItem(NamedTuple): - item: TItemFuture - step: int - pipe: "Pipe" - meta: Any - - -class SourcePipeItem(NamedTuple): - item: Union[Iterator[TPipedDataItems], Iterator[ResolvablePipeItem]] - step: int - pipe: "Pipe" - meta: Any - +from dlt.extract.concurrency import WorkerPool +from dlt.extract.items import PipeItem, ResolvablePipeItem, SourcePipeItem, FuturePipeItem # pipeline step may be iterator of data items or mapping function that returns data item or another iterator from dlt.common.typing import TDataItem @@ -507,17 +481,22 @@ def __init__( sources: List[SourcePipeItem], next_item_mode: TPipeNextItemMode, ) -> None: - self.max_parallel_items = max_parallel_items - self.workers = workers - self.futures_poll_interval = futures_poll_interval - self._async_pool: asyncio.AbstractEventLoop = None - self._async_pool_thread: Thread = None - self._thread_pool: ThreadPoolExecutor = None + # 
self.max_parallel_items = max_parallel_items + # self.workers = workers + # self.futures_poll_interval = futures_poll_interval + # self._async_pool: asyncio.AbstractEventLoop = None + # self._async_pool_thread: Thread = None + # self._thread_pool: ThreadPoolExecutor = None self._sources = sources - self._futures: List[FuturePipeItem] = [] + # self._futures: List[FuturePipeItem] = [] self._next_item_mode: TPipeNextItemMode = next_item_mode self._initial_sources_count = len(sources) self._current_source_index: int = 0 + self._worker_pool = WorkerPool( + workers=workers, + poll_interval=futures_poll_interval, + max_parallel_items=max_parallel_items, + ) @classmethod @with_config(spec=PipeIteratorConfiguration) @@ -597,19 +576,22 @@ def __next__(self) -> PipeItem: while True: # do we need new item? if pipe_item is None: - # process element from the futures - if len(self._futures) > 0: - pipe_item = self._resolve_futures() + # process element from the futures pool if one is ready + # if len(self._worker_pool) > 0: + pipe_item = self._worker_pool.resolve_next_future_no_wait() # if none then take element from the newest source if pipe_item is None: pipe_item = self._get_source_item() if pipe_item is None: - if len(self._futures) == 0 and len(self._sources) == 0: + if self._worker_pool.empty and len(self._sources) == 0: # no more elements in futures or sources raise StopIteration() else: - sleep(self.futures_poll_interval) + # wait for some future to complete + pipe_item = self._worker_pool.resolve_next_future() + + if pipe_item is None: continue item = pipe_item.item @@ -633,22 +615,16 @@ def __next__(self) -> PipeItem: continue if isinstance(item, Awaitable) or callable(item): - # do we have a free slot or one of the slots is done? 
- if len(self._futures) < self.max_parallel_items or self._next_future() >= 0: - # check if Awaitable first - awaitable can also be a callable - if isinstance(item, Awaitable): - future = asyncio.run_coroutine_threadsafe(item, self._ensure_async_pool()) - elif callable(item): - future = self._ensure_thread_pool().submit(item) - # print(future) - self._futures.append(FuturePipeItem(future, pipe_item.step, pipe_item.pipe, pipe_item.meta)) # type: ignore - # pipe item consumed for now, request a new one - pipe_item = None - continue - else: - # print("maximum futures exceeded, waiting") - sleep(self.futures_poll_interval) - # try same item later + # TODO: Duplicated from logic in get_source_item + # The item is a callable that runs in the futures pool + future: Optional[TItemFuture] = None + while future is None: + future = self._worker_pool.submit(pipe_item) # type: ignore[arg-type] + if future is None: + # worker pool is full, wait until a slot becomes free + self._worker_pool.wait_for_free_slot() + else: + pipe_item = None continue # if we are at the end of the pipe then yield element @@ -695,122 +671,6 @@ def __next__(self) -> PipeItem: else: pipe_item = None - def close(self) -> None: - # unregister the pipe name right after execution of gen stopped - unset_current_pipe_name() - - def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: - loop.stop() - - # close all generators - for gen, _, _, _ in self._sources: - if inspect.isgenerator(gen): - gen.close() - self._sources.clear() - - # stop all futures - for f, _, _, _ in self._futures: - if not f.done(): - f.cancel() - - # let tasks cancel - if self._async_pool: - # wait for all async generators to be closed - future = asyncio.run_coroutine_threadsafe( - self._async_pool.shutdown_asyncgens(), self._ensure_async_pool() - ) - while not future.done(): - sleep(self.futures_poll_interval) - self._async_pool.call_soon_threadsafe(stop_background_loop, self._async_pool) - # print("joining thread") - 
self._async_pool_thread.join() - self._async_pool = None - self._async_pool_thread = None - if self._thread_pool: - self._thread_pool.shutdown(wait=True) - self._thread_pool = None - - self._futures.clear() - - def _ensure_async_pool(self) -> asyncio.AbstractEventLoop: - # lazily create async pool is separate thread - if self._async_pool: - return self._async_pool - - def start_background_loop(loop: asyncio.AbstractEventLoop) -> None: - asyncio.set_event_loop(loop) - loop.run_forever() - - self._async_pool = asyncio.new_event_loop() - self._async_pool_thread = Thread( - target=start_background_loop, - args=(self._async_pool,), - daemon=True, - name=Container.thread_pool_prefix() + "futures", - ) - self._async_pool_thread.start() - - # start or return async pool - return self._async_pool - - def _ensure_thread_pool(self) -> ThreadPoolExecutor: - # lazily start or return thread pool - if self._thread_pool: - return self._thread_pool - - self._thread_pool = ThreadPoolExecutor( - self.workers, thread_name_prefix=Container.thread_pool_prefix() + "threads" - ) - return self._thread_pool - - def __enter__(self) -> "PipeIterator": - return self - - def __exit__( - self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType - ) -> None: - self.close() - - def _next_future(self) -> int: - return next((i for i, val in enumerate(self._futures) if val.item.done()), -1) - - def _resolve_futures(self) -> ResolvablePipeItem: - # no futures at all - if len(self._futures) == 0: - return None - - # anything done? 
- idx = self._next_future() - if idx == -1: - # nothing done - return None - - future, step, pipe, meta = self._futures.pop(idx) - - if future.cancelled(): - # get next future - return self._resolve_futures() - - if future.exception(): - ex = future.exception() - if isinstance(ex, StopAsyncIteration): - return None - if isinstance( - ex, (PipelineException, ExtractorException, DltSourceException, PipeException) - ): - raise ex - raise ResourceExtractionError(pipe.name, future, str(ex), "future") from ex - - item = future.result() - - # we also interpret future items that are None to not be value to be consumed - if item is None: - return None - elif isinstance(item, DataItemWithMeta): - return ResolvablePipeItem(item.data, step, pipe, item.meta) - else: - return ResolvablePipeItem(item, step, pipe, meta) - def _get_source_item(self) -> ResolvablePipeItem: sources_count = len(self._sources) # no more sources to iterate @@ -822,34 +682,58 @@ def _get_source_item(self) -> ResolvablePipeItem: # if too many new sources is added we switch to fifo not to exhaust them if ( self._next_item_mode == "fifo" - or (sources_count - self._initial_sources_count) >= self.max_parallel_items + or sources_count - self._initial_sources_count + >= self._worker_pool.max_parallel_items ): self._current_source_index = sources_count - 1 else: self._current_source_index = (self._current_source_index - 1) % sources_count while True: - # if we have checked all sources once and all returned None, then we can sleep a bit + # if we have checked all sources once and all returned None, then lets sleep a bit or wait for a free worker slot if self._current_source_index == first_evaluated_index: - sleep(self.futures_poll_interval) + self._worker_pool.wait_for_free_slot() # get next item from the current source + # gen, step, pipe, meta = self._sources[self._current_source_index] gen, step, pipe, meta = self._sources[self._current_source_index] set_current_pipe_name(pipe.name) - if (item := next(gen)) 
is not None: + + pipe_item = next(gen) + if pipe_item is not None: # full pipe item may be returned, this is used by ForkPipe step # to redirect execution of an item to another pipe - if isinstance(item, ResolvablePipeItem): - return item - else: + # else + if not isinstance(pipe_item, ResolvablePipeItem): # keep the item assigned step and pipe when creating resolvable item - if isinstance(item, DataItemWithMeta): - return ResolvablePipeItem(item.data, step, pipe, item.meta) + if isinstance(pipe_item, DataItemWithMeta): + pipe_item = ResolvablePipeItem( + pipe_item.data, step, pipe, pipe_item.meta + ) else: - return ResolvablePipeItem(item, step, pipe, meta) + pipe_item = ResolvablePipeItem(pipe_item, step, pipe, meta) + + if isinstance(pipe_item.item, Awaitable) or callable(pipe_item.item): + # The item is a callable that runs in the futures pool + future: Optional[TItemFuture] = None + while future is None: + future = self._worker_pool.submit(pipe_item) + if future is None: + # worker pool is full, wait until a slot becomes free + self._worker_pool.wait_for_free_slot() + pipe_item = None + if len(self._worker_pool) >= sources_count: + # Return here so we're not collecting done futures forever + return None + # Otherwhise we continue to the next source + + if pipe_item is not None: + return pipe_item + # remember the first evaluated index if first_evaluated_index is None: first_evaluated_index = self._current_source_index - # always go round robin if None was returned + # always go round robin if None was returned or item is to be run as future self._current_source_index = (self._current_source_index - 1) % sources_count + except StopIteration: # remove empty iterator and try another source self._sources.pop(self._current_source_index) @@ -862,6 +746,29 @@ def _get_source_item(self) -> ResolvablePipeItem: except Exception as ex: raise ResourceExtractionError(pipe.name, gen, str(ex), "generator") from ex + def close(self) -> None: + # unregister the pipe name right 
after execution of gen stopped + unset_current_pipe_name() + + def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: + loop.stop() + + # close all generators + for gen, _, _, _ in self._sources: + if inspect.isgenerator(gen): + gen.close() + self._sources.clear() + + self._worker_pool.close() + + def __enter__(self) -> "PipeIterator": + return self + + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType + ) -> None: + self.close() + @staticmethod def clone_pipes( pipes: Sequence[Pipe], existing_cloned_pairs: Dict[int, Pipe] = None diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index e0096a255f..a91a91f7ee 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -11,7 +11,9 @@ TypeVar, Union, Awaitable, + TYPE_CHECKING, ) +from concurrent.futures import Future from dlt.common.typing import TAny, TDataItem, TDataItems @@ -158,3 +160,9 @@ class ValidateItem(ItemTransform[TDataItem]): def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: self.table_name = pipe.name return self + + +if TYPE_CHECKING: + TItemFuture = Future[Union[TDataItems, DataItemWithMeta]] +else: + TItemFuture = Future From b59a59e2405ca6efa8f16b0393c1e35ef4860228 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 13 Feb 2024 16:15:26 -0500 Subject: [PATCH 02/27] Decorator --- dlt/extract/decorators.py | 75 ++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index d86fd04ef4..0018fef070 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -77,7 +77,8 @@ class SourceSchemaInjectableContext(ContainerInjectableContext): if TYPE_CHECKING: - def __init__(self, schema: Schema = None) -> None: ... + def __init__(self, schema: Schema = None) -> None: + ... 
@configspec @@ -90,7 +91,8 @@ class SourceInjectableContext(ContainerInjectableContext): if TYPE_CHECKING: - def __init__(self, source: DltSource = None) -> None: ... + def __init__(self, source: DltSource = None) -> None: + ... TSourceFunParams = ParamSpec("TSourceFunParams") @@ -110,7 +112,8 @@ def source( schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] -) -> Callable[TSourceFunParams, DltSource]: ... +) -> Callable[TSourceFunParams, DltSource]: + ... @overload @@ -125,7 +128,8 @@ def source( schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] -) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: ... +) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: + ... def source( @@ -273,7 +277,8 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> DltResource: ... +) -> DltResource: + ... @overload @@ -290,7 +295,8 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: ... +) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: + ... @overload @@ -308,7 +314,8 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, -) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: ... +) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: + ... @overload @@ -325,7 +332,8 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> DltResource: ... 
+) -> DltResource: + ... def resource( @@ -343,6 +351,7 @@ def resource( spec: Type[BaseConfiguration] = None, standalone: bool = False, data_from: TUnboundDltResource = None, + parallelize: bool = False, ) -> Any: """When used as a decorator, transforms any generator (yielding) function into a `dlt resource`. When used as a function, it transforms data in `data` argument into a `dlt resource`. @@ -442,6 +451,9 @@ def decorator( if not standalone and callable(name): raise DynamicNameNotStandaloneResource(get_callable_name(f)) + if parallelize: + f = parallel(f) + # resource_section = name if name and not callable(name) else get_callable_name(f) resource_name = name if name and not callable(name) else get_callable_name(f) @@ -537,7 +549,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], DltResource]: ... +) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], DltResource]: + ... @overload @@ -557,7 +570,8 @@ def transformer( ) -> Callable[ [Callable[Concatenate[TDataItem, TResourceFunParams], Any]], Callable[TResourceFunParams, DltResource], -]: ... +]: + ... @overload @@ -573,7 +587,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> DltResource: ... +) -> DltResource: + ... @overload @@ -590,7 +605,8 @@ def transformer( selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, -) -> Callable[TResourceFunParams, DltResource]: ... +) -> Callable[TResourceFunParams, DltResource]: + ... 
def transformer( @@ -736,3 +752,38 @@ def _curry() -> TBoundItems: return _curry return _wrap + + +def parallel(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunParams, Any]: + @wraps(f) + def _wrap(*args, **kwargs): + gen = f(*args, **kwargs) + if inspect.isfunction(gen): + gen = gen() + if inspect.isasyncgen(gen): + raise ValueError("Async generators are not supported with paralellize") + + exhausted = False + busy = False + + def _parallel_gen(): + try: + return next(gen) + except StopIteration: + nonlocal exhausted + exhausted = True + return None + finally: + nonlocal busy + busy = False + + while not exhausted: + try: + while busy: + yield None + busy = True + yield _parallel_gen + except GeneratorExit: + gen.close() + + return _wrap From 0d51fdc92e99419066a0a7e86a84b18d87452518 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 13 Feb 2024 17:48:58 -0500 Subject: [PATCH 03/27] Tests for parallel --- dlt/extract/decorators.py | 9 +- dlt/extract/resource.py | 4 +- tests/pipeline/test_resources_evaluation.py | 207 +++++++++++--------- 3 files changed, 119 insertions(+), 101 deletions(-) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 0018fef070..fc0b9cef09 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -277,6 +277,7 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, + parallelized: bool = False, ) -> DltResource: ... @@ -295,6 +296,7 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, + parallelized: bool = False, ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: ... 
@@ -314,6 +316,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, + parallelized: bool = False, ) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: ... @@ -332,6 +335,7 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, + parallelized: bool = False, ) -> DltResource: ... @@ -351,7 +355,7 @@ def resource( spec: Type[BaseConfiguration] = None, standalone: bool = False, data_from: TUnboundDltResource = None, - parallelize: bool = False, + parallelized: bool = False, ) -> Any: """When used as a decorator, transforms any generator (yielding) function into a `dlt resource`. When used as a function, it transforms data in `data` argument into a `dlt resource`. @@ -451,7 +455,7 @@ def decorator( if not standalone and callable(name): raise DynamicNameNotStandaloneResource(get_callable_name(f)) - if parallelize: + if parallelized: f = parallel(f) # resource_section = name if name and not callable(name) else get_callable_name(f) @@ -785,5 +789,6 @@ def _parallel_gen(): yield _parallel_gen except GeneratorExit: gen.close() + raise return _wrap diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 3d03486436..72723e60ee 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -314,14 +314,12 @@ def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 def _gen_wrap(gen: TPipeStep) -> TPipeStep: """Wrap a generator to take the first `max_items` records""" count = 0 - is_async_gen = False if inspect.isfunction(gen): gen = gen() # wrap async gen already here if isinstance(gen, AsyncIterator): gen = wrap_async_iterator(gen) - is_async_gen = True try: for i in gen: # type: ignore # TODO: help me fix this later @@ -331,7 +329,7 @@ def _gen_wrap(gen: TPipeStep) -> TPipeStep: # async gen yields awaitable so we must count one awaitable more # so the previous one is 
evaluated and yielded. # new awaitable will be cancelled - if count == max_items + int(is_async_gen): + if count == max_items: return finally: if inspect.isgenerator(gen): diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index 7f0a7890a7..f3e2734a14 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -1,4 +1,7 @@ from typing import Any +import time +import threading +import random import dlt, asyncio, pytest, os @@ -212,99 +215,111 @@ async def async_resource1(): assert len(result) == 13 -# @pytest.mark.skip(reason="To be properly implemented in an upcoming PR") -# @pytest.mark.parametrize("parallelized", [True, False]) -# def test_async_decorator_experiment(parallelized) -> None: -# os.environ["EXTRACT__NEXT_ITEM_MODE"] = "fifo" -# execution_order = [] -# threads = set() - -# def parallelize(f) -> Any: -# """converts regular itarable to generator of functions that can be run in parallel in the pipe""" - -# @wraps(f) -# def _wrap(*args: Any, **kwargs: Any) -> Any: -# exhausted = False -# busy = False - -# gen = f(*args, **kwargs) -# # unpack generator -# if inspect.isfunction(gen): -# gen = gen() -# # if we have an async gen, no further action is needed -# if inspect.isasyncgen(gen): -# raise Exception("Already async gen") - -# # get next item from generator -# def _gen(): -# nonlocal exhausted -# # await asyncio.sleep(0.1) -# try: -# return next(gen) -# # on stop iteration mark as exhausted -# except StopIteration: -# exhausted = True -# return None -# finally: -# nonlocal busy -# busy = False - -# try: -# while not exhausted: -# while busy: -# yield None -# busy = True -# yield _gen -# except GeneratorExit: -# # clean up inner generator -# gen.close() - -# return _wrap - -# @parallelize -# def resource1(): -# for l_ in ["a", "b", "c"]: -# time.sleep(0.5) -# nonlocal execution_order -# execution_order.append("one") -# threads.add(threading.get_ident()) 
-# yield {"letter": l_} - -# @parallelize -# def resource2(): -# time.sleep(0.25) -# for l_ in ["e", "f", "g"]: -# time.sleep(0.5) -# nonlocal execution_order -# execution_order.append("two") -# threads.add(threading.get_ident()) -# yield {"letter": l_} - -# @dlt.source -# def source(): -# if parallelized: -# return [resource1(), resource2()] -# else: # return unwrapped resources -# return [resource1.__wrapped__(), resource2.__wrapped__()] - -# pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) -# pipeline_1.run(source()) - -# # all records should be here -# with pipeline_1.sql_client() as c: -# with c.execute_query("SELECT * FROM resource1") as cur: -# rows = list(cur.fetchall()) -# assert len(rows) == 3 -# assert {r[0] for r in rows} == {"a", "b", "c"} - -# with c.execute_query("SELECT * FROM resource2") as cur: -# rows = list(cur.fetchall()) -# assert len(rows) == 3 -# assert {r[0] for r in rows} == {"e", "f", "g"} - -# if parallelized: -# assert len(threads) > 1 -# assert execution_order == ["one", "two", "one", "two", "one", "two"] -# else: -# assert execution_order == ["one", "one", "one", "two", "two", "two"] -# assert len(threads) == 1 +@pytest.mark.parametrize("parallelized", [True, False]) +def test_parallelized_resource(parallelized: bool) -> None: + os.environ["EXTRACT__NEXT_ITEM_MODE"] = "fifo" + execution_order = [] + threads = set() + + @dlt.resource(parallelized=parallelized) + def resource1(): + for l_ in ["a", "b", "c"]: + time.sleep(0.5) + nonlocal execution_order + execution_order.append("one") + threads.add(threading.get_ident()) + yield {"letter": l_} + + @dlt.resource(parallelized=parallelized) + def resource2(): + time.sleep(0.25) + for l_ in ["e", "f", "g"]: + time.sleep(0.5) + nonlocal execution_order + execution_order.append("two") + threads.add(threading.get_ident()) + yield {"letter": l_} + + @dlt.source + def source(): + return [resource1(), resource2()] + + pipeline_1 = dlt.pipeline("pipeline_1", 
destination="duckdb", full_refresh=True) + pipeline_1.run(source()) + + # all records should be here + with pipeline_1.sql_client() as c: + with c.execute_query("SELECT * FROM resource1") as cur: + rows = list(cur.fetchall()) + assert len(rows) == 3 + assert {r[0] for r in rows} == {"a", "b", "c"} + + with c.execute_query("SELECT * FROM resource2") as cur: + rows = list(cur.fetchall()) + assert len(rows) == 3 + assert {r[0] for r in rows} == {"e", "f", "g"} + + if parallelized: + assert len(threads) > 1 + assert execution_order == ["one", "two", "one", "two", "one", "two"] + else: + assert execution_order == ["one", "one", "one", "two", "two", "two"] + assert len(threads) == 1 + + +# Parametrize with different resource counts to excersize the worker pool: +# 1. More than number of workers +# 2. 1 resource only +# 3. Exact number of workers +# 4. More than future pool max size +# 5. Exact future pool max size +@pytest.mark.parametrize("n_resources", [8, 1, 5, 25, 20]) +def test_parallelized_resource_extract_order(n_resources: int) -> None: + os.environ["EXTRACT__NEXT_ITEM_MODE"] = "fifo" + + item_counts = [random.randrange(10, 30) for _ in range(n_resources)] + item_ranges = [] # Create numeric ranges that each resource will yield + # Use below to check the extraction order + for i, n_items in enumerate(item_counts): + if i == 0: + start_range = 0 + else: + start_range = sum(item_counts[:i]) + end_range = start_range + n_items + item_ranges.append(range(start_range, end_range)) + + def _sleep_duration() -> float: + # Sleep for random duration each yield + return random.uniform(0.005, 0.012) + + @dlt.source + def some_source(): + def some_data(resource_num: int): + for item in item_ranges[resource_num]: + print(f"RESOURCE {resource_num}") + # Sleep for a random duration each yield + time.sleep(random.uniform(0.005, 0.012)) + yield f"item-{item}" + print(f"RESOURCE {resource_num}:", item) + + for i in range(n_resources): + yield dlt.resource(some_data, 
name=f"some_data_{i}", parallelized=True)(i) + + source = some_source() + result = list(source) + result = [int(item.split("-")[1]) for item in result] + + assert len(result) == sum(item_counts) + + # Check extracted results from each resource + chunked_results = [] + start_range = 0 + for item_range in item_ranges: + chunked_results.append([item for item in result if item in item_range]) + + for i, chunk in enumerate(chunked_results): + # All items are included + assert len(chunk) == item_counts[i] + assert len(set(chunk)) == len(chunk) + # Items are extracted in order per resource + assert chunk == sorted(chunk) From 0c734732e995bc3d8283d6d18f5cf573ad46d35a Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 13 Feb 2024 17:50:31 -0500 Subject: [PATCH 04/27] Include items module --- dlt/extract/items.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 dlt/extract/items.py diff --git a/dlt/extract/items.py b/dlt/extract/items.py new file mode 100644 index 0000000000..43e8fda8d5 --- /dev/null +++ b/dlt/extract/items.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Iterator, NamedTuple, Union, Optional +from concurrent.futures import Future + +from dlt.common.typing import TDataItems +from dlt.extract.typing import TPipedDataItems, DataItemWithMeta, TItemFuture + +if TYPE_CHECKING: + from dlt.extract.pipe import Pipe + + +class PipeItem(NamedTuple): + item: TDataItems + step: int + pipe: "Pipe" + meta: Any + + +class ResolvablePipeItem(NamedTuple): + # mypy unable to handle recursive types, ResolvablePipeItem should take itself in "item" + item: Union[TPipedDataItems, Iterator[TPipedDataItems]] + step: int + pipe: "Pipe" + meta: Any + + +class FuturePipeItem(NamedTuple): + item: TItemFuture + step: int + pipe: "Pipe" + meta: Any + + +class SourcePipeItem(NamedTuple): + item: Union[Iterator[TPipedDataItems], Iterator[ResolvablePipeItem]] + step: int + pipe: "Pipe" + 
meta: Any From 4fe65078a350ef1c229a28767c3a55fd3809fada Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 13 Feb 2024 17:52:22 -0500 Subject: [PATCH 05/27] Names --- dlt/extract/decorators.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index fc0b9cef09..17734d1a1a 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -456,7 +456,7 @@ def decorator( raise DynamicNameNotStandaloneResource(get_callable_name(f)) if parallelized: - f = parallel(f) + f = parallelize(f) # resource_section = name if name and not callable(name) else get_callable_name(f) resource_name = name if name and not callable(name) else get_callable_name(f) @@ -758,9 +758,9 @@ def _curry() -> TBoundItems: return _wrap -def parallel(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunParams, Any]: +def parallelize(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunParams, Any]: @wraps(f) - def _wrap(*args, **kwargs): + def _wrap(*args: Any, **kwargs: Any) -> Any: # TODO: Type correctly gen = f(*args, **kwargs) if inspect.isfunction(gen): gen = gen() @@ -770,7 +770,7 @@ def _wrap(*args, **kwargs): exhausted = False busy = False - def _parallel_gen(): + def _parallel_gen() -> Any: # TODO: Type correctly try: return next(gen) except StopIteration: From fbe4d7f29195be861f7eb2724b03454811151cfc Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Wed, 14 Feb 2024 13:33:24 -0500 Subject: [PATCH 06/27] Separate modules for pipe and pipe_iterator --- dlt/extract/extract.py | 2 +- dlt/extract/pipe.py | 423 +---------------------------- dlt/extract/pipe_iterator.py | 409 ++++++++++++++++++++++++++++ dlt/extract/resource.py | 3 +- dlt/extract/source.py | 3 +- dlt/reflection/script_inspector.py | 2 +- tests/extract/test_extract_pipe.py | 3 +- 7 files changed, 420 insertions(+), 425 deletions(-) create mode 100644 dlt/extract/pipe_iterator.py diff --git a/dlt/extract/extract.py 
b/dlt/extract/extract.py index c1ff5da80b..cdd61403a7 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -33,7 +33,7 @@ from dlt.extract.decorators import SourceInjectableContext, SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints -from dlt.extract.pipe import PipeIterator +from dlt.extract.pipe_iterator import PipeIterator from dlt.extract.source import DltSource from dlt.extract.resource import DltResource from dlt.extract.storage import ExtractStorage diff --git a/dlt/extract/pipe.py b/dlt/extract/pipe.py index 67e6ea3cc5..dd1f3e4ec5 100644 --- a/dlt/extract/pipe.py +++ b/dlt/extract/pipe.py @@ -1,61 +1,23 @@ import inspect -import types -import asyncio import makefun -from asyncio import Future -from concurrent.futures import ThreadPoolExecutor from copy import copy -from threading import Thread -from typing import ( - Any, - AsyncIterator, - Dict, - Optional, - Sequence, - Union, - Callable, - Iterable, - Iterator, - List, - NamedTuple, - Awaitable, - Tuple, - Type, - TYPE_CHECKING, - Literal, -) -from weakref import WeakKeyDictionary - -from dlt.common import sleep -from dlt.common.configuration import configspec -from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import BaseConfiguration, ContainerInjectableContext -from dlt.common.configuration.container import Container -from dlt.common.exceptions import PipelineException -from dlt.common.source import unset_current_pipe_name, set_current_pipe_name +from typing import Any, AsyncIterator, Optional, Union, Callable, Iterable, Iterator, List, Tuple + from dlt.common.typing import AnyFun, AnyType, TDataItems from dlt.common.utils import get_callable_name from dlt.extract.exceptions import ( CreatePipeException, - DltSourceException, - ExtractorException, InvalidStepFunctionArguments, InvalidResourceDataTypeFunctionNotAGenerator, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, - 
PipeException, - PipeGenInvalid, - PipeItemProcessingError, PipeNotBoundToData, - ResourceExtractionError, ) from dlt.extract.typing import ( - DataItemWithMeta, ItemTransform, SupportsPipe, TPipedDataItems, - TItemFuture, ) from dlt.extract.utils import ( check_compat_transformer, @@ -64,12 +26,9 @@ wrap_resource_gen, wrap_async_iterator, ) -from dlt.extract.concurrency import WorkerPool -from dlt.extract.items import PipeItem, ResolvablePipeItem, SourcePipeItem, FuturePipeItem +from dlt.extract.items import ResolvablePipeItem # pipeline step may be iterator of data items or mapping function that returns data item or another iterator -from dlt.common.typing import TDataItem - TPipeStep = Union[ Iterable[TPipedDataItems], Iterator[TPipedDataItems], @@ -83,8 +42,6 @@ Callable[[TDataItems], Iterator[ResolvablePipeItem]], ] -TPipeNextItemMode = Literal["fifo", "round_robin"] - class ForkPipe: def __init__(self, pipe: "Pipe", step: int = -1, copy_on_fork: bool = False) -> None: @@ -460,377 +417,3 @@ def __repr__(self) -> str: else: bound_str = "" return f"Pipe {self.name} [steps: {len(self._steps)}] at {id(self)}{bound_str}" - - -class PipeIterator(Iterator[PipeItem]): - @configspec - class PipeIteratorConfiguration(BaseConfiguration): - max_parallel_items: int = 20 - workers: int = 5 - futures_poll_interval: float = 0.01 - copy_on_fork: bool = False - next_item_mode: str = "fifo" - - __section__ = "extract" - - def __init__( - self, - max_parallel_items: int, - workers: int, - futures_poll_interval: float, - sources: List[SourcePipeItem], - next_item_mode: TPipeNextItemMode, - ) -> None: - # self.max_parallel_items = max_parallel_items - # self.workers = workers - # self.futures_poll_interval = futures_poll_interval - # self._async_pool: asyncio.AbstractEventLoop = None - # self._async_pool_thread: Thread = None - # self._thread_pool: ThreadPoolExecutor = None - self._sources = sources - # self._futures: List[FuturePipeItem] = [] - self._next_item_mode: 
TPipeNextItemMode = next_item_mode - self._initial_sources_count = len(sources) - self._current_source_index: int = 0 - self._worker_pool = WorkerPool( - workers=workers, - poll_interval=futures_poll_interval, - max_parallel_items=max_parallel_items, - ) - - @classmethod - @with_config(spec=PipeIteratorConfiguration) - def from_pipe( - cls, - pipe: Pipe, - *, - max_parallel_items: int = 20, - workers: int = 5, - futures_poll_interval: float = 0.01, - next_item_mode: TPipeNextItemMode = "fifo", - ) -> "PipeIterator": - # join all dependent pipes - if pipe.parent: - pipe = pipe.full_pipe() - # clone pipe to allow multiple iterations on pipe based on iterables/callables - pipe = pipe._clone() - # head must be iterator - pipe.evaluate_gen() - if not isinstance(pipe.gen, Iterator): - raise PipeGenInvalid(pipe.name, pipe.gen) - - # create extractor - sources = [SourcePipeItem(pipe.gen, 0, pipe, None)] - return cls(max_parallel_items, workers, futures_poll_interval, sources, next_item_mode) - - @classmethod - @with_config(spec=PipeIteratorConfiguration) - def from_pipes( - cls, - pipes: Sequence[Pipe], - yield_parents: bool = True, - *, - max_parallel_items: int = 20, - workers: int = 5, - futures_poll_interval: float = 0.01, - copy_on_fork: bool = False, - next_item_mode: TPipeNextItemMode = "fifo", - ) -> "PipeIterator": - # print(f"max_parallel_items: {max_parallel_items} workers: {workers}") - sources: List[SourcePipeItem] = [] - - # clone all pipes before iterating (recursively) as we will fork them (this add steps) and evaluate gens - pipes, _ = PipeIterator.clone_pipes(pipes) - - def _fork_pipeline(pipe: Pipe) -> None: - if pipe.parent: - # fork the parent pipe - pipe.evaluate_gen() - pipe.parent.fork(pipe, copy_on_fork=copy_on_fork) - # make the parent yield by sending a clone of item to itself with position at the end - if yield_parents and pipe.parent in pipes: - # fork is last step of the pipe so it will yield - pipe.parent.fork(pipe.parent, len(pipe.parent) - 
1, copy_on_fork=copy_on_fork) - _fork_pipeline(pipe.parent) - else: - # head of independent pipe must be iterator - pipe.evaluate_gen() - if not isinstance(pipe.gen, Iterator): - raise PipeGenInvalid(pipe.name, pipe.gen) - # add every head as source only once - if not any(i.pipe == pipe for i in sources): - sources.append(SourcePipeItem(pipe.gen, 0, pipe, None)) - - # reverse pipes for current mode, as we start processing from the back - pipes.reverse() - for pipe in pipes: - _fork_pipeline(pipe) - - # create extractor - return cls(max_parallel_items, workers, futures_poll_interval, sources, next_item_mode) - - def __next__(self) -> PipeItem: - pipe_item: Union[ResolvablePipeItem, SourcePipeItem] = None - # __next__ should call itself to remove the `while` loop and continue clauses but that may lead to stack overflows: there's no tail recursion opt in python - # https://stackoverflow.com/questions/13591970/does-python-optimize-tail-recursion (see Y combinator on how it could be emulated) - while True: - # do we need new item? 
- if pipe_item is None: - # process element from the futures pool if one is ready - # if len(self._worker_pool) > 0: - pipe_item = self._worker_pool.resolve_next_future_no_wait() - # if none then take element from the newest source - if pipe_item is None: - pipe_item = self._get_source_item() - - if pipe_item is None: - if self._worker_pool.empty and len(self._sources) == 0: - # no more elements in futures or sources - raise StopIteration() - else: - # wait for some future to complete - pipe_item = self._worker_pool.resolve_next_future() - - if pipe_item is None: - continue - - item = pipe_item.item - # if item is iterator, then add it as a new source - if isinstance(item, Iterator): - # print(f"adding iterable {item}") - self._sources.append( - SourcePipeItem(item, pipe_item.step, pipe_item.pipe, pipe_item.meta) - ) - pipe_item = None - continue - - # handle async iterator items as new source - if isinstance(item, AsyncIterator): - self._sources.append( - SourcePipeItem( - wrap_async_iterator(item), pipe_item.step, pipe_item.pipe, pipe_item.meta - ), - ) - pipe_item = None - continue - - if isinstance(item, Awaitable) or callable(item): - # TODO: Duplicated from logic in get_source_item - # The item is a callable that runs in the futures pool - future: Optional[TItemFuture] = None - while future is None: - future = self._worker_pool.submit(pipe_item) # type: ignore[arg-type] - if future is None: - # worker pool is full, wait until a slot becomes free - self._worker_pool.wait_for_free_slot() - else: - pipe_item = None - continue - - # if we are at the end of the pipe then yield element - if pipe_item.step == len(pipe_item.pipe) - 1: - # must be resolved - if isinstance(item, (Iterator, Awaitable, AsyncIterator)) or callable(item): - raise PipeItemProcessingError( - pipe_item.pipe.name, - f"Pipe item at step {pipe_item.step} was not fully evaluated and is of type" - f" {type(pipe_item.item).__name__}. 
This is internal error or you are" - " yielding something weird from resources ie. functions or awaitables.", - ) - # mypy not able to figure out that item was resolved - return pipe_item # type: ignore - - # advance to next step - step = pipe_item.pipe[pipe_item.step + 1] - try: - set_current_pipe_name(pipe_item.pipe.name) - next_meta = pipe_item.meta - next_item = step(item, meta=pipe_item.meta) # type: ignore - if isinstance(next_item, DataItemWithMeta): - next_meta = next_item.meta - next_item = next_item.data - except TypeError as ty_ex: - assert callable(step) - raise InvalidStepFunctionArguments( - pipe_item.pipe.name, - get_callable_name(step), - inspect.signature(step), - str(ty_ex), - ) - except (PipelineException, ExtractorException, DltSourceException, PipeException): - raise - except Exception as ex: - raise ResourceExtractionError( - pipe_item.pipe.name, step, str(ex), "transform" - ) from ex - # create next pipe item if a value was returned. A None means that item was consumed/filtered out and should not be further processed - if next_item is not None: - pipe_item = ResolvablePipeItem( - next_item, pipe_item.step + 1, pipe_item.pipe, next_meta - ) - else: - pipe_item = None - - def _get_source_item(self) -> ResolvablePipeItem: - sources_count = len(self._sources) - # no more sources to iterate - if sources_count == 0: - return None - try: - first_evaluated_index: int = None - # always reset to end of list for fifo mode, also take into account that new sources can be added - # if too many new sources is added we switch to fifo not to exhaust them - if ( - self._next_item_mode == "fifo" - or sources_count - self._initial_sources_count - >= self._worker_pool.max_parallel_items - ): - self._current_source_index = sources_count - 1 - else: - self._current_source_index = (self._current_source_index - 1) % sources_count - while True: - # if we have checked all sources once and all returned None, then lets sleep a bit or wait for a free worker slot - if 
self._current_source_index == first_evaluated_index: - self._worker_pool.wait_for_free_slot() - # get next item from the current source - # gen, step, pipe, meta = self._sources[self._current_source_index] - gen, step, pipe, meta = self._sources[self._current_source_index] - set_current_pipe_name(pipe.name) - - pipe_item = next(gen) - if pipe_item is not None: - # full pipe item may be returned, this is used by ForkPipe step - # to redirect execution of an item to another pipe - # else - if not isinstance(pipe_item, ResolvablePipeItem): - # keep the item assigned step and pipe when creating resolvable item - if isinstance(pipe_item, DataItemWithMeta): - pipe_item = ResolvablePipeItem( - pipe_item.data, step, pipe, pipe_item.meta - ) - else: - pipe_item = ResolvablePipeItem(pipe_item, step, pipe, meta) - - if isinstance(pipe_item.item, Awaitable) or callable(pipe_item.item): - # The item is a callable that runs in the futures pool - future: Optional[TItemFuture] = None - while future is None: - future = self._worker_pool.submit(pipe_item) - if future is None: - # worker pool is full, wait until a slot becomes free - self._worker_pool.wait_for_free_slot() - pipe_item = None - if len(self._worker_pool) >= sources_count: - # Return here so we're not collecting done futures forever - return None - # Otherwhise we continue to the next source - - if pipe_item is not None: - return pipe_item - - # remember the first evaluated index - if first_evaluated_index is None: - first_evaluated_index = self._current_source_index - # always go round robin if None was returned or item is to be run as future - self._current_source_index = (self._current_source_index - 1) % sources_count - - except StopIteration: - # remove empty iterator and try another source - self._sources.pop(self._current_source_index) - # decrease initial source count if we popped an initial source - if self._current_source_index < self._initial_sources_count: - self._initial_sources_count -= 1 - return 
self._get_source_item() - except (PipelineException, ExtractorException, DltSourceException, PipeException): - raise - except Exception as ex: - raise ResourceExtractionError(pipe.name, gen, str(ex), "generator") from ex - - def close(self) -> None: - # unregister the pipe name right after execution of gen stopped - unset_current_pipe_name() - - def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: - loop.stop() - - # close all generators - for gen, _, _, _ in self._sources: - if inspect.isgenerator(gen): - gen.close() - self._sources.clear() - - self._worker_pool.close() - - def __enter__(self) -> "PipeIterator": - return self - - def __exit__( - self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType - ) -> None: - self.close() - - @staticmethod - def clone_pipes( - pipes: Sequence[Pipe], existing_cloned_pairs: Dict[int, Pipe] = None - ) -> Tuple[List[Pipe], Dict[int, Pipe]]: - """This will clone pipes and fix the parent/dependent references""" - cloned_pipes = [p._clone() for p in pipes if id(p) not in (existing_cloned_pairs or {})] - cloned_pairs = {id(p): c for p, c in zip(pipes, cloned_pipes)} - if existing_cloned_pairs: - cloned_pairs.update(existing_cloned_pairs) - - for clone in cloned_pipes: - while True: - if not clone.parent: - break - # if already a clone - if clone.parent in cloned_pairs.values(): - break - # clone if parent pipe not yet cloned - parent_id = id(clone.parent) - if parent_id not in cloned_pairs: - # print("cloning:" + clone.parent.name) - cloned_pairs[parent_id] = clone.parent._clone() - # replace with clone - # print(f"replace depends on {clone.name} to {clone.parent.name}") - clone.parent = cloned_pairs[parent_id] - # recur with clone - clone = clone.parent - - return cloned_pipes, cloned_pairs - - -class ManagedPipeIterator(PipeIterator): - """A version of the pipe iterator that gets closed automatically on an exception in _next_""" - - _ctx: List[ContainerInjectableContext] = None - 
_container: Container = None - - def set_context(self, ctx: List[ContainerInjectableContext]) -> None: - """Sets list of injectable contexts that will be injected into Container for each call to __next__""" - self._ctx = ctx - self._container = Container() - - def __next__(self) -> PipeItem: - if self._ctx: - managers = [self._container.injectable_context(ctx) for ctx in self._ctx] - for manager in managers: - manager.__enter__() - try: - item = super().__next__() - except Exception as ex: - # release context manager - if self._ctx: - if isinstance(ex, StopIteration): - for manager in managers: - manager.__exit__(None, None, None) - else: - for manager in managers: - manager.__exit__(type(ex), ex, None) - # crash in next - self.close() - raise - if self._ctx: - for manager in managers: - manager.__exit__(None, None, None) - return item diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py new file mode 100644 index 0000000000..b494d45709 --- /dev/null +++ b/dlt/extract/pipe_iterator.py @@ -0,0 +1,409 @@ +import inspect +import types +import asyncio +from typing import ( + AsyncIterator, + Dict, + Optional, + Sequence, + Union, + Iterator, + List, + Awaitable, + Tuple, + Type, + Literal, +) + +from dlt.common.configuration import configspec +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import BaseConfiguration, ContainerInjectableContext +from dlt.common.configuration.container import Container +from dlt.common.exceptions import PipelineException +from dlt.common.source import unset_current_pipe_name, set_current_pipe_name +from dlt.common.utils import get_callable_name + +from dlt.extract.exceptions import ( + DltSourceException, + ExtractorException, + InvalidStepFunctionArguments, + PipeException, + PipeGenInvalid, + PipeItemProcessingError, + ResourceExtractionError, +) +from dlt.extract.pipe import Pipe +from dlt.extract.typing import DataItemWithMeta, TItemFuture +from dlt.extract.utils import 
wrap_async_iterator +from dlt.extract.concurrency import WorkerPool +from dlt.extract.items import PipeItem, ResolvablePipeItem, SourcePipeItem + + +TPipeNextItemMode = Literal["fifo", "round_robin"] + + +class PipeIterator(Iterator[PipeItem]): + @configspec + class PipeIteratorConfiguration(BaseConfiguration): + max_parallel_items: int = 20 + workers: int = 5 + futures_poll_interval: float = 0.01 + copy_on_fork: bool = False + next_item_mode: str = "fifo" + + __section__ = "extract" + + def __init__( + self, + max_parallel_items: int, + workers: int, + futures_poll_interval: float, + sources: List[SourcePipeItem], + next_item_mode: TPipeNextItemMode, + ) -> None: + self._sources = sources + self._next_item_mode: TPipeNextItemMode = next_item_mode + self._initial_sources_count = len(sources) + self._current_source_index: int = 0 + self._worker_pool = WorkerPool( + workers=workers, + poll_interval=futures_poll_interval, + max_parallel_items=max_parallel_items, + ) + + @classmethod + @with_config(spec=PipeIteratorConfiguration) + def from_pipe( + cls, + pipe: Pipe, + *, + max_parallel_items: int = 20, + workers: int = 5, + futures_poll_interval: float = 0.01, + next_item_mode: TPipeNextItemMode = "fifo", + ) -> "PipeIterator": + # join all dependent pipes + if pipe.parent: + pipe = pipe.full_pipe() + # clone pipe to allow multiple iterations on pipe based on iterables/callables + pipe = pipe._clone() + # head must be iterator + pipe.evaluate_gen() + if not isinstance(pipe.gen, Iterator): + raise PipeGenInvalid(pipe.name, pipe.gen) + + # create extractor + sources = [SourcePipeItem(pipe.gen, 0, pipe, None)] + return cls(max_parallel_items, workers, futures_poll_interval, sources, next_item_mode) + + @classmethod + @with_config(spec=PipeIteratorConfiguration) + def from_pipes( + cls, + pipes: Sequence[Pipe], + yield_parents: bool = True, + *, + max_parallel_items: int = 20, + workers: int = 5, + futures_poll_interval: float = 0.01, + copy_on_fork: bool = False, + 
next_item_mode: TPipeNextItemMode = "fifo", + ) -> "PipeIterator": + # print(f"max_parallel_items: {max_parallel_items} workers: {workers}") + sources: List[SourcePipeItem] = [] + + # clone all pipes before iterating (recursively) as we will fork them (this add steps) and evaluate gens + pipes, _ = PipeIterator.clone_pipes(pipes) + + def _fork_pipeline(pipe: Pipe) -> None: + if pipe.parent: + # fork the parent pipe + pipe.evaluate_gen() + pipe.parent.fork(pipe, copy_on_fork=copy_on_fork) + # make the parent yield by sending a clone of item to itself with position at the end + if yield_parents and pipe.parent in pipes: + # fork is last step of the pipe so it will yield + pipe.parent.fork(pipe.parent, len(pipe.parent) - 1, copy_on_fork=copy_on_fork) + _fork_pipeline(pipe.parent) + else: + # head of independent pipe must be iterator + pipe.evaluate_gen() + if not isinstance(pipe.gen, Iterator): + raise PipeGenInvalid(pipe.name, pipe.gen) + # add every head as source only once + if not any(i.pipe == pipe for i in sources): + sources.append(SourcePipeItem(pipe.gen, 0, pipe, None)) + + # reverse pipes for current mode, as we start processing from the back + pipes.reverse() + for pipe in pipes: + _fork_pipeline(pipe) + + # create extractor + return cls(max_parallel_items, workers, futures_poll_interval, sources, next_item_mode) + + def __next__(self) -> PipeItem: + pipe_item: Union[ResolvablePipeItem, SourcePipeItem] = None + # __next__ should call itself to remove the `while` loop and continue clauses but that may lead to stack overflows: there's no tail recursion opt in python + # https://stackoverflow.com/questions/13591970/does-python-optimize-tail-recursion (see Y combinator on how it could be emulated) + while True: + # do we need new item? 
+ if pipe_item is None: + # process element from the futures pool if one is ready + # if len(self._worker_pool) > 0: + pipe_item = self._worker_pool.resolve_next_future_no_wait() + # if none then take element from the newest source + if pipe_item is None: + pipe_item = self._get_source_item() + + if pipe_item is None: + if self._worker_pool.empty and len(self._sources) == 0: + # no more elements in futures or sources + raise StopIteration() + else: + # wait for some future to complete + pipe_item = self._worker_pool.resolve_next_future() + + if pipe_item is None: + continue + + item = pipe_item.item + # if item is iterator, then add it as a new source + if isinstance(item, Iterator): + # print(f"adding iterable {item}") + self._sources.append( + SourcePipeItem(item, pipe_item.step, pipe_item.pipe, pipe_item.meta) + ) + pipe_item = None + continue + + # handle async iterator items as new source + if isinstance(item, AsyncIterator): + self._sources.append( + SourcePipeItem( + wrap_async_iterator(item), pipe_item.step, pipe_item.pipe, pipe_item.meta + ), + ) + pipe_item = None + continue + + if isinstance(item, Awaitable) or callable(item): + # TODO: Duplicated from logic in get_source_item + # The item is a callable that runs in the futures pool + future: Optional[TItemFuture] = None + while future is None: + future = self._worker_pool.submit(pipe_item) # type: ignore[arg-type] + if future is None: + # worker pool is full, wait until a slot becomes free + self._worker_pool.wait_for_free_slot() + else: + pipe_item = None + continue + + # if we are at the end of the pipe then yield element + if pipe_item.step == len(pipe_item.pipe) - 1: + # must be resolved + if isinstance(item, (Iterator, Awaitable, AsyncIterator)) or callable(item): + raise PipeItemProcessingError( + pipe_item.pipe.name, + f"Pipe item at step {pipe_item.step} was not fully evaluated and is of type" + f" {type(pipe_item.item).__name__}. 
This is internal error or you are" + " yielding something weird from resources ie. functions or awaitables.", + ) + # mypy not able to figure out that item was resolved + return pipe_item # type: ignore + + # advance to next step + step = pipe_item.pipe[pipe_item.step + 1] + try: + set_current_pipe_name(pipe_item.pipe.name) + next_meta = pipe_item.meta + next_item = step(item, meta=pipe_item.meta) # type: ignore + if isinstance(next_item, DataItemWithMeta): + next_meta = next_item.meta + next_item = next_item.data + except TypeError as ty_ex: + assert callable(step) + raise InvalidStepFunctionArguments( + pipe_item.pipe.name, + get_callable_name(step), + inspect.signature(step), + str(ty_ex), + ) + except (PipelineException, ExtractorException, DltSourceException, PipeException): + raise + except Exception as ex: + raise ResourceExtractionError( + pipe_item.pipe.name, step, str(ex), "transform" + ) from ex + # create next pipe item if a value was returned. A None means that item was consumed/filtered out and should not be further processed + if next_item is not None: + pipe_item = ResolvablePipeItem( + next_item, pipe_item.step + 1, pipe_item.pipe, next_meta + ) + else: + pipe_item = None + + def _get_source_item(self) -> ResolvablePipeItem: + sources_count = len(self._sources) + # no more sources to iterate + if sources_count == 0: + return None + try: + first_evaluated_index: int = None + # always reset to end of list for fifo mode, also take into account that new sources can be added + # if too many new sources is added we switch to fifo not to exhaust them + if ( + self._next_item_mode == "fifo" + or sources_count - self._initial_sources_count + >= self._worker_pool.max_parallel_items + ): + self._current_source_index = sources_count - 1 + else: + self._current_source_index = (self._current_source_index - 1) % sources_count + while True: + # if we have checked all sources once and all returned None, then lets sleep a bit or wait for a free worker slot + if 
self._current_source_index == first_evaluated_index: + self._worker_pool.wait_for_free_slot() + # get next item from the current source + # gen, step, pipe, meta = self._sources[self._current_source_index] + gen, step, pipe, meta = self._sources[self._current_source_index] + set_current_pipe_name(pipe.name) + + pipe_item = next(gen) + if pipe_item is not None: + # full pipe item may be returned, this is used by ForkPipe step + # to redirect execution of an item to another pipe + # else + if not isinstance(pipe_item, ResolvablePipeItem): + # keep the item assigned step and pipe when creating resolvable item + if isinstance(pipe_item, DataItemWithMeta): + pipe_item = ResolvablePipeItem( + pipe_item.data, step, pipe, pipe_item.meta + ) + else: + pipe_item = ResolvablePipeItem(pipe_item, step, pipe, meta) + + if isinstance(pipe_item.item, Awaitable) or callable(pipe_item.item): + # The item is a callable that runs in the futures pool + future: Optional[TItemFuture] = None + while future is None: + future = self._worker_pool.submit(pipe_item) + if future is None: + # worker pool is full, wait until a slot becomes free + self._worker_pool.wait_for_free_slot() + pipe_item = None + if len(self._worker_pool) >= sources_count: + # Return here so we're not collecting done futures forever + return None + # Otherwhise we continue to the next source + + if pipe_item is not None: + return pipe_item + + # remember the first evaluated index + if first_evaluated_index is None: + first_evaluated_index = self._current_source_index + # always go round robin if None was returned or item is to be run as future + self._current_source_index = (self._current_source_index - 1) % sources_count + + except StopIteration: + # remove empty iterator and try another source + self._sources.pop(self._current_source_index) + # decrease initial source count if we popped an initial source + if self._current_source_index < self._initial_sources_count: + self._initial_sources_count -= 1 + return 
self._get_source_item() + except (PipelineException, ExtractorException, DltSourceException, PipeException): + raise + except Exception as ex: + raise ResourceExtractionError(pipe.name, gen, str(ex), "generator") from ex + + def close(self) -> None: + # unregister the pipe name right after execution of gen stopped + unset_current_pipe_name() + + def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: + loop.stop() + + # close all generators + for gen, _, _, _ in self._sources: + if inspect.isgenerator(gen): + gen.close() + self._sources.clear() + + self._worker_pool.close() + + def __enter__(self) -> "PipeIterator": + return self + + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType + ) -> None: + self.close() + + @staticmethod + def clone_pipes( + pipes: Sequence[Pipe], existing_cloned_pairs: Dict[int, Pipe] = None + ) -> Tuple[List[Pipe], Dict[int, Pipe]]: + """This will clone pipes and fix the parent/dependent references""" + cloned_pipes = [p._clone() for p in pipes if id(p) not in (existing_cloned_pairs or {})] + cloned_pairs = {id(p): c for p, c in zip(pipes, cloned_pipes)} + if existing_cloned_pairs: + cloned_pairs.update(existing_cloned_pairs) + + for clone in cloned_pipes: + while True: + if not clone.parent: + break + # if already a clone + if clone.parent in cloned_pairs.values(): + break + # clone if parent pipe not yet cloned + parent_id = id(clone.parent) + if parent_id not in cloned_pairs: + # print("cloning:" + clone.parent.name) + cloned_pairs[parent_id] = clone.parent._clone() + # replace with clone + # print(f"replace depends on {clone.name} to {clone.parent.name}") + clone.parent = cloned_pairs[parent_id] + # recur with clone + clone = clone.parent + + return cloned_pipes, cloned_pairs + + +class ManagedPipeIterator(PipeIterator): + """A version of the pipe iterator that gets closed automatically on an exception in _next_""" + + _ctx: List[ContainerInjectableContext] = None + 
_container: Container = None + + def set_context(self, ctx: List[ContainerInjectableContext]) -> None: + """Sets list of injectable contexts that will be injected into Container for each call to __next__""" + self._ctx = ctx + self._container = Container() + + def __next__(self) -> PipeItem: + if self._ctx: + managers = [self._container.injectable_context(ctx) for ctx in self._ctx] + for manager in managers: + manager.__enter__() + try: + item = super().__next__() + except Exception as ex: + # release context manager + if self._ctx: + if isinstance(ex, StopIteration): + for manager in managers: + manager.__exit__(None, None, None) + else: + for manager in managers: + manager.__exit__(type(ex), ex, None) + # crash in next + self.close() + raise + if self._ctx: + for manager in managers: + manager.__exit__(None, None, None) + return item diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 72723e60ee..cbd7a3da4e 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -36,7 +36,8 @@ YieldMapItem, ValidateItem, ) -from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep +from dlt.extract.pipe_iterator import ManagedPipeIterator +from dlt.extract.pipe import Pipe, TPipeStep from dlt.extract.hints import DltResourceHints, HintsMeta, TResourceHints from dlt.extract.incremental import Incremental, IncrementalResourceWrapper from dlt.extract.exceptions import ( diff --git a/dlt/extract/source.py b/dlt/extract/source.py index bc33394d4d..10dc4c9b9a 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -24,7 +24,8 @@ from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, graph_edges_to_nodes from dlt.extract.typing import TDecompositionStrategy -from dlt.extract.pipe import Pipe, ManagedPipeIterator +from dlt.extract.pipe_iterator import ManagedPipeIterator +from dlt.extract.pipe import Pipe from dlt.extract.hints import DltResourceHints, make_hints from dlt.extract.resource import DltResource from 
dlt.extract.exceptions import ( diff --git a/dlt/reflection/script_inspector.py b/dlt/reflection/script_inspector.py index d8d96804c8..f9068d31e4 100644 --- a/dlt/reflection/script_inspector.py +++ b/dlt/reflection/script_inspector.py @@ -13,7 +13,7 @@ from dlt.pipeline import Pipeline from dlt.extract import DltSource -from dlt.extract.pipe import ManagedPipeIterator +from dlt.extract.pipe_iterator import ManagedPipeIterator def patch__init__(self: Any, *args: Any, **kwargs: Any) -> None: diff --git a/tests/extract/test_extract_pipe.py b/tests/extract/test_extract_pipe.py index 38dc8a9319..d5be5808c0 100644 --- a/tests/extract/test_extract_pipe.py +++ b/tests/extract/test_extract_pipe.py @@ -11,7 +11,8 @@ from dlt.common.typing import TDataItems from dlt.extract.exceptions import CreatePipeException, ResourceExtractionError from dlt.extract.typing import DataItemWithMeta, FilterItem, MapItem, YieldMapItem -from dlt.extract.pipe import ManagedPipeIterator, Pipe, PipeItem, PipeIterator +from dlt.extract.pipe import Pipe +from dlt.extract.pipe_iterator import PipeIterator, ManagedPipeIterator, PipeItem def test_next_item_mode() -> None: From deea305f4327c9a1d0b955dae6ac41795538409c Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Wed, 14 Feb 2024 14:57:55 -0500 Subject: [PATCH 07/27] Overload arg order --- dlt/extract/decorators.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 17734d1a1a..39b378743e 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -315,8 +315,8 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: Literal[True] = True, parallelized: bool = False, + standalone: Literal[True] = True, ) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: ... 
@@ -353,9 +353,9 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, + parallelized: bool = False, standalone: bool = False, data_from: TUnboundDltResource = None, - parallelized: bool = False, ) -> Any: """When used as a decorator, transforms any generator (yielding) function into a `dlt resource`. When used as a function, it transforms data in `data` argument into a `dlt resource`. @@ -412,6 +412,8 @@ def resource( data_from (TUnboundDltResource, optional): Allows to pipe data from one resource to another to build multi-step pipelines. + parallelized (bool, optional): If `True`, the resource generator will be extracted in parallel with other resources. Defaults to `False`. + Raises: ResourceNameMissing: indicates that name of the resource cannot be inferred from the `data` being passed. InvalidResourceDataType: indicates that the `data` argument cannot be converted into `dlt resource` From 9820a54a2955e73dcf49aa49a2b90d22b7058b84 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Wed, 14 Feb 2024 15:19:05 -0500 Subject: [PATCH 08/27] Keep parenthesis --- dlt/extract/pipe_iterator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index b494d45709..81fb0113cd 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -253,10 +253,8 @@ def _get_source_item(self) -> ResolvablePipeItem: first_evaluated_index: int = None # always reset to end of list for fifo mode, also take into account that new sources can be added # if too many new sources is added we switch to fifo not to exhaust them - if ( - self._next_item_mode == "fifo" - or sources_count - self._initial_sources_count - >= self._worker_pool.max_parallel_items + if self._next_item_mode == "fifo" or ( + sources_count - self._initial_sources_count >= self._worker_pool.max_parallel_items ): self._current_source_index = 
sources_count - 1 else: From 2cc0cedd49367e01391d1b77de0fac2c90d7640a Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Wed, 14 Feb 2024 15:20:48 -0500 Subject: [PATCH 09/27] Use assert for sanity check --- dlt/extract/concurrency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index a71b2179d8..9c04556540 100644 --- a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -101,8 +101,8 @@ def submit(self, pipe_item: ResolvablePipeItem) -> Optional[TItemFuture]: if self.free_slots == 0: return None - if self.free_slots < 0: # TODO: Sanity check during dev, should never happen - raise RuntimeError("Used more slots than max_parallel_items") + # Sanity check, negative free slots means there's a bug somewhere + assert self.free_slots >= 0, "Worker pool has negative free slots, this should never happen" # submit to thread pool or async pool item = pipe_item.item From fba59f600b3ce5b637ac5d36433efa4d6cde2108 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Wed, 14 Feb 2024 17:24:42 -0500 Subject: [PATCH 10/27] Parallelize resource method, handle transformers --- dlt/common/typing.py | 2 + dlt/extract/decorators.py | 87 ++++++--------------- dlt/extract/exceptions.py | 7 +- dlt/extract/resource.py | 16 +++- dlt/extract/source.py | 5 ++ dlt/extract/typing.py | 7 ++ dlt/extract/utils.py | 45 ++++++++++- tests/extract/test_decorators.py | 49 ++++++++++++ tests/pipeline/test_resources_evaluation.py | 39 +++++++++ 9 files changed, 188 insertions(+), 69 deletions(-) diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 9a776bd51d..0682be5062 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -24,6 +24,7 @@ Union, runtime_checkable, IO, + Iterator, ) from typing_extensions import ( @@ -69,6 +70,7 @@ AnyFun: TypeAlias = Callable[..., Any] TFun = TypeVar("TFun", bound=AnyFun) # any function TAny = TypeVar("TAny", bound=Any) +TAnyFunOrIterator = 
TypeVar("TAnyFunOrIterator", AnyFun, Iterator[Any]) TAnyClass = TypeVar("TAnyClass", bound=object) TimedeltaSeconds = Union[int, float, timedelta] # represent secret value ie. coming from Kubernetes/Docker secrets or other providers diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 39b378743e..77b7551f80 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -77,8 +77,7 @@ class SourceSchemaInjectableContext(ContainerInjectableContext): if TYPE_CHECKING: - def __init__(self, schema: Schema = None) -> None: - ... + def __init__(self, schema: Schema = None) -> None: ... @configspec @@ -91,8 +90,7 @@ class SourceInjectableContext(ContainerInjectableContext): if TYPE_CHECKING: - def __init__(self, source: DltSource = None) -> None: - ... + def __init__(self, source: DltSource = None) -> None: ... TSourceFunParams = ParamSpec("TSourceFunParams") @@ -112,8 +110,7 @@ def source( schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] -) -> Callable[TSourceFunParams, DltSource]: - ... +) -> Callable[TSourceFunParams, DltSource]: ... @overload @@ -128,8 +125,7 @@ def source( schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] -) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: - ... +) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: ... def source( @@ -278,8 +274,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, -) -> DltResource: - ... +) -> DltResource: ... @overload @@ -297,8 +292,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, -) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: - ... 
+) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: ... @overload @@ -317,8 +311,7 @@ def resource( spec: Type[BaseConfiguration] = None, parallelized: bool = False, standalone: Literal[True] = True, -) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: - ... +) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: ... @overload @@ -336,8 +329,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, -) -> DltResource: - ... +) -> DltResource: ... def resource( @@ -434,7 +426,7 @@ def make_resource( schema_contract=schema_contract, table_format=table_format, ) - return DltResource.from_data( + resource = DltResource.from_data( _data, _name, _section, @@ -443,6 +435,9 @@ def make_resource( cast(DltResource, data_from), incremental=incremental, ) + if parallelized: + return resource.parallelize() + return resource def decorator( f: Callable[TResourceFunParams, Any] @@ -457,8 +452,8 @@ def decorator( if not standalone and callable(name): raise DynamicNameNotStandaloneResource(get_callable_name(f)) - if parallelized: - f = parallelize(f) + # if parallelized: + # f = parallelize(f) # resource_section = name if name and not callable(name) else get_callable_name(f) resource_name = name if name and not callable(name) else get_callable_name(f) @@ -555,8 +550,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], DltResource]: - ... + parallelized: bool = False, +) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], DltResource]: ... 
@overload @@ -573,11 +568,11 @@ def transformer( selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, + parallelized: bool = False, ) -> Callable[ [Callable[Concatenate[TDataItem, TResourceFunParams], Any]], Callable[TResourceFunParams, DltResource], -]: - ... +]: ... @overload @@ -593,8 +588,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, -) -> DltResource: - ... + parallelized: bool = False, +) -> DltResource: ... @overload @@ -611,8 +606,8 @@ def transformer( selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, -) -> Callable[TResourceFunParams, DltResource]: - ... + parallelized: bool = False, +) -> Callable[TResourceFunParams, DltResource]: ... def transformer( @@ -628,6 +623,7 @@ def transformer( selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: bool = False, + parallelized: bool = False, ) -> Any: """A form of `dlt resource` that takes input from other resources via `data_from` argument in order to enrich or transform the data. 
@@ -701,6 +697,7 @@ def transformer( spec=spec, standalone=standalone, data_from=data_from, + parallelized=parallelized, ) @@ -758,39 +755,3 @@ def _curry() -> TBoundItems: return _curry return _wrap - - -def parallelize(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunParams, Any]: - @wraps(f) - def _wrap(*args: Any, **kwargs: Any) -> Any: # TODO: Type correctly - gen = f(*args, **kwargs) - if inspect.isfunction(gen): - gen = gen() - if inspect.isasyncgen(gen): - raise ValueError("Async generators are not supported with paralellize") - - exhausted = False - busy = False - - def _parallel_gen() -> Any: # TODO: Type correctly - try: - return next(gen) - except StopIteration: - nonlocal exhausted - exhausted = True - return None - finally: - nonlocal busy - busy = False - - while not exhausted: - try: - while busy: - yield None - busy = True - yield _parallel_gen - except GeneratorExit: - gen.close() - raise - - return _wrap diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index de785865c5..1db79af0b6 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -144,15 +144,14 @@ def __init__(self, resource_name: str, item: Any, _typ: Type[Any], msg: str) -> ) -class InvalidResourceDataTypeAsync(InvalidResourceDataType): +class InvalidParallelResourceDataType(InvalidResourceDataType): def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: super().__init__( resource_name, item, _typ, - "Async iterators and generators are not valid resources. Please use standard iterators" - " and generators that yield Awaitables instead (for example by yielding from async" - " function without await", + "Parallel resource data must be a generator or a generator function. 
The provided" + f" data type for resource '{resource_name}' was {_typ.__name__}.", ) diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index cbd7a3da4e..a01ee0a53b 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -24,7 +24,7 @@ pipeline_state, ) from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id -from dlt.extract.utils import wrap_async_iterator +from dlt.extract.utils import wrap_async_iterator, wrap_parallel_iterator from dlt.extract.typing import ( DataItemWithMeta, @@ -49,6 +49,7 @@ InvalidTransformerGeneratorFunction, InvalidResourceDataTypeBasic, InvalidResourceDataTypeMultiplePipes, + InvalidParallelResourceDataType, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer, @@ -342,6 +343,19 @@ def _gen_wrap(gen: TPipeStep) -> TPipeStep: self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) return self + def parallelize(self) -> "DltResource": + """Wraps the resource to execute each item in a threadpool to allow multiple resources to extract in parallel.""" + + if ( + not inspect.isgenerator(self._pipe.gen) + and not inspect.isgeneratorfunction(self._pipe.gen) + and not self.is_transformer + ): + raise InvalidParallelResourceDataType(self.name, self._pipe.gen, type(self._pipe.gen)) + + self._pipe.replace_gen(wrap_parallel_iterator(self._pipe.gen)) + return self + def add_step( self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None ) -> "DltResource": # noqa: A003 diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 10dc4c9b9a..30369512bc 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -334,6 +334,11 @@ def add_limit(self, max_items: int) -> "DltSource": # noqa: A003 resource.add_limit(max_items) return self + def parallelize(self) -> "DltSource": + for resource in self.resources.selected.values(): + resource.parallelize() + return self + @property def run(self) -> SupportsPipelineRun: """A convenience method that will call 
`run` run on the currently active `dlt` pipeline. If pipeline instance is not found, one with default settings will be created.""" diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index a91a91f7ee..961b7d1366 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -12,6 +12,7 @@ Union, Awaitable, TYPE_CHECKING, + Generator, ) from concurrent.futures import Future @@ -28,6 +29,12 @@ TTableHintTemplate = Union[TDynHintType, TFunHintTemplate[TDynHintType]] +TGenOrGenFunction = Union[ + Generator[TDataItems, Optional[Any], Optional[Any]], + Callable[..., Generator[TDataItems, Optional[Any], Optional[Any]]], +] # ] + + class DataItemWithMeta: __slots__ = "meta", "data" diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 0e86994eb4..77c0bcbc65 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -13,8 +13,10 @@ AsyncGenerator, Awaitable, Generator, + Iterator, ) from collections.abc import Mapping as C_Mapping +from functools import wraps from dlt.common.exceptions import MissingDependencyException from dlt.common.pipeline import reset_resource_state @@ -26,7 +28,13 @@ InvalidResourceDataTypeFunctionNotAGenerator, InvalidStepFunctionArguments, ) -from dlt.extract.typing import TTableHintTemplate, TDataItem, TFunHintTemplate, SupportsPipe +from dlt.extract.typing import ( + TTableHintTemplate, + TDataItem, + TFunHintTemplate, + SupportsPipe, + TGenOrGenFunction, +) try: from dlt.common.libs import pydantic @@ -171,6 +179,41 @@ async def run() -> TDataItems: exhausted = True +def wrap_parallel_iterator(f: TGenOrGenFunction) -> TGenOrGenFunction: + """Wraps a generator for parallel extraction""" + + def _wrapper(*args: Any, **kwargs: Any) -> Generator[TDataItems, None, None]: + gen = f(*args, **kwargs) if callable(f) else f + + exhausted = False + busy = False + + def _parallel_gen() -> TDataItems: + nonlocal busy + try: + return next(gen) + except StopIteration: + nonlocal exhausted + exhausted = True + return None + finally: + 
busy = False + + while not exhausted: + try: + while busy: + yield None + busy = True + yield _parallel_gen + except GeneratorExit: + # gen.close() + raise + + if callable(f): + return wraps(f)(_wrapper) + return _wrapper() + + def wrap_compat_transformer( name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any ) -> AnyFun: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 3c15bf37f5..364d2a4294 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -36,6 +36,7 @@ SourceIsAClassTypeError, SourceNotAFunction, CurrentSourceSchemaNotAvailable, + InvalidParallelResourceDataType, ) from dlt.extract.typing import TableNameMeta @@ -853,3 +854,51 @@ def __call__(self, more: int = 1): @pytest.mark.skip("Not implemented") def test_class_resource() -> None: pass + + +def test_parallelized_resource_decorator() -> None: + """Test paralellized resources are wrapped correctly. + Note: tests for parallel execution are in test_resource_evaluation + """ + + def some_gen(): + yield from [1, 2, 3] + + # Create resource with decorated function + resource = dlt.resource(some_gen, parallelized=True) + + # Generator func is wrapped with parallelized gen that yields callables + result = next(resource._pipe.gen()) # type: ignore + assert result() == 1 + + # Same but wrapping generator directly + resource = dlt.resource(some_gen(), parallelized=True) + + result = next(resource._pipe.gen) # type: ignore + assert result() == 1 + + # Wrap a transformer + def some_tx(item): + yield item + 1 + + resource = dlt.resource(some_gen, parallelized=True) + + transformer = dlt.transformer(some_tx, parallelized=True, data_from=resource) + pipe_gen = transformer._pipe.gen + # Calling transformer returns the parallel wrapper generator + inner = pipe_gen(1) # type: ignore + assert next(inner)() == 2 # type: ignore + + # Invalid parallel resources + + # From async generator + with 
pytest.raises(InvalidParallelResourceDataType): + + @dlt.resource(parallelized=True) + async def some_data(): + yield 1 + yield 2 + + # From list + with pytest.raises(InvalidParallelResourceDataType): + dlt.resource([1, 2, 3], name="T", parallelized=True) diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index f3e2734a14..57615ad039 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -323,3 +323,42 @@ def some_data(resource_num: int): assert len(set(chunk)) == len(chunk) # Items are extracted in order per resource assert chunk == sorted(chunk) + + +def test_test_parallelized_transformers() -> None: + exec_order = [] + item_count = 6 + + @dlt.source + def some_source(): + @dlt.resource(parallelized=True) + def pos_data(): + for i in range(1, item_count + 1): + time.sleep(0.1) + yield i + + @dlt.resource(parallelized=True) + def neg_data(): + time.sleep(0.05) + for i in range(-1, -item_count - 1, -1): + time.sleep(0.1) + yield i + + @dlt.transformer(parallelized=True) + def multiply(item): + time.sleep(0.05) + exec_order.append("+" if item > 0 else "-") + yield item * 10 + + return [ + neg_data | multiply.with_name("t_a"), + pos_data | multiply.with_name("t_b"), + ] + + result = list(some_source()) + + expected_result = [i * 10 for i in range(-item_count, item_count + 1)] + expected_result.remove(0) + + assert sorted(result) == expected_result + assert exec_order == ["+", "-"] * item_count From ea882deaf1df098410b4089d933888c4ffb74e81 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 15 Feb 2024 18:30:05 -0500 Subject: [PATCH 11/27] Comments, small refactor --- dlt/extract/concurrency.py | 28 ++++++++++++++++++++++++---- dlt/extract/pipe_iterator.py | 24 ++++++------------------ 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index 9c04556540..c8653c571d 100644 --- 
a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -1,6 +1,7 @@ import asyncio from uuid import uuid4 from asyncio import Future +from contextlib import contextmanager from concurrent.futures import ( ThreadPoolExecutor, as_completed, @@ -8,7 +9,7 @@ FIRST_COMPLETED, ) from threading import Thread -from typing import List, Awaitable, Callable, Any, Dict, Set, Optional +from typing import List, Awaitable, Callable, Any, Dict, Set, Optional, overload, Literal from dlt.common.exceptions import PipelineException from dlt.common.configuration.container import Container @@ -89,21 +90,39 @@ def _vacate_slot(self, _: TItemFuture) -> None: # Used as callback to free up slot when future is done self.used_slots -= 1 - def submit(self, pipe_item: ResolvablePipeItem) -> Optional[TItemFuture]: + @overload + def submit(self, pipe_item: ResolvablePipeItem, block: Literal[False]) -> Optional[TItemFuture]: + ... + + @overload + def submit(self, pipe_item: ResolvablePipeItem, block: Literal[True]) -> TItemFuture: + ... + + def submit(self, pipe_item: ResolvablePipeItem, block: bool = False) -> Optional[TItemFuture]: """Submit an item to the pool. Args: pipe_item: The item to submit. + block: If True, block until there's a free slot in the pool. Returns: The future if the item was successfully submitted, otherwise None. """ - if self.free_slots == 0: - return None # Sanity check, negative free slots means there's a bug somewhere assert self.free_slots >= 0, "Worker pool has negative free slots, this should never happen" + if self.free_slots == 0: + if block: + # Wait until some future is completed to ensure there's a free slot + # Note: This is probably not thread safe. 
If ever multiple threads will be submitting + # jobs to the pool, we ned to change this whole method to be inside a `threading.Lock` + self.wait_for_free_slot() + else: + return None + + future: Optional[TItemFuture] = None + # submit to thread pool or async pool item = pipe_item.item if isinstance(item, Awaitable): @@ -154,6 +173,7 @@ def resolve_next_future(self) -> Optional[ResolvablePipeItem]: return None if (item := self.resolve_next_future_no_wait()) is not None: + # When there are multiple already done futures from the same pipe we return results in insertion order return item for future in as_completed(self.futures): if future.cancelled(): diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 81fb0113cd..3ac48c080f 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -150,7 +150,6 @@ def __next__(self) -> PipeItem: # do we need new item? if pipe_item is None: # process element from the futures pool if one is ready - # if len(self._worker_pool) > 0: pipe_item = self._worker_pool.resolve_next_future_no_wait() # if none then take element from the newest source if pipe_item is None: @@ -188,16 +187,10 @@ def __next__(self) -> PipeItem: continue if isinstance(item, Awaitable) or callable(item): - # TODO: Duplicated from logic in get_source_item - # The item is a callable that runs in the futures pool - future: Optional[TItemFuture] = None - while future is None: - future = self._worker_pool.submit(pipe_item) # type: ignore[arg-type] - if future is None: - # worker pool is full, wait until a slot becomes free - self._worker_pool.wait_for_free_slot() - else: - pipe_item = None + # Callables that come from following pipe steps need to be added to the pool + self._worker_pool.submit(pipe_item, block=True) # type: ignore[call-overload] + pipe_item = None + # Future will be resolved later, move on to the next item continue # if we are at the end of the pipe then yield element @@ -283,13 +276,8 @@ def 
_get_source_item(self) -> ResolvablePipeItem: pipe_item = ResolvablePipeItem(pipe_item, step, pipe, meta) if isinstance(pipe_item.item, Awaitable) or callable(pipe_item.item): - # The item is a callable that runs in the futures pool - future: Optional[TItemFuture] = None - while future is None: - future = self._worker_pool.submit(pipe_item) - if future is None: - # worker pool is full, wait until a slot becomes free - self._worker_pool.wait_for_free_slot() + # Send callables to the worker pool right away, collect futures from multiple sources in one iteration + self._worker_pool.submit(pipe_item, block=True) pipe_item = None if len(self._worker_pool) >= sources_count: # Return here so we're not collecting done futures forever From 6bcd43ddf200fea8b6a442649fab35b4f5312ad5 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 15 Feb 2024 19:48:11 -0500 Subject: [PATCH 12/27] Handle non-iterator transformers --- dlt/extract/concurrency.py | 8 +-- dlt/extract/decorators.py | 6 +- dlt/extract/resource.py | 2 +- dlt/extract/typing.py | 6 -- dlt/extract/utils.py | 31 +++++++--- tests/pipeline/test_resources_evaluation.py | 64 ++++++++++++++------- 6 files changed, 75 insertions(+), 42 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index c8653c571d..219a5ba47c 100644 --- a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -91,12 +91,12 @@ def _vacate_slot(self, _: TItemFuture) -> None: self.used_slots -= 1 @overload - def submit(self, pipe_item: ResolvablePipeItem, block: Literal[False]) -> Optional[TItemFuture]: - ... + def submit( + self, pipe_item: ResolvablePipeItem, block: Literal[False] + ) -> Optional[TItemFuture]: ... @overload - def submit(self, pipe_item: ResolvablePipeItem, block: Literal[True]) -> TItemFuture: - ... + def submit(self, pipe_item: ResolvablePipeItem, block: Literal[True]) -> TItemFuture: ... 
def submit(self, pipe_item: ResolvablePipeItem, block: bool = False) -> Optional[TItemFuture]: """Submit an item to the pool. diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 77b7551f80..6c6c6969e4 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -567,8 +567,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: Literal[True] = True, parallelized: bool = False, + standalone: Literal[True] = True, ) -> Callable[ [Callable[Concatenate[TDataItem, TResourceFunParams], Any]], Callable[TResourceFunParams, DltResource], @@ -605,8 +605,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: Literal[True] = True, parallelized: bool = False, + standalone: Literal[True] = True, ) -> Callable[TResourceFunParams, DltResource]: ... @@ -622,8 +622,8 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: bool = False, parallelized: bool = False, + standalone: bool = False, ) -> Any: """A form of `dlt resource` that takes input from other resources via `data_from` argument in order to enrich or transform the data. 
diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index a01ee0a53b..d99dba7db3 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -349,7 +349,7 @@ def parallelize(self) -> "DltResource": if ( not inspect.isgenerator(self._pipe.gen) and not inspect.isgeneratorfunction(self._pipe.gen) - and not self.is_transformer + and not (callable(self._pipe.gen) and self.is_transformer) ): raise InvalidParallelResourceDataType(self.name, self._pipe.gen, type(self._pipe.gen)) diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index 961b7d1366..eadbd449b9 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -29,12 +29,6 @@ TTableHintTemplate = Union[TDynHintType, TFunHintTemplate[TDynHintType]] -TGenOrGenFunction = Union[ - Generator[TDataItems, Optional[Any], Optional[Any]], - Callable[..., Generator[TDataItems, Optional[Any], Optional[Any]]], -] # ] - - class DataItemWithMeta: __slots__ = "meta", "data" diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 77c0bcbc65..9c17195c65 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -16,7 +16,7 @@ Iterator, ) from collections.abc import Mapping as C_Mapping -from functools import wraps +from functools import wraps, partial from dlt.common.exceptions import MissingDependencyException from dlt.common.pipeline import reset_resource_state @@ -33,7 +33,6 @@ TDataItem, TFunHintTemplate, SupportsPipe, - TGenOrGenFunction, ) try: @@ -179,21 +178,32 @@ async def run() -> TDataItems: exhausted = True -def wrap_parallel_iterator(f: TGenOrGenFunction) -> TGenOrGenFunction: +def wrap_parallel_iterator( + f: Union[Generator[TDataItems, Optional[Any], Optional[Any]], AnyFun] +) -> Union[Generator[TDataItems, Optional[Any], Optional[Any]], AnyFun]: """Wraps a generator for parallel extraction""" def _wrapper(*args: Any, **kwargs: Any) -> Generator[TDataItems, None, None]: - gen = f(*args, **kwargs) if callable(f) else f + is_generator = True + gen: Union[Generator[TDataItems, 
Optional[Any], Optional[Any]], AnyFun] + if callable(f): + if inspect.isgeneratorfunction(f): + gen = f(*args, **kwargs) + else: + is_generator = False + gen = f + else: + gen = f exhausted = False busy = False def _parallel_gen() -> TDataItems: nonlocal busy + nonlocal exhausted try: - return next(gen) + return next(gen) # type: ignore[arg-type] except StopIteration: - nonlocal exhausted exhausted = True return None finally: @@ -204,9 +214,14 @@ def _parallel_gen() -> TDataItems: while busy: yield None busy = True - yield _parallel_gen + if is_generator: + yield _parallel_gen + else: + exhausted = True + yield partial(gen, *args, **kwargs) # type: ignore[arg-type] except GeneratorExit: - # gen.close() + if is_generator: + gen.close() # type: ignore[union-attr] raise if callable(f): diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index 57615ad039..d57ae327c4 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, List import time import threading import random @@ -326,35 +326,59 @@ def some_data(resource_num: int): def test_test_parallelized_transformers() -> None: - exec_order = [] item_count = 6 + @dlt.resource(parallelized=True) + def pos_data(): + for i in range(1, item_count + 1): + time.sleep(0.1) + yield i + + @dlt.resource(parallelized=True) + def neg_data(): + time.sleep(0.05) + for i in range(-1, -item_count - 1, -1): + time.sleep(0.1) + yield i + + @dlt.transformer(parallelized=True) + def multiply(item): + time.sleep(0.05) + exec_order.append("+" if item > 0 else "-") + yield item * 10 + @dlt.source def some_source(): - @dlt.resource(parallelized=True) - def pos_data(): - for i in range(1, item_count + 1): - time.sleep(0.1) - yield i - - @dlt.resource(parallelized=True) - def neg_data(): - time.sleep(0.05) - for i in range(-1, -item_count - 1, -1): - time.sleep(0.1) - yield i - - 
@dlt.transformer(parallelized=True) - def multiply(item): - time.sleep(0.05) - exec_order.append("+" if item > 0 else "-") - yield item * 10 + return [ + neg_data | multiply.with_name("t_a"), + pos_data | multiply.with_name("t_b"), + ] + + exec_order: List[str] = [] + result = list(some_source()) + + expected_result = [i * 10 for i in range(-item_count, item_count + 1)] + expected_result.remove(0) + + assert sorted(result) == expected_result + assert exec_order == ["+", "-"] * item_count + @dlt.transformer(parallelized=True) # type: ignore[no-redef] + def multiply(item): + # Transformer that is not a generator + time.sleep(0.05) + exec_order.append("+" if item > 0 else "-") + return item * 10 + + @dlt.source # type: ignore[no-redef] + def some_source(): return [ neg_data | multiply.with_name("t_a"), pos_data | multiply.with_name("t_b"), ] + exec_order = [] + result = list(some_source()) expected_result = [i * 10 for i in range(-item_count, item_count + 1)] From 84f36189b0491869f8c67d3f20c27b9851e0104d Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Mon, 19 Feb 2024 15:51:30 -0500 Subject: [PATCH 13/27] Rename WorkerPool -> FuturesPool, small cleanups --- dlt/extract/concurrency.py | 17 ++++++++++------- dlt/extract/decorators.py | 3 --- dlt/extract/pipe_iterator.py | 33 +++++++++++++-------------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index 219a5ba47c..e431d2c36a 100644 --- a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -25,7 +25,7 @@ ) -class WorkerPool: +class FuturesPool: """Worker pool for pipe items that can be resolved asynchronously. Items can be either asyncio coroutines or regular callables which will be executed in a thread pool. @@ -117,7 +117,7 @@ def submit(self, pipe_item: ResolvablePipeItem, block: bool = False) -> Optional # Wait until some future is completed to ensure there's a free slot # Note: This is probably not thread safe. 
If ever multiple threads will be submitting # jobs to the pool, we ned to change this whole method to be inside a `threading.Lock` - self.wait_for_free_slot() + self._wait_for_free_slot() else: return None @@ -167,14 +167,18 @@ def _resolve_future(self, future: TItemFuture) -> Optional[ResolvablePipeItem]: else: return ResolvablePipeItem(item, step, pipe, meta) + def _next_done_future(self) -> Optional[TItemFuture]: + """Get the done future in the pool (if any). This does not block.""" + return next((fut for fut in self.futures if fut.done() and not fut.cancelled()), None) + def resolve_next_future(self) -> Optional[ResolvablePipeItem]: - """Wait for the next future to be done. Returns None if no futures done.""" + """Block until the next future is done and return the result. Returns None if no futures done.""" if not self.futures: return None - if (item := self.resolve_next_future_no_wait()) is not None: + if (future := self._next_done_future()) is not None: # When there are multiple already done futures from the same pipe we return results in insertion order - return item + return self._resolve_future(future) for future in as_completed(self.futures): if future.cancelled(): # Get the next not-cancelled future @@ -195,10 +199,9 @@ def resolve_next_future_no_wait(self) -> Optional[ResolvablePipeItem]: return self._resolve_future(future) - def wait_for_free_slot(self) -> None: + def _wait_for_free_slot(self) -> None: """Wait until any future in the pool is completed to ensure there's a free slot.""" if self.free_slots >= 1: - self.poll() # Just sleep a bit to give other threads a chance return for future in as_completed(self.futures): diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 6c6c6969e4..72ada35aaf 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -452,9 +452,6 @@ def decorator( if not standalone and callable(name): raise DynamicNameNotStandaloneResource(get_callable_name(f)) - # if parallelized: - # f = 
parallelize(f) - # resource_section = name if name and not callable(name) else get_callable_name(f) resource_name = name if name and not callable(name) else get_callable_name(f) diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 3ac48c080f..4f47f0e7d0 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -35,7 +35,7 @@ from dlt.extract.pipe import Pipe from dlt.extract.typing import DataItemWithMeta, TItemFuture from dlt.extract.utils import wrap_async_iterator -from dlt.extract.concurrency import WorkerPool +from dlt.extract.concurrency import FuturesPool from dlt.extract.items import PipeItem, ResolvablePipeItem, SourcePipeItem @@ -65,7 +65,7 @@ def __init__( self._next_item_mode: TPipeNextItemMode = next_item_mode self._initial_sources_count = len(sources) self._current_source_index: int = 0 - self._worker_pool = WorkerPool( + self._futures_pool = FuturesPool( workers=workers, poll_interval=futures_poll_interval, max_parallel_items=max_parallel_items, @@ -149,22 +149,19 @@ def __next__(self) -> PipeItem: while True: # do we need new item? 
if pipe_item is None: - # process element from the futures pool if one is ready - pipe_item = self._worker_pool.resolve_next_future_no_wait() # if none then take element from the newest source + pipe_item = self._get_source_item() + if pipe_item is None: - pipe_item = self._get_source_item() + # Block until a future is resolved + pipe_item = self._futures_pool.resolve_next_future() if pipe_item is None: - if self._worker_pool.empty and len(self._sources) == 0: + if self._futures_pool.empty and len(self._sources) == 0: # no more elements in futures or sources raise StopIteration() else: - # wait for some future to complete - pipe_item = self._worker_pool.resolve_next_future() - - if pipe_item is None: - continue + continue item = pipe_item.item # if item is iterator, then add it as a new source @@ -188,7 +185,7 @@ def __next__(self) -> PipeItem: if isinstance(item, Awaitable) or callable(item): # Callables that come from following pipe steps need to be added to the pool - self._worker_pool.submit(pipe_item, block=True) # type: ignore[call-overload] + self._futures_pool.submit(pipe_item, block=True) # type: ignore[call-overload] pipe_item = None # Future will be resolved later, move on to the next item continue @@ -247,17 +244,13 @@ def _get_source_item(self) -> ResolvablePipeItem: # always reset to end of list for fifo mode, also take into account that new sources can be added # if too many new sources is added we switch to fifo not to exhaust them if self._next_item_mode == "fifo" or ( - sources_count - self._initial_sources_count >= self._worker_pool.max_parallel_items + sources_count - self._initial_sources_count >= self._futures_pool.max_parallel_items ): self._current_source_index = sources_count - 1 else: self._current_source_index = (self._current_source_index - 1) % sources_count while True: - # if we have checked all sources once and all returned None, then lets sleep a bit or wait for a free worker slot - if self._current_source_index == 
first_evaluated_index: - self._worker_pool.wait_for_free_slot() # get next item from the current source - # gen, step, pipe, meta = self._sources[self._current_source_index] gen, step, pipe, meta = self._sources[self._current_source_index] set_current_pipe_name(pipe.name) @@ -277,9 +270,9 @@ def _get_source_item(self) -> ResolvablePipeItem: if isinstance(pipe_item.item, Awaitable) or callable(pipe_item.item): # Send callables to the worker pool right away, collect futures from multiple sources in one iteration - self._worker_pool.submit(pipe_item, block=True) + self._futures_pool.submit(pipe_item, block=True) pipe_item = None - if len(self._worker_pool) >= sources_count: + if len(self._futures_pool) >= sources_count: # Return here so we're not collecting done futures forever return None # Otherwhise we continue to the next source @@ -318,7 +311,7 @@ def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: gen.close() self._sources.clear() - self._worker_pool.close() + self._futures_pool.close() def __enter__(self) -> "PipeIterator": return self From c9c18248051131e48ed5040979efe21e59ac8673 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 20 Feb 2024 12:29:02 -0500 Subject: [PATCH 14/27] Fix transformer, test bare generator --- dlt/extract/utils.py | 14 ++++++----- tests/extract/test_decorators.py | 3 ++- tests/pipeline/test_resources_evaluation.py | 26 +++++++++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 9c17195c65..7d5bea5e28 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -187,9 +187,10 @@ def _wrapper(*args: Any, **kwargs: Any) -> Generator[TDataItems, None, None]: is_generator = True gen: Union[Generator[TDataItems, Optional[Any], Optional[Any]], AnyFun] if callable(f): - if inspect.isgeneratorfunction(f): + if inspect.isgeneratorfunction(inspect.unwrap(f)): gen = f(*args, **kwargs) else: + # Function is a transformer that returns values 
is_generator = False gen = f else: @@ -202,7 +203,10 @@ def _parallel_gen() -> TDataItems: nonlocal busy nonlocal exhausted try: - return next(gen) # type: ignore[arg-type] + if is_generator: + return next(gen) # type: ignore[arg-type] + else: + return gen(*args, **kwargs) # type: ignore[operator] except StopIteration: exhausted = True return None @@ -214,11 +218,9 @@ def _parallel_gen() -> TDataItems: while busy: yield None busy = True - if is_generator: - yield _parallel_gen - else: + if not is_generator: # Regular function is only resolved once exhausted = True - yield partial(gen, *args, **kwargs) # type: ignore[arg-type] + yield _parallel_gen except GeneratorExit: if is_generator: gen.close() # type: ignore[union-attr] diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 364d2a4294..d5151670b5 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -868,7 +868,8 @@ def some_gen(): resource = dlt.resource(some_gen, parallelized=True) # Generator func is wrapped with parallelized gen that yields callables - result = next(resource._pipe.gen()) # type: ignore + gen = resource._pipe.gen() # type: ignore + result = next(gen) # type: ignore[arg-type] assert result() == 1 # Same but wrapping generator directly diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index d57ae327c4..fd6879ca39 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -386,3 +386,29 @@ def some_source(): assert sorted(result) == expected_result assert exec_order == ["+", "-"] * item_count + + +def test_parallelized_bare_generator() -> None: + def pos_data(): + for i in range(1, 6): + time.sleep(0.1) + yield i + + def neg_data(): + time.sleep(0.05) + for i in range(-1, -6, -1): + time.sleep(0.1) + yield i + + @dlt.source + def some_source(): + return [ + # Resources created from generators directly (not generator functions) 
can be parallelized + dlt.resource(pos_data(), parallelized=True, name="pos_data"), + dlt.resource(neg_data(), parallelized=True, name="neg_data"), + ] + + result = list(some_source()) + + assert set(result) == {1, 2, 3, 4, 5, -1, -2, -3, -4, -5} + assert len(result) == 10 From 6fbd8592b584ac1df1b72972ef02cf841646dfdf Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 20 Feb 2024 17:36:16 -0500 Subject: [PATCH 15/27] Source parallelize skip invalid resources, docstring --- dlt/extract/resource.py | 4 +++- dlt/extract/source.py | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index d99dba7db3..d59cb21cf2 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -344,8 +344,10 @@ def _gen_wrap(gen: TPipeStep) -> TPipeStep: return self def parallelize(self) -> "DltResource": - """Wraps the resource to execute each item in a threadpool to allow multiple resources to extract in parallel.""" + """Wraps the resource to execute each item in a threadpool to allow multiple resources to extract in parallel. + The resource must be a generator or generator function or a transformer function. + """ if ( not inspect.isgenerator(self._pipe.gen) and not inspect.isgeneratorfunction(self._pipe.gen) diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 30369512bc..53d7d6fa8c 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -32,6 +32,7 @@ DataItemRequiredForDynamicTableHints, ResourcesNotFoundError, DeletingResourcesNotSupported, + InvalidParallelResourceDataType, ) @@ -335,8 +336,15 @@ def add_limit(self, max_items: int) -> "DltSource": # noqa: A003 return self def parallelize(self) -> "DltSource": + """Mark all resources in the source to run in parallel. + + Only transformers and resources based on generators and generator functions are supported, unsupported resources will be skipped. 
+ """ for resource in self.resources.selected.values(): - resource.parallelize() + try: + resource.parallelize() + except InvalidParallelResourceDataType: + pass return self @property From 549d5fae14274d4398328ff8c767214b57ad24ad Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 22 Feb 2024 15:12:27 -0500 Subject: [PATCH 16/27] Handle wrapped generator function --- dlt/extract/resource.py | 2 +- tests/pipeline/test_resources_evaluation.py | 42 +++++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index d59cb21cf2..27c0f7c2b4 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -350,7 +350,7 @@ def parallelize(self) -> "DltResource": """ if ( not inspect.isgenerator(self._pipe.gen) - and not inspect.isgeneratorfunction(self._pipe.gen) + and not inspect.isgeneratorfunction(inspect.unwrap(self._pipe.gen)) and not (callable(self._pipe.gen) and self.is_transformer) ): raise InvalidParallelResourceDataType(self.name, self._pipe.gen, type(self._pipe.gen)) diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index fd6879ca39..335c703145 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -389,15 +389,19 @@ def some_source(): def test_parallelized_bare_generator() -> None: + main_thread = threading.get_ident() + threads = set() + def pos_data(): for i in range(1, 6): - time.sleep(0.1) + threads.add(threading.get_ident()) + time.sleep(0.01) yield i def neg_data(): - time.sleep(0.05) for i in range(-1, -6, -1): - time.sleep(0.1) + threads.add(threading.get_ident()) + time.sleep(0.01) yield i @dlt.source @@ -410,5 +414,37 @@ def some_source(): result = list(some_source()) + assert threads and main_thread not in threads assert set(result) == {1, 2, 3, 4, 5, -1, -2, -3, -4, -5} assert len(result) == 10 + + +def test_parallelized_wrapped_generator() -> None: + threads 
= set() + + def some_data(): + for i in range(1, 6): + time.sleep(0.01) + threads.add(threading.get_ident()) + yield i + + def some_data2(): + for i in range(-1, -6, -1): + time.sleep(0.01) + threads.add(threading.get_ident()) + yield i + + @dlt.source + def some_source(): + # Bound resources result in a wrapped generator function, + return [ + dlt.resource(some_data, parallelized=True, name="some_data")(), + dlt.resource(some_data2, parallelized=True, name="some_data2")(), + ] + + source = some_source() + + result = list(source) + + assert len(threads) > 1 + assert set(result) == {1, 2, 3, 4, 5, -1, -2, -3, -4, -5} From 7a0d8ae1258432c0eefd36bfaf8d5a58b6667ec7 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 22 Feb 2024 16:22:40 -0500 Subject: [PATCH 17/27] Don't test exec order, only check that multiple threads are used --- tests/pipeline/test_resources_evaluation.py | 66 ++++++++++++--------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index 335c703145..018eb1e666 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -2,6 +2,7 @@ import time import threading import random +from itertools import product import dlt, asyncio, pytest, os @@ -224,18 +225,15 @@ def test_parallelized_resource(parallelized: bool) -> None: @dlt.resource(parallelized=parallelized) def resource1(): for l_ in ["a", "b", "c"]: - time.sleep(0.5) - nonlocal execution_order + time.sleep(0.01) execution_order.append("one") threads.add(threading.get_ident()) yield {"letter": l_} @dlt.resource(parallelized=parallelized) def resource2(): - time.sleep(0.25) for l_ in ["e", "f", "g"]: - time.sleep(0.5) - nonlocal execution_order + time.sleep(0.01) execution_order.append("two") threads.add(threading.get_ident()) yield {"letter": l_} @@ -260,11 +258,12 @@ def source(): assert {r[0] for r in rows} == {"e", "f", "g"} if 
parallelized: - assert len(threads) > 1 - assert execution_order == ["one", "two", "one", "two", "one", "two"] + assert ( + len(threads) > 1 and threading.get_ident() not in threads + ) # Nothing runs in main thread else: assert execution_order == ["one", "one", "one", "two", "two", "two"] - assert len(threads) == 1 + assert threads == {threading.get_ident()} # Everything runs in main thread # Parametrize with different resource counts to excersize the worker pool: @@ -273,9 +272,13 @@ def source(): # 3. Exact number of workers # 4. More than future pool max size # 5. Exact future pool max size -@pytest.mark.parametrize("n_resources", [8, 1, 5, 25, 20]) -def test_parallelized_resource_extract_order(n_resources: int) -> None: - os.environ["EXTRACT__NEXT_ITEM_MODE"] = "fifo" +@pytest.mark.parametrize( + "n_resources,next_item_mode", product([8, 1, 5, 25, 20], ["fifo", "round_robin"]) +) +def test_parallelized_resource_extract_order(n_resources: int, next_item_mode: str) -> None: + os.environ["EXTRACT__NEXT_ITEM_MODE"] = next_item_mode + + threads = set() item_counts = [random.randrange(10, 30) for _ in range(n_resources)] item_ranges = [] # Create numeric ranges that each resource will yield @@ -288,14 +291,11 @@ def test_parallelized_resource_extract_order(n_resources: int) -> None: end_range = start_range + n_items item_ranges.append(range(start_range, end_range)) - def _sleep_duration() -> float: - # Sleep for random duration each yield - return random.uniform(0.005, 0.012) - @dlt.source def some_source(): def some_data(resource_num: int): for item in item_ranges[resource_num]: + threads.add(threading.get_ident()) print(f"RESOURCE {resource_num}") # Sleep for a random duration each yield time.sleep(random.uniform(0.005, 0.012)) @@ -324,27 +324,32 @@ def some_data(resource_num: int): # Items are extracted in order per resource assert chunk == sorted(chunk) + assert len(threads) >= min(2, n_resources) and threading.get_ident() not in threads + -def 
test_test_parallelized_transformers() -> None: +def test_test_parallelized_resource_transformers() -> None: item_count = 6 + threads = set() + transformer_threads = set() @dlt.resource(parallelized=True) def pos_data(): for i in range(1, item_count + 1): + threads.add(threading.get_ident()) time.sleep(0.1) yield i @dlt.resource(parallelized=True) def neg_data(): - time.sleep(0.05) for i in range(-1, -item_count - 1, -1): + threads.add(threading.get_ident()) time.sleep(0.1) yield i @dlt.transformer(parallelized=True) def multiply(item): + transformer_threads.add(threading.get_ident()) time.sleep(0.05) - exec_order.append("+" if item > 0 else "-") yield item * 10 @dlt.source @@ -354,20 +359,24 @@ def some_source(): pos_data | multiply.with_name("t_b"), ] - exec_order: List[str] = [] result = list(some_source()) expected_result = [i * 10 for i in range(-item_count, item_count + 1)] expected_result.remove(0) assert sorted(result) == expected_result - assert exec_order == ["+", "-"] * item_count + # Nothing runs in main thread + assert threads and threading.get_ident() not in threads + assert transformer_threads and threading.get_ident() not in transformer_threads + + threads = set() + transformer_threads = set() @dlt.transformer(parallelized=True) # type: ignore[no-redef] def multiply(item): # Transformer that is not a generator + transformer_threads.add(threading.get_ident()) time.sleep(0.05) - exec_order.append("+" if item > 0 else "-") return item * 10 @dlt.source # type: ignore[no-redef] @@ -377,18 +386,19 @@ def some_source(): pos_data | multiply.with_name("t_b"), ] - exec_order = [] - result = list(some_source()) expected_result = [i * 10 for i in range(-item_count, item_count + 1)] expected_result.remove(0) assert sorted(result) == expected_result - assert exec_order == ["+", "-"] * item_count + + # Nothing runs in main thread + assert len(threads) > 1 and threading.get_ident() not in threads + assert len(transformer_threads) > 1 and threading.get_ident() not in 
transformer_threads -def test_parallelized_bare_generator() -> None: +def test_parallelized_resource_bare_generator() -> None: main_thread = threading.get_ident() threads = set() @@ -414,12 +424,12 @@ def some_source(): result = list(some_source()) - assert threads and main_thread not in threads + assert len(threads) > 1 and main_thread not in threads assert set(result) == {1, 2, 3, 4, 5, -1, -2, -3, -4, -5} assert len(result) == 10 -def test_parallelized_wrapped_generator() -> None: +def test_parallelized_resource_wrapped_generator() -> None: threads = set() def some_data(): @@ -446,5 +456,5 @@ def some_source(): result = list(source) - assert len(threads) > 1 + assert len(threads) > 1 and threading.get_ident() not in threads assert set(result) == {1, 2, 3, 4, 5, -1, -2, -3, -4, -5} From ec5a965d659bad8c9e178225850ad6d96d68e7a4 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 22 Feb 2024 17:49:23 -0500 Subject: [PATCH 18/27] Poll futures with timeout, no check sources_count, test gen.close() --- dlt/extract/concurrency.py | 23 +++++++++++---- dlt/extract/pipe_iterator.py | 25 ++++++++++------ tests/extract/test_decorators.py | 12 ++++++++ tests/pipeline/test_resources_evaluation.py | 32 +++++++++++++++++++++ 4 files changed, 78 insertions(+), 14 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index e431d2c36a..ca04106a93 100644 --- a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -7,9 +7,10 @@ as_completed, wait as wait_for_futures, FIRST_COMPLETED, + TimeoutError as FutureTimeoutError, ) from threading import Thread -from typing import List, Awaitable, Callable, Any, Dict, Set, Optional, overload, Literal +from typing import List, Awaitable, Callable, Any, Dict, Set, Optional, overload, Literal, Union from dlt.common.exceptions import PipelineException from dlt.common.configuration.container import Container @@ -171,15 +172,27 @@ def _next_done_future(self) -> Optional[TItemFuture]: """Get the done 
future in the pool (if any). This does not block.""" return next((fut for fut in self.futures if fut.done() and not fut.cancelled()), None) - def resolve_next_future(self) -> Optional[ResolvablePipeItem]: - """Block until the next future is done and return the result. Returns None if no futures done.""" + def resolve_next_future( + self, use_configured_timeout: bool = False + ) -> Optional[ResolvablePipeItem]: + """Block until the next future is done and return the result. Returns None if no futures done. + + Args: + use_configured_timeout: If True, use the value of `self.poll_interval` as the max wait time, + raises `concurrent.futures.TimeoutError` if no future is done within that time. + + Returns: + The resolved future item or None if no future is done. + """ if not self.futures: return None if (future := self._next_done_future()) is not None: # When there are multiple already done futures from the same pipe we return results in insertion order return self._resolve_future(future) - for future in as_completed(self.futures): + for future in as_completed( + self.futures, timeout=self.poll_interval if use_configured_timeout else None + ): if future.cancelled(): # Get the next not-cancelled future continue @@ -193,7 +206,7 @@ def resolve_next_future_no_wait(self) -> Optional[ResolvablePipeItem]: This does not block and returns None if no future is done. 
""" # Get next done future - future = next((fut for fut in self.futures if fut.done() and not fut.cancelled()), None) + future = self._next_done_future() if not future: return None diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 4f47f0e7d0..650872dad6 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -35,7 +35,7 @@ from dlt.extract.pipe import Pipe from dlt.extract.typing import DataItemWithMeta, TItemFuture from dlt.extract.utils import wrap_async_iterator -from dlt.extract.concurrency import FuturesPool +from dlt.extract.concurrency import FuturesPool, FutureTimeoutError from dlt.extract.items import PipeItem, ResolvablePipeItem, SourcePipeItem @@ -153,8 +153,13 @@ def __next__(self) -> PipeItem: pipe_item = self._get_source_item() if pipe_item is None: - # Block until a future is resolved - pipe_item = self._futures_pool.resolve_next_future() + # Wait for some time for futures to resolve + try: + pipe_item = self._futures_pool.resolve_next_future( + use_configured_timeout=True + ) + except FutureTimeoutError: + pass if pipe_item is None: if self._futures_pool.empty and len(self._sources) == 0: @@ -250,6 +255,9 @@ def _get_source_item(self) -> ResolvablePipeItem: else: self._current_source_index = (self._current_source_index - 1) % sources_count while True: + # if we have checked all sources once and all returned None, then we can sleep a bit + if self._current_source_index == first_evaluated_index: + return None # get next item from the current source gen, step, pipe, meta = self._sources[self._current_source_index] set_current_pipe_name(pipe.name) @@ -272,10 +280,6 @@ def _get_source_item(self) -> ResolvablePipeItem: # Send callables to the worker pool right away, collect futures from multiple sources in one iteration self._futures_pool.submit(pipe_item, block=True) pipe_item = None - if len(self._futures_pool) >= sources_count: - # Return here so we're not collecting done futures forever - return 
None - # Otherwhise we continue to the next source if pipe_item is not None: return pipe_item @@ -305,13 +309,16 @@ def close(self) -> None: def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: loop.stop() + # Close the futures pool and cancel all tasks + # It's important to do this before closing generators as we can't close a running generator + self._futures_pool.close() + # close all generators for gen, _, _, _ in self._sources: if inspect.isgenerator(gen): gen.close() - self._sources.clear() - self._futures_pool.close() + self._sources.clear() def __enter__(self) -> "PipeIterator": return self diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index d5151670b5..5cba25aef4 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -903,3 +903,15 @@ async def some_data(): # From list with pytest.raises(InvalidParallelResourceDataType): dlt.resource([1, 2, 3], name="T", parallelized=True) + + # Test that inner generator is closed when wrapper is closed + gen_orig = some_gen() + resource = dlt.resource(gen_orig, parallelized=True) + gen = resource._pipe.gen + + next(gen) # type: ignore + gen.close() # type: ignore + + with pytest.raises(StopIteration): + # Inner generator is also closed + next(gen_orig) diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index 018eb1e666..5a85c06462 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -5,6 +5,7 @@ from itertools import product import dlt, asyncio, pytest, os +from dlt.extract.exceptions import ResourceExtractionError def test_async_iterator_resource() -> None: @@ -458,3 +459,34 @@ def some_source(): assert len(threads) > 1 and threading.get_ident() not in threads assert set(result) == {1, 2, 3, 4, 5, -1, -2, -3, -4, -5} + + +def test_parallelized_resource_exception_pool_is_closed() -> None: + """Checking that futures pool is closed 
before generators are closed when a parallel resource raises. + For now just checking that we don't get any "generator is already closed" errors, as would happen + when futures aren't cancelled before closing generators. + """ + + def some_data(): + for i in range(1, 6): + time.sleep(0.1) + yield i + + def some_data2(): + for i in range(1, 6): + time.sleep(0.005) + yield i + if i == 3: + raise RuntimeError("we have failed") + + @dlt.source + def some_source(): + yield dlt.resource(some_data, parallelized=True, name="some_data") + yield dlt.resource(some_data2, parallelized=True, name="some_data2") + + source = some_source() + + with pytest.raises(ResourceExtractionError) as einfo: + list(source) + + assert "we have failed" in str(einfo.value) From 38f7069844fe93300d49c19f40c5cc29a83860bc Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 22 Feb 2024 18:37:02 -0500 Subject: [PATCH 19/27] Always block when submitting futures, remove redundant submit future --- dlt/extract/concurrency.py | 28 ++++++++-------------------- dlt/extract/pipe_iterator.py | 17 +++++------------ 2 files changed, 13 insertions(+), 32 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index ca04106a93..84157e801b 100644 --- a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -10,7 +10,7 @@ TimeoutError as FutureTimeoutError, ) from threading import Thread -from typing import List, Awaitable, Callable, Any, Dict, Set, Optional, overload, Literal, Union +from typing import List, Awaitable, Callable, Any, Dict, Set, Optional, Literal, Union from dlt.common.exceptions import PipelineException from dlt.common.configuration.container import Container @@ -91,36 +91,24 @@ def _vacate_slot(self, _: TItemFuture) -> None: # Used as callback to free up slot when future is done self.used_slots -= 1 - @overload - def submit( - self, pipe_item: ResolvablePipeItem, block: Literal[False] - ) -> Optional[TItemFuture]: ... 
- - @overload - def submit(self, pipe_item: ResolvablePipeItem, block: Literal[True]) -> TItemFuture: ... - - def submit(self, pipe_item: ResolvablePipeItem, block: bool = False) -> Optional[TItemFuture]: + def submit(self, pipe_item: ResolvablePipeItem) -> TItemFuture: """Submit an item to the pool. Args: - pipe_item: The item to submit. - block: If True, block until there's a free slot in the pool. + pipe_item: The pipe item to submit. `pipe_item.item` must be either an asyncio coroutine or a callable. Returns: - The future if the item was successfully submitted, otherwise None. + The resulting future object """ # Sanity check, negative free slots means there's a bug somewhere assert self.free_slots >= 0, "Worker pool has negative free slots, this should never happen" if self.free_slots == 0: - if block: - # Wait until some future is completed to ensure there's a free slot - # Note: This is probably not thread safe. If ever multiple threads will be submitting - # jobs to the pool, we ned to change this whole method to be inside a `threading.Lock` - self._wait_for_free_slot() - else: - return None + # Wait until some future is completed to ensure there's a free slot + # Note: This is probably not thread safe. 
If ever multiple threads will be submitting + # jobs to the pool, we ned to change this whole method to be inside a `threading.Lock` + self._wait_for_free_slot() future: Optional[TItemFuture] = None diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 650872dad6..27c341c4de 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -189,8 +189,8 @@ def __next__(self) -> PipeItem: continue if isinstance(item, Awaitable) or callable(item): - # Callables that come from following pipe steps need to be added to the pool - self._futures_pool.submit(pipe_item, block=True) # type: ignore[call-overload] + # Callables are submitted to the pool to be executed in the background + self._futures_pool.submit(pipe_item) # type: ignore[arg-type] pipe_item = None # Future will be resolved later, move on to the next item continue @@ -255,7 +255,7 @@ def _get_source_item(self) -> ResolvablePipeItem: else: self._current_source_index = (self._current_source_index - 1) % sources_count while True: - # if we have checked all sources once and all returned None, then we can sleep a bit + # if we have checked all sources once and all returned None, return and poll/resolve some futures if self._current_source_index == first_evaluated_index: return None # get next item from the current source @@ -270,16 +270,9 @@ def _get_source_item(self) -> ResolvablePipeItem: if not isinstance(pipe_item, ResolvablePipeItem): # keep the item assigned step and pipe when creating resolvable item if isinstance(pipe_item, DataItemWithMeta): - pipe_item = ResolvablePipeItem( - pipe_item.data, step, pipe, pipe_item.meta - ) + return ResolvablePipeItem(pipe_item.data, step, pipe, pipe_item.meta) else: - pipe_item = ResolvablePipeItem(pipe_item, step, pipe, meta) - - if isinstance(pipe_item.item, Awaitable) or callable(pipe_item.item): - # Send callables to the worker pool right away, collect futures from multiple sources in one iteration - 
self._futures_pool.submit(pipe_item, block=True) - pipe_item = None + return ResolvablePipeItem(pipe_item, step, pipe, meta) if pipe_item is not None: return pipe_item From 96dcfe30536390819f2961a01967b5b9fa3f358f Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Fri, 23 Feb 2024 12:29:36 -0500 Subject: [PATCH 20/27] Revert limit async gen --- dlt/extract/resource.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 27c0f7c2b4..5fb5b795b3 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -316,12 +316,14 @@ def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 def _gen_wrap(gen: TPipeStep) -> TPipeStep: """Wrap a generator to take the first `max_items` records""" count = 0 + is_async_gen = False if inspect.isfunction(gen): gen = gen() # wrap async gen already here if isinstance(gen, AsyncIterator): gen = wrap_async_iterator(gen) + is_async_gen = True try: for i in gen: # type: ignore # TODO: help me fix this later @@ -331,7 +333,7 @@ def _gen_wrap(gen: TPipeStep) -> TPipeStep: # async gen yields awaitable so we must count one awaitable more # so the previous one is evaluated and yielded. # new awaitable will be cancelled - if count == max_items: + if count == max_items + int(is_async_gen): return finally: if inspect.isgenerator(gen): From b7acf93d21863e9264b6045d0d11994bfb95a8ad Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Fri, 23 Feb 2024 13:09:53 -0500 Subject: [PATCH 21/27] Always check done futures --- dlt/extract/pipe_iterator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 27c341c4de..feff91e27b 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -149,8 +149,12 @@ def __next__(self) -> PipeItem: while True: # do we need new item? 
if pipe_item is None: - # if none then take element from the newest source - pipe_item = self._get_source_item() + # Always check for done futures to avoid starving the pool + pipe_item = self._futures_pool.resolve_next_future_no_wait() + + if pipe_item is None: + # if none then take element from the newest source + pipe_item = self._get_source_item() if pipe_item is None: # Wait for some time for futures to resolve @@ -299,9 +303,6 @@ def close(self) -> None: # unregister the pipe name right after execution of gen stopped unset_current_pipe_name() - def stop_background_loop(loop: asyncio.AbstractEventLoop) -> None: - loop.stop() - # Close the futures pool and cancel all tasks # It's important to do this before closing generators as we can't close a running generator self._futures_pool.close() From 0882987426859ed16d45cd71daf705533cd32495 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Fri, 23 Feb 2024 14:46:43 -0500 Subject: [PATCH 22/27] Fix type error --- dlt/extract/resource.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 5fb5b795b3..28a071729e 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -352,7 +352,10 @@ def parallelize(self) -> "DltResource": """ if ( not inspect.isgenerator(self._pipe.gen) - and not inspect.isgeneratorfunction(inspect.unwrap(self._pipe.gen)) + and not ( + callable(self._pipe.gen) + and inspect.isgeneratorfunction(inspect.unwrap(self._pipe.gen)) + ) and not (callable(self._pipe.gen) and self.is_transformer) ): raise InvalidParallelResourceDataType(self.name, self._pipe.gen, type(self._pipe.gen)) From acb3b45863d091a79fed6dde42ebe937d3b7aba8 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Fri, 23 Feb 2024 15:13:10 -0500 Subject: [PATCH 23/27] Typing --- dlt/common/typing.py | 5 ++++- dlt/extract/resource.py | 2 +- dlt/extract/utils.py | 14 ++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git 
a/dlt/common/typing.py b/dlt/common/typing.py index 0682be5062..7c20b0df43 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -25,6 +25,7 @@ runtime_checkable, IO, Iterator, + Generator, ) from typing_extensions import ( @@ -70,7 +71,9 @@ AnyFun: TypeAlias = Callable[..., Any] TFun = TypeVar("TFun", bound=AnyFun) # any function TAny = TypeVar("TAny", bound=Any) -TAnyFunOrIterator = TypeVar("TAnyFunOrIterator", AnyFun, Iterator[Any]) +TAnyFunOrGenerator = TypeVar( + "TAnyFunOrGenerator", AnyFun, Generator[Any, Optional[Any], Optional[Any]] +) TAnyClass = TypeVar("TAnyClass", bound=object) TimedeltaSeconds = Union[int, float, timedelta] # represent secret value ie. coming from Kubernetes/Docker secrets or other providers diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 28a071729e..f73ee676b6 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -360,7 +360,7 @@ def parallelize(self) -> "DltResource": ): raise InvalidParallelResourceDataType(self.name, self._pipe.gen, type(self._pipe.gen)) - self._pipe.replace_gen(wrap_parallel_iterator(self._pipe.gen)) + self._pipe.replace_gen(wrap_parallel_iterator(self._pipe.gen)) # type: ignore # TODO return self def add_step( diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 7d5bea5e28..f9a35cd9ae 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -21,7 +21,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.pipeline import reset_resource_state from dlt.common.schema.typing import TColumnNames, TAnySchemaColumns, TTableSchemaColumns -from dlt.common.typing import AnyFun, DictStrAny, TDataItem, TDataItems +from dlt.common.typing import AnyFun, DictStrAny, TDataItem, TDataItems, TAnyFunOrGenerator from dlt.common.utils import get_callable_name from dlt.extract.exceptions import ( @@ -178,14 +178,12 @@ async def run() -> TDataItems: exhausted = True -def wrap_parallel_iterator( - f: Union[Generator[TDataItems, Optional[Any], 
Optional[Any]], AnyFun] -) -> Union[Generator[TDataItems, Optional[Any], Optional[Any]], AnyFun]: +def wrap_parallel_iterator(f: TAnyFunOrGenerator) -> TAnyFunOrGenerator: """Wraps a generator for parallel extraction""" def _wrapper(*args: Any, **kwargs: Any) -> Generator[TDataItems, None, None]: is_generator = True - gen: Union[Generator[TDataItems, Optional[Any], Optional[Any]], AnyFun] + gen: TAnyFunOrGenerator if callable(f): if inspect.isgeneratorfunction(inspect.unwrap(f)): gen = f(*args, **kwargs) @@ -204,7 +202,7 @@ def _parallel_gen() -> TDataItems: nonlocal exhausted try: if is_generator: - return next(gen) # type: ignore[arg-type] + return next(gen) # type: ignore[call-overload] else: return gen(*args, **kwargs) # type: ignore[operator] except StopIteration: @@ -223,11 +221,11 @@ def _parallel_gen() -> TDataItems: yield _parallel_gen except GeneratorExit: if is_generator: - gen.close() # type: ignore[union-attr] + gen.close() # type: ignore[attr-defined] raise if callable(f): - return wraps(f)(_wrapper) + return wraps(f)(_wrapper) # type: ignore[arg-type] return _wrapper() From 6af5c1a931503db96fe37fe29acd5c05ec5a8f05 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Tue, 27 Feb 2024 22:07:34 +0100 Subject: [PATCH 24/27] adds additional sleep when futures pool is empty --- dlt/extract/concurrency.py | 9 ++------- dlt/extract/pipe_iterator.py | 11 +++++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/dlt/extract/concurrency.py b/dlt/extract/concurrency.py index 84157e801b..79c518b697 100644 --- a/dlt/extract/concurrency.py +++ b/dlt/extract/concurrency.py @@ -1,16 +1,11 @@ import asyncio -from uuid import uuid4 -from asyncio import Future -from contextlib import contextmanager from concurrent.futures import ( ThreadPoolExecutor, as_completed, wait as wait_for_futures, - FIRST_COMPLETED, - TimeoutError as FutureTimeoutError, ) from threading import Thread -from typing import List, Awaitable, Callable, Any, Dict, Set, Optional, 
Literal, Union +from typing import Awaitable, Dict, Optional from dlt.common.exceptions import PipelineException from dlt.common.configuration.container import Container @@ -131,7 +126,7 @@ def submit(self, pipe_item: ResolvablePipeItem) -> TItemFuture: ) return future - def poll(self) -> None: + def sleep(self) -> None: sleep(self.poll_interval) def _resolve_future(self, future: TItemFuture) -> Optional[ResolvablePipeItem]: diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index feff91e27b..b23a9ae7e4 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -1,10 +1,8 @@ import inspect import types -import asyncio from typing import ( AsyncIterator, Dict, - Optional, Sequence, Union, Iterator, @@ -14,6 +12,7 @@ Type, Literal, ) +from concurrent.futures import TimeoutError as FutureTimeoutError from dlt.common.configuration import configspec from dlt.common.configuration.inject import with_config @@ -33,9 +32,9 @@ ResourceExtractionError, ) from dlt.extract.pipe import Pipe -from dlt.extract.typing import DataItemWithMeta, TItemFuture +from dlt.extract.typing import DataItemWithMeta from dlt.extract.utils import wrap_async_iterator -from dlt.extract.concurrency import FuturesPool, FutureTimeoutError +from dlt.extract.concurrency import FuturesPool from dlt.extract.items import PipeItem, ResolvablePipeItem, SourcePipeItem @@ -164,6 +163,10 @@ def __next__(self) -> PipeItem: ) except FutureTimeoutError: pass + else: + if pipe_item is None: + # pool was empty - then do a regular poll sleep + self._futures_pool.sleep() if pipe_item is None: if self._futures_pool.empty and len(self._sources) == 0: From 19e09cb9f2508ea81c5d395c126d7bcc8985fca1 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Wed, 28 Feb 2024 19:49:24 -0500 Subject: [PATCH 25/27] Update docs and snippets --- docs/examples/chess_production/chess.py | 19 ++-- .../chess_production/code/chess-snippets.py | 7 +- .../docs/examples/chess_production/index.md | 
11 +- .../transformers/code/pokemon-snippets.py | 15 ++- .../docs/examples/transformers/index.md | 8 +- docs/website/docs/general-usage/resource.md | 27 +++++ docs/website/docs/reference/performance.md | 105 +++++++++++------- .../performance-snippets.py | 76 +++++++------ 8 files changed, 164 insertions(+), 104 deletions(-) diff --git a/docs/examples/chess_production/chess.py b/docs/examples/chess_production/chess.py index 2e85805781..e2d0b9c10d 100644 --- a/docs/examples/chess_production/chess.py +++ b/docs/examples/chess_production/chess.py @@ -6,6 +6,7 @@ from dlt.common.typing import StrAny, TDataItems from dlt.sources.helpers.requests import client + @dlt.source def chess( chess_url: str = dlt.config.value, @@ -25,11 +26,8 @@ def players() -> Iterator[TDataItems]: yield from _get_data_with_retry(f"titled/{title}")["players"][:max_players] # this resource takes data from players and returns profiles - # it uses `defer` decorator to enable parallel run in thread pool. - # defer requires return at the end so we convert yield into return (we return one item anyway) - # you can still have yielding transformers, look for the test named `test_evolve_schema` - @dlt.transformer(data_from=players, write_disposition="replace") - @dlt.defer + # it uses `parallelized` flag to enable parallel run in thread pool. 
+ @dlt.transformer(data_from=players, write_disposition="replace", parallelized=True) def players_profiles(username: Any) -> TDataItems: print(f"getting {username} profile via thread {threading.current_thread().name}") sleep(1) # add some latency to show parallel runs @@ -59,6 +57,7 @@ def players_games(username: Any) -> Iterator[TDataItems]: MAX_PLAYERS = 5 + def load_data_with_retry(pipeline, data): try: for attempt in Retrying( @@ -68,9 +67,7 @@ def load_data_with_retry(pipeline, data): reraise=True, ): with attempt: - logger.info( - f"Running the pipeline, attempt={attempt.retry_state.attempt_number}" - ) + logger.info(f"Running the pipeline, attempt={attempt.retry_state.attempt_number}") load_info = pipeline.run(data) logger.info(str(load_info)) @@ -92,9 +89,7 @@ def load_data_with_retry(pipeline, data): # print the information on the first load package and all jobs inside logger.info(f"First load package info: {load_info.load_packages[0]}") # print the information on the first completed job in first load package - logger.info( - f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}" - ) + logger.info(f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}") # check for schema updates: schema_updates = [p.schema_update for p in load_info.load_packages] @@ -152,4 +147,4 @@ def load_data_with_retry(pipeline, data): ) # get data for a few famous players data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS) - load_data_with_retry(pipeline, data) \ No newline at end of file + load_data_with_retry(pipeline, data) diff --git a/docs/website/docs/examples/chess_production/code/chess-snippets.py b/docs/website/docs/examples/chess_production/code/chess-snippets.py index b22dc3693b..39ddf14836 100644 --- a/docs/website/docs/examples/chess_production/code/chess-snippets.py +++ b/docs/website/docs/examples/chess_production/code/chess-snippets.py @@ -32,11 +32,8 @@ def players() -> 
Iterator[TDataItems]: yield from _get_data_with_retry(f"titled/{title}")["players"][:max_players] # this resource takes data from players and returns profiles - # it uses `defer` decorator to enable parallel run in thread pool. - # defer requires return at the end so we convert yield into return (we return one item anyway) - # you can still have yielding transformers, look for the test named `test_evolve_schema` - @dlt.transformer(data_from=players, write_disposition="replace") - @dlt.defer + # it uses `parallelized` flag to enable parallel run in thread pool. + @dlt.transformer(data_from=players, write_disposition="replace", parallelized=True) def players_profiles(username: Any) -> TDataItems: print(f"getting {username} profile via thread {threading.current_thread().name}") sleep(1) # add some latency to show parallel runs diff --git a/docs/website/docs/examples/chess_production/index.md b/docs/website/docs/examples/chess_production/index.md index f372d26d80..d0f87df7d2 100644 --- a/docs/website/docs/examples/chess_production/index.md +++ b/docs/website/docs/examples/chess_production/index.md @@ -27,7 +27,7 @@ We'll learn how to: ### Init chess source -```py +```python import threading from typing import Any, Iterator @@ -55,11 +55,8 @@ def chess( yield from _get_data_with_retry(f"titled/{title}")["players"][:max_players] # this resource takes data from players and returns profiles - # it uses `defer` decorator to enable parallel run in thread pool. - # defer requires return at the end so we convert yield into return (we return one item anyway) - # you can still have yielding transformers, look for the test named `test_evolve_schema` - @dlt.transformer(data_from=players, write_disposition="replace") - @dlt.defer + # it uses `parallelized` flag to enable parallel run in thread pool. 
+ @dlt.transformer(data_from=players, write_disposition="replace", parallelized=True) def players_profiles(username: Any) -> TDataItems: print(f"getting {username} profile via thread {threading.current_thread().name}") sleep(1) # add some latency to show parallel runs @@ -195,4 +192,4 @@ if __name__ == "__main__": data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS) load_data_with_retry(pipeline, data) ``` - \ No newline at end of file + diff --git a/docs/website/docs/examples/transformers/code/pokemon-snippets.py b/docs/website/docs/examples/transformers/code/pokemon-snippets.py index d8fe4f41ba..ff8757b94e 100644 --- a/docs/website/docs/examples/transformers/code/pokemon-snippets.py +++ b/docs/website/docs/examples/transformers/code/pokemon-snippets.py @@ -30,15 +30,13 @@ def _get_pokemon(_pokemon): # a special case where just one item is retrieved in transformer # a whole transformer may be marked for parallel execution - @dlt.transformer - @dlt.defer + @dlt.transformer(parallelized=True) def species(pokemon_details): """Yields species details for a pokemon""" species_data = requests.get(pokemon_details["species"]["url"]).json() # link back to pokemon so we have a relation in loaded data species_data["pokemon_id"] = pokemon_details["id"] - # just return the results, if you yield, - # generator will be evaluated in main thread + # You can return the result instead of yield since the transformer only generates one result return species_data # create two simple pipelines with | operator @@ -48,7 +46,6 @@ def species(pokemon_details): return (pokemon_list | pokemon, pokemon_list | pokemon | species) - __name__ = "__main__" # @@@DLT_REMOVE if __name__ == "__main__": # build duck db pipeline pipeline = dlt.pipeline( @@ -60,6 +57,14 @@ def species(pokemon_details): print(load_info) # @@@DLT_SNIPPET_END example + # Run without __main__ + pipeline = dlt.pipeline( + pipeline_name="pokemon", destination="duckdb", dataset_name="pokemon_data" + ) + + 
# the pokemon_list resource does not need to be loaded + load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) + # test assertions row_counts = pipeline.last_trace.last_normalize_info.row_counts assert row_counts["pokemon"] == 20 diff --git a/docs/website/docs/examples/transformers/index.md b/docs/website/docs/examples/transformers/index.md index 860e830aae..056ba429b4 100644 --- a/docs/website/docs/examples/transformers/index.md +++ b/docs/website/docs/examples/transformers/index.md @@ -29,7 +29,7 @@ We'll learn how to: ### Loading code -```py +```python import dlt from dlt.sources.helpers import requests @@ -60,15 +60,13 @@ def source(pokemon_api_url: str): # a special case where just one item is retrieved in transformer # a whole transformer may be marked for parallel execution - @dlt.transformer - @dlt.defer + @dlt.transformer(parallelized=True) def species(pokemon_details): """Yields species details for a pokemon""" species_data = requests.get(pokemon_details["species"]["url"]).json() # link back to pokemon so we have a relation in loaded data species_data["pokemon_id"] = pokemon_details["id"] - # just return the results, if you yield, - # generator will be evaluated in main thread + # You can return the result instead of yield since the transformer only generates one result return species_data # create two simple pipelines with | operator diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index b7026c454e..2d7b2264fc 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -265,6 +265,33 @@ kinesis_stream = kinesis("telemetry_stream") ``` `kinesis_stream` resource has a name **telemetry_stream** + +### Declare parallel and async resources +You can extract multiple resources in parallel or with async IO. 
+To enable this for a sync resource you can set the `parallelized` flag to `True` in the resource decorator: + + +```python +@dlt.resource(parallelized=True) +def get_users(): + for u in _get_users(): + yield u + +@dlt.resource(parallelized=True) +def get_orders(): + for o in _get_orders(): + yield o +``` + +Async generators are automatically extracted concurrently with other resources: + +```python +@dlt.resource +async def get_users(): + async for u in _get_users(): # Assuming _get_users is an async generator + yield u +``` + ## Customize resources ### Filter, transform and pivot data diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index af2d791324..fc712d6ce2 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -12,7 +12,7 @@ If you can, yield pages when producing data. This makes some processes more effe the necessary function calls (each chunk of data that you yield goes through the extract pipeline once so if you yield a chunk of 10.000 items you will gain significant savings) For example: -```py +```python import dlt def get_rows(limit): @@ -27,7 +27,7 @@ def database_cursor(): can be replaced with: -```py +```python from itertools import islice @dlt.resource @@ -141,32 +141,66 @@ You can create pipelines that extract, normalize and load data in parallel. ### Extract You can extract data concurrently if you write your pipelines to yield callables or awaitables or use async generators for your resources that can be then evaluated in a thread or futures pool respectively. -The example below simulates a typical situation where a dlt resource is used to fetch a page of items and then details of individual items are fetched separately in the transformer. The `@dlt.defer` decorator wraps the `get_details` function in another callable that will be executed in the thread pool. 
+This is easily accomplished by using the `parallelized` argument in the resource decorator. +Resources based on sync generators will execute each step (yield) of the generator in a thread pool, so each individual resource is still extracted one item at a time but multiple such resources can run in parallel with each other. + +Consider an example source which consists of 2 resources fetching pages of items from different API endpoints, and each of those resources is piped to transformers to fetch complete data items respectively. + +The `parallelized=True` argument wraps the resources in a generator that yields callables to evaluate each generator step. These callables are executed in the thread pool. Transformers that are not generators (as shown in the example) are internally wrapped in a generator that yields once. + -```py +```python import dlt -from time import sleep +import time from threading import currentThread -@dlt.resource -def list_items(start, limit): - yield from range(start, start + limit) - -@dlt.transformer -@dlt.defer -def get_details(item_id): - # simulate a slow REST API where you wait 0.3 sec for each item - sleep(0.3) - print(f"item_id {item_id} in thread {currentThread().name}") - # just return the results, if you yield, generator will be evaluated in main thread - return {"row": item_id} +@dlt.resource(parallelized=True) +def list_users(n_users): + for i in range(1, 1 + n_users): + # Simulate network delay of a rest API call fetching a page of items + if i % 10 == 0: + time.sleep(0.1) + yield i + +@dlt.transformer(parallelized=True) +def get_user_details(user_id): + # Transformer that fetches details for users in a page + time.sleep(0.1) # Simulate latency of a rest API call + print(f"user_id {user_id} in thread {currentThread().name}") + return {"entity": "user", "id": user_id} + +@dlt.resource(parallelized=True) +def list_products(n_products): + for i in range(1, 1 + n_products): + if i % 10 == 0: + time.sleep(0.1) + yield i + 
+@dlt.transformer(parallelized=True) +def get_product_details(product_id): + time.sleep(0.1) + print(f"product_id {product_id} in thread {currentThread().name}") + return {"entity": "product", "id": product_id} + +@dlt.source +def api_data(): + return [ + list_users(24) | get_user_details, + list_products(32) | get_product_details, + ] # evaluate the pipeline and print all the items -# resources are iterators and they are evaluated in the same way in the pipeline.run -print(list(list_items(0, 10) | get_details)) +# sources are iterators and they are evaluated in the same way in the pipeline.run +print(list(api_data())) ``` +The `parallelized` flag in the `resource` and `transformer` decorators is supported for: + +* Generator functions (as shown in the example) +* Generators without functions (e.g. `dlt.resource(name='some_data', parallelized=True)(iter(range(100)))`) +* `dlt.transformer` decorated functions. These can be either generator functions or regular functions that return one value + You can control the number of workers in the thread pool with **workers** setting. The default number of workers is **5**. Below you see a few ways to do that with different granularity ```toml @@ -185,9 +219,10 @@ workers=4 -The example below does the same but using an async generator as the main resource and async/await and futures pool for the transformer: +The example below does the same but using an async generator as the main resource and async/await and futures pool for the transformer. +The `parallelized` flag is not supported or needed for async generators, these are wrapped and evaluated concurrently by default: -```py +```python import asyncio @dlt.resource @@ -314,7 +349,7 @@ workers=11 -```py +```python import os import dlt from itertools import islice @@ -388,32 +423,24 @@ You can run several pipeline instances in parallel from a single process by plac separate threads. 
The most straightforward way is to use `ThreadPoolExecutor` and `asyncio` to execute pipeline methods. -```py +```python import asyncio import dlt from time import sleep from concurrent.futures import ThreadPoolExecutor -# create both futures and thread parallel resources - -def async_table(): - async def _gen(idx): - await asyncio.sleep(0.1) - return {"async_gen": idx} - - # just yield futures in a loop +# create both asyncio and thread parallel resources +@dlt.resource +async def async_table(): for idx_ in range(10): - yield _gen(idx_) + await asyncio.sleep(0.1) + yield {"async_gen": idx_} +@dlt.resource(parallelized=True) def defer_table(): - @dlt.defer - def _gen(idx): - sleep(0.1) - return {"thread_gen": idx} - - # just yield futures in a loop for idx_ in range(5): - yield _gen(idx_) + sleep(0.1) + yield idx_ def _run_pipeline(pipeline, gen_): # run the pipeline in a thread, also instantiate generators here! diff --git a/docs/website/docs/reference/performance_snippets/performance-snippets.py b/docs/website/docs/reference/performance_snippets/performance-snippets.py index a2ebd102a6..b8c9f0039d 100644 --- a/docs/website/docs/reference/performance_snippets/performance-snippets.py +++ b/docs/website/docs/reference/performance_snippets/performance-snippets.py @@ -44,25 +44,47 @@ def read_table(limit): def parallel_extract_callables_snippet() -> None: # @@@DLT_SNIPPET_START parallel_extract_callables import dlt - from time import sleep + import time from threading import currentThread - @dlt.resource - def list_items(start, limit): - yield from range(start, start + limit) - - @dlt.transformer - @dlt.defer - def get_details(item_id): - # simulate a slow REST API where you wait 0.3 sec for each item - sleep(0.3) - print(f"item_id {item_id} in thread {currentThread().name}") - # just return the results, if you yield, generator will be evaluated in main thread - return {"row": item_id} + @dlt.resource(parallelized=True) + def list_users(n_users): + for i in range(1, 1 
+ n_users): + # Simulate network delay of a rest API call fetching a page of items + if i % 10 == 0: + time.sleep(0.1) + yield i + + @dlt.transformer(parallelized=True) + def get_user_details(user_id): + # Transformer that fetches details for users in a page + time.sleep(0.1) # Simulate latency of a rest API call + print(f"user_id {user_id} in thread {currentThread().name}") + return {"entity": "user", "id": user_id} + + @dlt.resource(parallelized=True) + def list_products(n_products): + for i in range(1, 1 + n_products): + if i % 10 == 0: + time.sleep(0.1) + yield i + + @dlt.transformer(parallelized=True) + def get_product_details(product_id): + time.sleep(0.1) + print(f"product_id {product_id} in thread {currentThread().name}") + return {"entity": "product", "id": product_id} + + @dlt.source + def api_data(): + return [ + list_users(24) | get_user_details, + list_products(32) | get_product_details, + ] # evaluate the pipeline and print all the items - # resources are iterators and they are evaluated in the same way in the pipeline.run - print(list(list_items(0, 10) | get_details)) + # sources are iterators and they are evaluated in the same way in the pipeline.run + print(list(api_data())) # @@@DLT_SNIPPET_END parallel_extract_callables # @@@DLT_SNIPPET_START parallel_extract_awaitables @@ -127,26 +149,18 @@ def parallel_pipelines_asyncio_snippet() -> None: from time import sleep from concurrent.futures import ThreadPoolExecutor - # create both futures and thread parallel resources - - def async_table(): - async def _gen(idx): - await asyncio.sleep(0.1) - return {"async_gen": idx} - - # just yield futures in a loop + # create both asyncio and thread parallel resources + @dlt.resource + async def async_table(): for idx_ in range(10): - yield _gen(idx_) + await asyncio.sleep(0.1) + yield {"async_gen": idx_} + @dlt.resource(parallelized=True) def defer_table(): - @dlt.defer - def _gen(idx): - sleep(0.1) - return {"thread_gen": idx} - - # just yield futures in a loop 
for idx_ in range(5): - yield _gen(idx_) + sleep(0.1) + yield idx_ def _run_pipeline(pipeline, gen_): # run the pipeline in a thread, also instantiate generators here! From 85b7f62dbd0fe3ea8bce586dbec61ed26408dc7d Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 29 Feb 2024 22:11:19 +0100 Subject: [PATCH 26/27] small docs fixes --- .../examples/incremental_loading/code/zendesk-snippets.py | 2 +- docs/website/docs/general-usage/resource.md | 7 ++++++- docs/website/docs/reference/performance.md | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index 49893fe74e..05ea18cb9e 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -140,4 +140,4 @@ def get_pages( # check that stuff was loaded row_counts = pipeline.last_trace.last_normalize_info.row_counts - assert row_counts["ticket_events"] == 16 + assert row_counts["ticket_events"] == 17 diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 2d7b2264fc..318f330849 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -267,7 +267,7 @@ kinesis_stream = kinesis("telemetry_stream") ### Declare parallel and async resources -You can extract multiple resources in parallel or with async IO. +You can extract multiple resources in parallel threads or with async IO. 
To enable this for a sync resource you can set the `parallelized` flag to `True` in the resource decorator: @@ -281,6 +281,9 @@ def get_users(): def get_orders(): for o in _get_orders(): yield o + +# users and orders will be iterated in parallel in two separate threads +pipeline.run([get_users(), get_orders()]) ``` Async generators are automatically extracted concurrently with other resources: @@ -292,6 +295,8 @@ async def get_users(): yield u ``` +Please find more details in [extract performance](../reference/performance.md#extract) + ## Customize resources ### Filter, transform and pivot data diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index fc712d6ce2..42b042324b 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -269,7 +269,8 @@ of callables to be evaluated in a thread pool with a size of 5. This limit will ::: :::caution -Generators and iterators are always evaluated in the main thread. If you have a loop that yields items, instead yield functions or async functions that will create the items when evaluated in the pool. +Generators and iterators are always evaluated in a single thread: item by item. If you have a loop that yields items that you want to evaluate +in parallel, instead yield functions or async functions that will be evaluated in separate threads or in the async pool. 
::: ### Normalize From d376cb5f8b40eea22f8fc2be81377ed67372cac9 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 29 Feb 2024 23:26:13 +0100 Subject: [PATCH 27/27] logs daemon signals message instead of printing --- dlt/common/runtime/signals.py | 6 +++++- .../reference/performance_snippets/performance-snippets.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dlt/common/runtime/signals.py b/dlt/common/runtime/signals.py index 2a5cc75135..8e64c8ba64 100644 --- a/dlt/common/runtime/signals.py +++ b/dlt/common/runtime/signals.py @@ -64,5 +64,9 @@ def delayed_signals() -> Iterator[None]: signal.signal(signal.SIGINT, original_sigint_handler) signal.signal(signal.SIGTERM, original_sigterm_handler) else: - print("Running in daemon thread, signals not enabled") + if not TYPE_CHECKING: + from dlt.common.runtime import logger + else: + logger: Any = None + logger.info("Running in daemon thread, signals not enabled") yield diff --git a/docs/website/docs/reference/performance_snippets/performance-snippets.py b/docs/website/docs/reference/performance_snippets/performance-snippets.py index b8c9f0039d..68ec8ed72d 100644 --- a/docs/website/docs/reference/performance_snippets/performance-snippets.py +++ b/docs/website/docs/reference/performance_snippets/performance-snippets.py @@ -187,9 +187,9 @@ async def _run_async(): asyncio.run(_run_async()) # activate pipelines before they are used pipeline_1.activate() - # assert load_data_table_counts(pipeline_1) == {"async_table": 10} + assert pipeline_1.last_trace.last_normalize_info.row_counts["async_table"] == 10 pipeline_2.activate() - # assert load_data_table_counts(pipeline_2) == {"defer_table": 5} + assert pipeline_2.last_trace.last_normalize_info.row_counts["defer_table"] == 5 # @@@DLT_SNIPPET_END parallel_pipelines