Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable async generators as resources #905

Merged
merged 27 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
07fd59c
temp
sh-rp Jan 22, 2024
8c8a94f
enable nested generator and add tests
sh-rp Jan 23, 2024
a07e37f
remove temp files
sh-rp Jan 23, 2024
e4ca5c3
convert async iterable to list of awaitables
sh-rp Jan 23, 2024
b8396a6
temp
sh-rp Jan 24, 2024
79a42ed
update evaluation of round robin and fifo
sh-rp Jan 24, 2024
dbea27f
change limit behavior
sh-rp Jan 24, 2024
28a6a12
small fixes
sh-rp Jan 24, 2024
21c6db3
adds experiment for parallelizing regular resources
sh-rp Jan 24, 2024
a151428
fix linter
sh-rp Jan 24, 2024
0211d3d
test if creating the pool before iteration solves CI test problems
sh-rp Jan 24, 2024
c087ebf
make one test more predictable
sh-rp Jan 24, 2024
a7bf8e0
remove async pool fix
sh-rp Jan 24, 2024
3c047a2
remove locks from generator wrappers
sh-rp Jan 24, 2024
8d81d99
make test even more predictable
sh-rp Jan 24, 2024
435239d
pr fixes
sh-rp Jan 26, 2024
3787c63
fix async error test
sh-rp Jan 26, 2024
05d0c55
update evaluation order tests
sh-rp Jan 26, 2024
614b80b
adds sources at the end of pipe, closes generators before futures so …
rudolfix Jan 26, 2024
fb9c564
allows async generator items to be evaluated in add_limit
rudolfix Jan 26, 2024
4a61e60
fixes tests
rudolfix Jan 26, 2024
9446e29
update performance docs
sh-rp Jan 29, 2024
d830086
unrelated formatting fixes
sh-rp Jan 29, 2024
b844231
fix one test
sh-rp Jan 29, 2024
568a2ce
small change to resource page
sh-rp Jan 29, 2024
2a168e5
fixes tests
sh-rp Jan 30, 2024
8cf3c3c
change generator exit test
sh-rp Jan 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 45 additions & 24 deletions dlt/extract/pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
simulate_func_call,
wrap_compat_transformer,
wrap_resource_gen,
wrap_async_generator,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -321,6 +322,10 @@ def evaluate_gen(self) -> None:
# verify if transformer can be called
self._ensure_transform_step(self._gen_idx, gen)

# wrap async generator
if inspect.isasyncgen(self.gen):
self.replace_gen(wrap_async_generator(self.gen))

# evaluate transforms
for step_no, step in enumerate(self._steps):
# print(f"pipe {self.name} step no {step_no} step({step})")
Expand Down Expand Up @@ -366,9 +371,10 @@ def _wrap_gen(self, *args: Any, **kwargs: Any) -> Any:

def _verify_head_step(self, step: TPipeStep) -> None:
    """Validate that `step` can act as the head of a resource pipe.

    The head must be something the pipe can iterate or invoke: an Iterable,
    an Iterator, an AsyncIterator, or a Callable. Anything else raises
    CreatePipeException.
    """
    is_iterable = isinstance(step, (Iterable, Iterator, AsyncIterator))
    if not (is_iterable or callable(step)):
        raise CreatePipeException(
            self.name,
            "A head of a resource pipe must be Iterable, Iterator, AsyncIterator or a Callable",
        )

def _wrap_transform_step_meta(self, step_no: int, step: TPipeStep) -> TPipeStep:
Expand Down Expand Up @@ -619,6 +625,16 @@ def __next__(self) -> PipeItem:
pipe_item = None
continue

# handle async iterator items as new source
if inspect.isasyncgen(item):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm — maybe we should check for AsyncIterator instead? Not all async iterators are async generators.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah you are totally right, this is a leftover from something else, I updated it in the other places too.

self._sources.append(
SourcePipeItem(
wrap_async_generator(item), pipe_item.step, pipe_item.pipe, pipe_item.meta
)
)
pipe_item = None
continue

if isinstance(item, Awaitable) or callable(item):
# do we have a free slot or one of the slots is done?
if len(self._futures) < self.max_parallel_items or self._next_future() >= 0:
Expand Down Expand Up @@ -773,6 +789,8 @@ def _resolve_futures(self) -> ResolvablePipeItem:

if future.exception():
ex = future.exception()
if isinstance(ex, StopAsyncIteration):
return None
if isinstance(
ex, (PipelineException, ExtractorException, DltSourceException, PipeException)
):
Expand All @@ -791,6 +809,27 @@ def _get_source_item(self) -> ResolvablePipeItem:
elif self._next_item_mode == "round_robin":
return self._get_source_item_round_robin()

def _get_next_item_from_generator(
    self,
    gen: Any,
    step: int,
    pipe: Pipe,
    meta: Any,
) -> ResolvablePipeItem:
    """Pull the next value from `gen` and normalize it into a ResolvablePipeItem.

    A None value is returned unchanged so the caller can keep polling. A full
    ResolvablePipeItem passes through untouched — this is how the ForkPipe step
    redirects execution of an item to another pipe. Everything else is wrapped,
    keeping the step and pipe the item was assigned to; DataItemWithMeta
    contributes its own meta.
    """
    item: ResolvablePipeItem = next(gen)
    # pass through the poll marker and already-resolvable items as-is
    if item is None or isinstance(item, ResolvablePipeItem):
        return item
    if isinstance(item, DataItemWithMeta):
        return ResolvablePipeItem(item.data, step, pipe, item.meta)
    return ResolvablePipeItem(item, step, pipe, meta)

def _get_source_item_current(self) -> ResolvablePipeItem:
# no more sources to iterate
if len(self._sources) == 0:
Expand All @@ -803,17 +842,8 @@ def _get_source_item_current(self) -> ResolvablePipeItem:
set_current_pipe_name(pipe.name)
item = None
while item is None:
item = next(gen)
# full pipe item may be returned, this is used by ForkPipe step
# to redirect execution of an item to another pipe
if isinstance(item, ResolvablePipeItem):
return item
else:
# keep the item assigned step and pipe when creating resolvable item
if isinstance(item, DataItemWithMeta):
return ResolvablePipeItem(item.data, step, pipe, item.meta)
else:
return ResolvablePipeItem(item, step, pipe, meta)
item = self._get_next_item_from_generator(gen, step, pipe, meta)
return item
except StopIteration:
# remove empty iterator and try another source
self._sources.pop()
Expand All @@ -839,17 +869,8 @@ def _get_source_item_round_robin(self) -> ResolvablePipeItem:
self._round_robin_index = (self._round_robin_index + 1) % sources_count
gen, step, pipe, meta = self._sources[self._round_robin_index]
set_current_pipe_name(pipe.name)
item = next(gen)
# full pipe item may be returned, this is used by ForkPipe step
# to redirect execution of an item to another pipe
if isinstance(item, ResolvablePipeItem):
return item
else:
# keep the item assigned step and pipe when creating resolvable item
if isinstance(item, DataItemWithMeta):
return ResolvablePipeItem(item.data, step, pipe, item.meta)
else:
return ResolvablePipeItem(item, step, pipe, meta)
item = self._get_next_item_from_generator(gen, step, pipe, meta)
return item
except StopIteration:
# remove empty iterator and try another source
self._sources.pop(self._round_robin_index)
Expand Down
4 changes: 1 addition & 3 deletions dlt/extract/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,6 @@ def from_data(
data = wrap_additional_type(data)

# several iterable types are not allowed and must be excluded right away
if isinstance(data, (AsyncIterator, AsyncIterable)):
raise InvalidResourceDataTypeAsync(name, data, type(data))
if isinstance(data, (str, dict)):
raise InvalidResourceDataTypeBasic(name, data, type(data))

Expand All @@ -135,7 +133,7 @@ def from_data(
parent_pipe = DltResource._get_parent_pipe(name, data_from)

# create resource from iterator, iterable or generator function
if isinstance(data, (Iterable, Iterator)) or callable(data):
if isinstance(data, (Iterable, Iterator, AsyncIterable)) or callable(data):
pipe = Pipe.from_data(name, data, parent=parent_pipe)
return cls(
pipe,
Expand Down
33 changes: 30 additions & 3 deletions dlt/extract/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import inspect
import makefun
from typing import Optional, Tuple, Union, List, Any, Sequence, cast
from typing import Optional, Tuple, Union, List, Any, Sequence, cast, Iterator
from collections.abc import Mapping as C_Mapping

from dlt.common.exceptions import MissingDependencyException
Expand Down Expand Up @@ -119,6 +119,29 @@ def check_compat_transformer(name: str, f: AnyFun, sig: inspect.Signature) -> in
return meta_arg


def wrap_async_generator(gen: Any) -> Any:
    """Adapt an async generator into a sync generator of awaitables.

    Each awaitable yielded resolves exactly one item of `gen`. While a
    previously yielded awaitable is still in flight, None is yielded instead
    so the consumer can move on to other sources (this plays well with the
    round robin strategy when several async generators feed resources).
    Iteration ends once `gen` raises StopAsyncIteration.
    """
    is_running = False
    exhausted = False

    async def run() -> TDataItems:
        # resolve a single item; record exhaustion and always free the slot
        nonlocal is_running, exhausted
        try:
            return await gen.__anext__()
        except StopAsyncIteration:
            exhausted = True
            raise
        finally:
            is_running = False

    while not exhausted:
        # only one awaitable may be pending at a time; signal "not ready" with None
        # NOTE(review): this is a busy poll — a tight consumer loop will spin on
        # None until the pending awaitable settles; consider a sync primitive
        while is_running:
            yield None
        is_running = True
        yield run()


def wrap_compat_transformer(
name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any
) -> AnyFun:
Expand All @@ -142,8 +165,12 @@ def wrap_resource_gen(
name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any
) -> AnyFun:
"""Wraps a generator or generator function so it is evaluated on extraction"""
if inspect.isgeneratorfunction(inspect.unwrap(f)) or inspect.isgenerator(f):
# always wrap generators and generator functions. evaluate only at runtime!

if (
inspect.isgeneratorfunction(inspect.unwrap(f))
or inspect.isgenerator(f)
or inspect.isasyncgenfunction(f)
):

def _partial() -> Any:
# print(f"_PARTIAL: {args} {kwargs}")
sultaniman marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
128 changes: 116 additions & 12 deletions tests/pipeline/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -1631,29 +1631,133 @@ def api_fetch(page_num):
assert pipeline.last_trace.last_normalize_info.row_counts["product"] == 12


#
# async generators resource tests
#
def test_async_generator_resource() -> None:
    """Async generators run both as raw data and as @dlt.resource."""

    async def async_gen_table():
        for l_ in ["a", "b", "c"]:
            await asyncio.sleep(0.1)
            yield {"letter": l_}

    @dlt.resource
    async def async_gen_resource():
        for l_ in ["d", "e", "f"]:
            await asyncio.sleep(0.1)
            yield {"letter": l_}

    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True)

    # plain async generator passed directly as data
    pipeline_1.run(async_gen_table(), table_name="async")
    with pipeline_1.sql_client() as c:
        with c.execute_query("SELECT * FROM async") as cur:
            rows = list(cur.fetchall())
            assert [r[0] for r in rows] == ["a", "b", "c"]

    # async generator wrapped as a resource appends to the same table
    pipeline_1.run(async_gen_resource(), table_name="async")
    with pipeline_1.sql_client() as c:
        with c.execute_query("SELECT * FROM async") as cur:
            rows = list(cur.fetchall())
            assert [r[0] for r in rows] == ["a", "b", "c", "d", "e", "f"]


def test_async_generator_nested() -> None:
    """A sync generator that yields async generators: each one is evaluated
    as a new source and its items keep the index of the generator that
    produced them."""

    def async_inner_table():
        async def _gen(idx):
            for l_ in ["a", "b", "c"]:
                await asyncio.sleep(0.1)
                yield {"async_gen": idx, "letter": l_}

        # yield one async generator per index
        for idx_ in range(3):
            yield _gen(idx_)

    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True)
    pipeline_1.run(async_inner_table(), table_name="async")
    with pipeline_1.sql_client() as c:
        with c.execute_query("SELECT * FROM async") as cur:
            rows = list(cur.fetchall())
            assert [(r[0], r[1]) for r in rows] == [
                (0, "a"),
                (0, "b"),
                (0, "c"),
                (1, "a"),
                (1, "b"),
                (1, "c"),
                (2, "a"),
                (2, "b"),
                (2, "c"),
            ]


def test_async_generator_transformer() -> None:
    """An async transformer fed by an async resource transforms each item."""

    @dlt.resource
    async def async_resource():
        for l_ in ["a", "b", "c"]:
            await asyncio.sleep(0.1)
            yield {"letter": l_}

    @dlt.transformer(data_from=async_resource)
    async def async_transformer(item):
        await asyncio.sleep(0.1)
        yield {
            "letter": item["letter"] + "t",
        }

    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True)
    pipeline_1.run(async_transformer(), table_name="async")

    with pipeline_1.sql_client() as c:
        with c.execute_query("SELECT * FROM async") as cur:
            rows = list(cur.fetchall())
            assert len(rows) == 3
            assert {r[0] for r in rows} == {"at", "bt", "ct"}


@pytest.mark.parametrize("next_item_mode", ["fifo", "round_robin"])
def test_parallel_async_generators(next_item_mode: str) -> None:
    """Two async resources extracted together: rows from both must land, and
    the interleaving of item production must match the configured item mode.

    BUG FIX: the original final check was written as
    ``assert order == X if mode == "round_robin" else Y`` which Python parses
    as ``assert ((order == X) if cond else Y)`` — in the fifo branch it
    asserted a non-empty list literal, which is always truthy, so the fifo
    ordering was never actually verified. The expectation is now computed
    first and compared explicitly.
    """
    os.environ["EXTRACT__NEXT_ITEM_MODE"] = next_item_mode
    execution_order = []

    @dlt.resource(table_name="async1")
    async def async_resource1():
        for l_ in ["a", "b", "c"]:
            await asyncio.sleep(1)
            nonlocal execution_order
            execution_order.append("one")
            yield {"letter": l_}

    @dlt.resource(table_name="async2")
    async def async_resource2():
        # offset by half a step so the two generators interleave deterministically
        await asyncio.sleep(0.5)
        for l_ in ["e", "f", "g"]:
            await asyncio.sleep(1)
            nonlocal execution_order
            execution_order.append("two")
            yield {"letter": l_}

    @dlt.source
    def source():
        return [async_resource1(), async_resource2()]

    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True)
    pipeline_1.run(source())

    with pipeline_1.sql_client() as c:
        with c.execute_query("SELECT * FROM async1") as cur:
            rows = list(cur.fetchall())
            assert len(rows) == 3
            assert {r[0] for r in rows} == {"a", "b", "c"}

        with c.execute_query("SELECT * FROM async2") as cur:
            rows = list(cur.fetchall())
            assert len(rows) == 3
            assert {r[0] for r in rows} == {"e", "f", "g"}

    expected = (
        ["one", "two", "one", "two", "one", "two"]
        if next_item_mode == "round_robin"
        else ["one", "one", "one", "two", "two", "two"]
    )
    assert execution_order == expected