data pond: expose readable datasets as dataframes and arrow tables #1507

Merged
119 commits merged on Oct 8, 2024
Changes shown are from the first 30 commits

Commits (119)
af6a40e
add simple ibis helper
sh-rp Jun 19, 2024
3a69ece
start working on dataframe reading interface
sh-rp Jun 20, 2024
4324650
a bit more work
sh-rp Jun 20, 2024
7c960df
first simple implementation
sh-rp Jun 21, 2024
86b89ac
small change
sh-rp Jun 21, 2024
5a8ea54
more work on dataset
sh-rp Jun 21, 2024
36e94af
some work on filesystem destination
sh-rp Jun 24, 2024
20bf9ce
add support for parquet files and compression on jsonl files in files…
sh-rp Jun 26, 2024
6dce626
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Jul 17, 2024
a0ff55f
fix test after devel merge
sh-rp Jul 17, 2024
c297e96
add nice composable pipeline example
sh-rp Jul 17, 2024
d020403
small updates to demo
sh-rp Jul 18, 2024
5c3db47
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Aug 6, 2024
79ef7dd
enable tests for all bucket providers
sh-rp Aug 6, 2024
ff40079
fix tests
sh-rp Aug 6, 2024
ac415b9
create views in duckdb filesystem accessor
sh-rp Aug 6, 2024
c92a527
move to relations based interface
sh-rp Aug 6, 2024
13ec73b
add generic duckdb interface to filesystem
sh-rp Aug 6, 2024
46e0226
move code for accessing frames and tables to the cursor and use duckd…
sh-rp Aug 6, 2024
7cf69a7
add native db api cursor fetching to exposed dataset
sh-rp Aug 7, 2024
6ffe302
some small changes
sh-rp Aug 7, 2024
c200262
switch dataaccess pandas to pyarrow
sh-rp Aug 7, 2024
226454f
add native bigquery support for df and arrow tables
sh-rp Aug 7, 2024
3296e63
change iter functions to always expect chunk size (None will default …
sh-rp Aug 7, 2024
6f6500f
add native implementation for databricks
sh-rp Aug 7, 2024
152b788
add dremio native implementation for full frames and tables
sh-rp Aug 7, 2024
6d73bc5
fix filesystem test
sh-rp Aug 7, 2024
bdb39ba
add test for evolving filesystem
sh-rp Aug 7, 2024
3ead92b
fix empty dataframe retrieval
sh-rp Aug 7, 2024
9fcbd00
remove old df test
sh-rp Aug 7, 2024
28ee1c6
clean up interfaces a bit (more to come?)
sh-rp Aug 8, 2024
28cb282
move dataset creation into destination client and clean up interfaces…
sh-rp Aug 8, 2024
77926fa
renames some interfaces and adds brief docstrings
sh-rp Aug 8, 2024
6ef04bc
add filesystem cached duckdb and remove the need to declare needed vi…
sh-rp Aug 8, 2024
ec13b49
fix tests for snowflake
sh-rp Aug 8, 2024
b222d1d
make data set a function
sh-rp Aug 8, 2024
9f0a6a5
fix db-types dependency for bigquery
sh-rp Aug 8, 2024
289b63c
create duckdb based sql client for filesystem
sh-rp Aug 13, 2024
779bca6
fix example pipeline
sh-rp Aug 13, 2024
584ab47
enable filesystem sql client to work on streamlit
sh-rp Aug 13, 2024
6594053
add comments
sh-rp Aug 13, 2024
9e0a61d
rename sql to query
sh-rp Aug 13, 2024
dd47326
fix tests that rely on sql client
sh-rp Aug 13, 2024
9f8f79b
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 18, 2024
fda1cb5
post merge cleanups
sh-rp Sep 18, 2024
c7a0e05
move imports around a bit
sh-rp Sep 18, 2024
8497036
exclude abfss buckets from test
sh-rp Sep 19, 2024
3dc2c90
add support for arrow schema creation from known dlt schema
sh-rp Aug 13, 2024
d6bec38
re-use sqldatabase code for cursors
sh-rp Sep 19, 2024
62ea3ba
fix bug
sh-rp Sep 19, 2024
3fd4d61
add default columns where needed
sh-rp Sep 19, 2024
eeca4ac
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 20, 2024
52f8523
add sqlglot to filesystem deps
sh-rp Sep 20, 2024
90c669a
store filesystem tables in correct dataset
sh-rp Sep 20, 2024
7657fb1
move cursor columns location
sh-rp Sep 20, 2024
352b238
fix snowflake and mssql
sh-rp Sep 20, 2024
5fadeeb
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 20, 2024
9a1752d
clean up compose files a bit
sh-rp Sep 20, 2024
a77192f
fix sqlalchemy
sh-rp Sep 20, 2024
420eaf1
add mysql docker compose file
sh-rp Sep 20, 2024
97e2757
fix linting
sh-rp Sep 20, 2024
df4f6d0
prepare hint checking
sh-rp Sep 20, 2024
6b27b98
disable part of state test
sh-rp Sep 22, 2024
ffba901
enable hint check
sh-rp Sep 23, 2024
fab5232
add column type support for filesystem json
sh-rp Sep 23, 2024
0de4a6c
rename dataset implementation to DBAPI
sh-rp Sep 23, 2024
077a25a
wrap functions in dbapi readable dataset
sh-rp Sep 23, 2024
13a759b
remove example pipeline
sh-rp Sep 23, 2024
10e04d6
rename test_decimal_name
sh-rp Sep 23, 2024
5077ce1
make column code a bit clearer and fix mssql again
sh-rp Sep 23, 2024
1025560
rename df methods to pandas
sh-rp Sep 23, 2024
f8927d3
fix bug in default columns
sh-rp Sep 23, 2024
7fd3c62
fix hints test and columns bug
sh-rp Sep 23, 2024
3a76178
catch mysql error if no rows returned
sh-rp Sep 23, 2024
27104e3
add exceptions for not implemented bucket and filetypes
sh-rp Sep 23, 2024
1c06d11
fix docs
sh-rp Sep 23, 2024
e5b3688
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 23, 2024
7d09bdb
add config section for getting pipeline clients
sh-rp Sep 26, 2024
dbe4baa
set default dataset in filesystem sqlclient
sh-rp Sep 26, 2024
f4e0099
add config section for sync_destination
sh-rp Sep 26, 2024
80fe898
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 26, 2024
d698cd5
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Sep 27, 2024
857803c
rename readablerelation methods
sh-rp Sep 30, 2024
8055529
use more functions of the duckdb sql client in filesystem version
sh-rp Sep 30, 2024
24c7308
update dependencies
sh-rp Sep 30, 2024
76759cf
use active pipeline capabilities if available for arrow table
sh-rp Sep 30, 2024
d3d8381
update types
sh-rp Sep 30, 2024
f9a766d
rename dataset accessor function
sh-rp Sep 30, 2024
b6c7fbc
add test for accessing tables with unqualified table name
sh-rp Sep 30, 2024
86fc914
fix sql client
sh-rp Sep 30, 2024
58380ec
add duckdb native support for azure, s3 and gcs (via s3)
sh-rp Sep 30, 2024
0a24b3a
some typing
sh-rp Sep 30, 2024
bef50d7
add dataframes tests back in
sh-rp Sep 30, 2024
b13e492
add join table and update view tests for filesystem
sh-rp Sep 30, 2024
92ea515
start adding tests for creating views on remote duckdb
sh-rp Sep 30, 2024
e1fa308
fix snippets
sh-rp Sep 30, 2024
a7958d5
fix some dependencies and mssql/synapse tests
sh-rp Sep 30, 2024
ed197ea
fix bigquery dependencies and abfss tests
sh-rp Oct 1, 2024
0ec1656
add tests for adding view to external dbs and persistent secrets
sh-rp Oct 1, 2024
9cd4173
add support for delta tables
sh-rp Oct 1, 2024
7dba771
add duckdb to read interface tests
sh-rp Oct 1, 2024
3e96a6c
fix delta tests
sh-rp Oct 1, 2024
355f5b6
make default secret name derived from bucket url
sh-rp Oct 1, 2024
9002f02
try fix azure tests again
sh-rp Oct 1, 2024
c3050d4
fix df access tests
sh-rp Oct 2, 2024
bbc0525
PR fixes
sh-rp Oct 2, 2024
ef148c3
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Oct 2, 2024
a99e987
Merge branch 'devel' into exp/1095-expose-readable-datasets
sh-rp Oct 2, 2024
eaf1cd8
correct internal table access
sh-rp Oct 4, 2024
6bb7117
allow datasets without schema
sh-rp Oct 4, 2024
6648b86
skips parametrized queries, skips tables from non-dataset schemas
rudolfix Oct 6, 2024
89a9861
move filesystem specific sql_client tests to correct location and tes…
sh-rp Oct 7, 2024
631d50b
fix sql client tests
sh-rp Oct 7, 2024
8e2e37c
make secret name when dropping optional
sh-rp Oct 7, 2024
dc383fc
fix gs test
sh-rp Oct 7, 2024
41926ae
remove moved filesystem tests from test_read_interfaces
sh-rp Oct 7, 2024
9b8437a
fix sql client tests again... :)
sh-rp Oct 7, 2024
5d14045
clear duckdb secrets
sh-rp Oct 8, 2024
fb9a445
disable secrets deleting for delta tests
sh-rp Oct 8, 2024
109 changes: 109 additions & 0 deletions composable_pipeline_1.py
@@ -0,0 +1,109 @@
"""Example of a composable pipeline"""

import dlt
import os
import random
from dlt.destinations import filesystem, duckdb

# fixtures
customers = [
{"id": 1, "name": "dave"},
{"id": 2, "name": "marcin"},
{"id": 3, "name": "anton"},
{"id": 4, "name": "alena"},
]

products = [
{"name": "apple", "price": 1},
{"name": "pear", "price": 2},
{"name": "banana", "price": 3},
{"name": "schnaps", "price": 10},
]

if __name__ == "__main__":
os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True"

#
# 1. let's load some data into a duckdb pipeline (stand-in for a remote location)
#
duck_pipeline = dlt.pipeline(
pipeline_name="warehouse", destination=duckdb(credentials="warehouse.duckdb")
)

@dlt.resource(write_disposition="replace", table_name="customers")
def c():
yield from customers

@dlt.resource(write_disposition="replace", table_name="orders")
def o():
order_no = 0
# every customer orders 4 things every day
for weekday in ["monday", "tuesday", "wednesday"]:
for customer in customers:
for i in range(4):
order_no += 1
product = random.choice(products)
yield {
"order_day": weekday,
"id": order_no,
"customer_id": customer["id"],
"product": product["name"],
"price": product["price"],
}

# run and print result
print("RUNNING WAREHOUSE INGESTION")
print(duck_pipeline.run([c(), o()]))
print(duck_pipeline.dataset.customers.df())
print(duck_pipeline.dataset.orders.df())
print("===========================")

#
# 2. now we want a local snapshot of the customers and all orders placed on tuesday in a data lake
#
lake_pipeline = dlt.pipeline(
pipeline_name="local_lake", destination=filesystem(bucket_url="./local_lake")
)

print("RUNNING LOCAL SNAPSHOT EXTRACTION")
lake_pipeline.run(
duck_pipeline.dataset.customers.iter_df(),
loader_file_format="jsonl",
table_name="customers",
write_disposition="replace",
)
lake_pipeline.run(
duck_pipeline.dataset.sql(
"SELECT * FROM orders WHERE orders.order_day = 'tuesday'"
).iter_df(),
loader_file_format="jsonl",
table_name="orders",
write_disposition="replace",
)

print(lake_pipeline.dataset.customers.df())
print(lake_pipeline.dataset.orders.df())
print("===========================")

#
# 3. now we create a denormalized table locally
#

print("RUNNING DENORMALIZED TABLE EXTRACTION")
denom_pipeline = dlt.pipeline(
pipeline_name="denom_lake", destination=filesystem(bucket_url="./denom_lake")
)

denom_pipeline.run(
lake_pipeline.dataset.sql(
sql=(
"SELECT orders.*, customers.name FROM orders LEFT JOIN customers ON"
" orders.customer_id = customers.id"
),
prepare_tables=["customers", "orders"],
).iter_df(),
loader_file_format="jsonl",
table_name="customers",
write_disposition="replace",
)
print(denom_pipeline.dataset.customers.df())
56 changes: 56 additions & 0 deletions dlt/common/destination/reference.py
@@ -1,6 +1,8 @@
from abc import ABC, abstractmethod
import dataclasses
from importlib import import_module
from contextlib import contextmanager

from types import TracebackType
from typing import (
Callable,
@@ -18,17 +20,23 @@
Any,
TypeVar,
Generic,
Generator,
TYPE_CHECKING,
Protocol,
Tuple,
)
from typing_extensions import Annotated
import datetime # noqa: 251
from copy import deepcopy
import inspect

from dlt.common import logger
from dlt.common.typing import DataFrame, ArrowTable
from dlt.common.configuration.specs.base_configuration import extract_inner_hint
from dlt.common.destination.utils import verify_schema_capabilities
from dlt.common.exceptions import TerminalValueError
from dlt.common.normalizers.naming import NamingConvention

from dlt.common.schema import Schema, TTableSchema, TSchemaTables
from dlt.common.schema.utils import (
get_file_format,
@@ -51,6 +59,7 @@
from dlt.common.storages.load_storage import ParsedLoadJobFileName
from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc


TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"]
TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration")
TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase")
@@ -561,6 +570,53 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable
return True


class SupportsDataAccess(Protocol):
"""Add support accessing data items"""

def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]:
Collaborator: if we have iter, we do not need to chunk, right? Maybe limit would make sense? Or max_rows? We are changing the semantics of the old method, which could also be used to iterate, but I'm OK with it.

Collaborator (author): this is a shorthand for getting a dataframe in one call; if you add a chunk_size, only a chunk of that size will be collected.
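
For illustration, a minimal usage sketch of this shorthand (hypothetical pipeline and table names, mirroring the example pipeline in this PR):

# sketch only: assumes a pipeline that has already loaded a "customers" table
import dlt

pipeline = dlt.pipeline(pipeline_name="warehouse", destination="duckdb")

full_df = pipeline.dataset.customers.df()  # whole result as a single data frame
first_rows = pipeline.dataset.customers.df(chunk_size=500)  # only a chunk of 500 rows is collected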

"""Fetches the results as data frame. For large queries the results may be chunked

Fetches the results into a data frame. The default implementation uses helpers in `pandas.io.sql` to generate Pandas data frame.
This function will try to use native data frame generation for particular destination. For `BigQuery`: `QueryJob.to_dataframe` is used.
For `duckdb`: `DuckDBPyConnection.df'

Args:
chunk_size (int, optional): Will chunk the results into several data frames. Defaults to None
**kwargs (Any): Additional parameters which will be passed to native data frame generation function.

Returns:
Optional[DataFrame]: A data frame with query results. If chunk_size > 0, None will be returned if there is no more data in results
"""
...

def arrow(self, *, chunk_size: int = None) -> Optional[ArrowTable]: ...

def iter_df(self, chunk_size: int) -> Generator[DataFrame, None, None]: ...

def iter_arrow(self, chunk_size: int) -> Generator[ArrowTable, None, None]: ...

def fetchall(self) -> List[Tuple[Any, ...]]: ...
Collaborator: So I'm thinking about this interface all the time as well :) Maybe let's just expose cursor() here? It is a proper DB API cursor (which we have already implemented).
Also: our implementation proxies all unknown methods to the underlying native cursor, did you notice?

So all methods below would be available via cursor.

Collaborator (author): I think I would keep it this way. The user may use the cursor too if they like, but this possibility to iterate without having to open your own context manager is quite nice from the user perspective, I think.
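
A short sketch of the two styles being weighed here, reusing the hypothetical pipeline from the previous sketch:

# sketch only: shorthand fetches on the relation vs. an explicit cursor
orders = pipeline.dataset.sql("SELECT * FROM orders")

rows = orders.fetchall()  # one call, no context manager needed

with orders.cursor() as cursor:  # proper DB API cursor for finer control
    while batch := cursor.fetchmany(1000):
        print(len(batch))  # process each batch of up to 1000 rows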


def fetchmany(self, chunk_size: int) -> List[Tuple[Any, ...]]: ...

def iter_fetchmany(self, chunk_size: int) -> Generator[List[Tuple[Any, ...]], Any, Any]: ...

def fetchone(self) -> Optional[Tuple[Any, ...]]: ...


class SupportsRelationshipAccess(ABC):
"""Add support for accessing a cursor for a given relationship or query"""

@abstractmethod
def cursor_for_relation(
self,
*,
table: str = None,
sql: str = None,
prepare_tables: List[str] = None,
) -> ContextManager[SupportsDataAccess]: ...


# TODO: type Destination properly
TDestinationReferenceArg = Union[
str, "Destination[Any, Any]", Callable[..., "Destination[Any, Any]"], None
16 changes: 16 additions & 0 deletions dlt/common/typing.py
@@ -77,6 +77,22 @@
REPattern = _REPattern
PathLike = os.PathLike


try:
Collaborator: I'd love to keep such stuff out of the common:

  1. either you abstract those interfaces into protocols
  2. or import from helpers

Collaborator (author): I moved it around a bit. We have the data access stuff in the destination, and IMHO it would be nice to have the correct typings there.

from pandas import DataFrame
except ImportError:
DataFrame: Type[Any] = None # type: ignore

try:
from pyarrow import Table as ArrowTable
except ImportError:
ArrowTable: Type[Any] = None # type: ignore

try:
from duckdb import DuckDBPyConnection
except ImportError:
DuckDBPyConnection: Type[Any] = None # type: ignore

AnyType: TypeAlias = Any
NoneType = type(None)
DictStrAny: TypeAlias = Dict[str, Any]
128 changes: 128 additions & 0 deletions dlt/dataset.py
@@ -0,0 +1,128 @@
from typing import cast, Any, TYPE_CHECKING, Generator, List, Tuple, Optional
Collaborator: my take: move it into dlt/destinations; that will remove a lot of the ugly inner imports

Collaborator (author): this is moved


from contextlib import contextmanager

from dlt.common.destination.reference import SupportsRelationshipAccess, SupportsDataAccess

from dlt.common.typing import DataFrame, ArrowTable


class Relation:
def __init__(
self, *, pipeline: Any, table: str = None, sql: str = None, prepare_tables: List[str] = None
) -> None:
"""Create a lazy evaluated relation to for the dataset of a pipeline"""
from dlt.pipeline import Pipeline

self.pipeline: Pipeline = cast(Pipeline, pipeline)
self.prepare_tables = prepare_tables
self.sql = sql
self.table = table

@contextmanager
def _client(self) -> Generator[SupportsRelationshipAccess, Any, Any]:
from dlt.destinations.job_client_impl import SqlJobClientBase
from dlt.destinations.fs_client import FSClientBase

client = self.pipeline.destination_client()

if isinstance(client, SqlJobClientBase):
with client.sql_client as sql_client:
yield sql_client
return

if isinstance(client, FSClientBase):
yield client
return

raise Exception(
f"Destination {client.config.destination_type} does not support data access via"
" dataset."
)

@contextmanager
def cursor(self) -> Generator[SupportsDataAccess, Any, Any]:
Collaborator: OK already implemented

"""Gets a DBApiCursor for the current relation"""
with self._client() as client:
with client.cursor_for_relation(
sql=self.sql, table=self.table, prepare_tables=self.prepare_tables
) as cursor:
yield cursor

def df(
self,
*,
chunk_size: int = None,
) -> DataFrame:
"""Get first batch of table as dataframe"""
with self.cursor() as cursor:
return cursor.df(chunk_size=chunk_size)

def arrow(
self,
*,
chunk_size: int = None,
) -> ArrowTable:
"""Get first batch of table as arrow table"""
with self.cursor() as cursor:
return cursor.arrow(chunk_size=chunk_size)

def iter_df(
self,
*,
chunk_size: int,
) -> Generator[DataFrame, None, None]:
"""iterates over the whole table in dataframes of the given chunk_size, chunk_size of -1 will return the full table in the first batch"""
with self.cursor() as cursor:
yield from cursor.iter_df(
chunk_size=chunk_size,
)

def iter_arrow(
self,
*,
chunk_size: int,
) -> Generator[ArrowTable, None, None]:
"""iterates over the whole table in arrow tables of the given chunk_size, chunk_size of -1 will return the full table in the first batch"""
with self.cursor() as cursor:
yield from cursor.iter_arrow(
chunk_size=chunk_size,
)

def fetchall(self) -> List[Tuple[Any, ...]]:
with self.cursor() as cursor:
return cursor.fetchall()

def fetchmany(self, chunk_size: int) -> List[Tuple[Any, ...]]:
with self.cursor() as cursor:
return cursor.fetchmany(chunk_size)

def iter_fetchmany(self, chunk_size: int) -> Generator[List[Tuple[Any, ...]], Any, Any]:
with self.cursor() as cursor:
yield from cursor.iter_fetchmany(
chunk_size=chunk_size,
)

def fetchone(self) -> Optional[Tuple[Any, ...]]:
with self.cursor() as cursor:
return cursor.fetchone()
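
A sketch of chunked iteration over a relation, again with the hypothetical pipeline and table names from the earlier sketches:

# sketch only: stream a large table in chunks instead of materializing it at once
for chunk_df in pipeline.dataset.orders.iter_df(chunk_size=10_000):
    print(len(chunk_df))  # each chunk is a pandas DataFrame with up to 10_000 rows

# chunk_size=-1 yields the full table as the first (and only) batch
full_table = next(pipeline.dataset.orders.iter_arrow(chunk_size=-1))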


class Dataset:
"""Access to dataframes and arrowtables in the destination dataset"""

def __init__(self, pipeline: Any) -> None:
from dlt.pipeline import Pipeline

self.pipeline: Pipeline = cast(Pipeline, pipeline)

def sql(self, sql: str, prepare_tables: List[str] = None) -> Relation:
return Relation(pipeline=self.pipeline, sql=sql, prepare_tables=prepare_tables)

def __getitem__(self, table: str) -> Relation:
"""access of table via dict notation"""
return Relation(pipeline=self.pipeline, table=table)

def __getattr__(self, table: str) -> Relation:
"""access of table via property notation"""
return Relation(pipeline=self.pipeline, table=table)
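
A brief sketch of the three access paths Dataset exposes (attribute, item, and sql), with hypothetical table names:

# sketch only: equivalent ways to obtain a Relation from a pipeline's dataset
customers_rel = pipeline.dataset.customers     # attribute access (__getattr__)
customers_rel = pipeline.dataset["customers"]  # dict access (__getitem__)
tuesday_orders = pipeline.dataset.sql("SELECT * FROM orders WHERE order_day = 'tuesday'")

print(customers_rel.df())
print(tuesday_orders.arrow())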
21 changes: 19 additions & 2 deletions dlt/destinations/fs_client.py
@@ -1,10 +1,14 @@
from typing import Iterable, cast, Any, List, Literal

import gzip
from typing import Iterable, cast, Any, List
from abc import ABC, abstractmethod
from fsspec import AbstractFileSystem

from dlt.common.typing import DuckDBPyConnection
from dlt.common.destination.reference import SupportsRelationshipAccess


class FSClientBase(ABC):
class FSClientBase(SupportsRelationshipAccess, ABC):
fs_client: AbstractFileSystem

@property
@@ -55,3 +59,16 @@ def read_text(
path, mode="rt", compression=compression, encoding=encoding, newline=newline
) as f:
return cast(str, f.read())

@abstractmethod
def get_duckdb(
self,
tables: List[str],
db: DuckDBPyConnection = None,
table_type: Literal["view", "table"] = "view",
) -> DuckDBPyConnection:
"""
Returns an in-memory duckdb instance with the given tables loaded as views or tables.
Can also take an existing duckdb connection to add tables from the filesystem.
"""
pass
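
A hedged sketch of how get_duckdb might be used per the docstring above; the client handle and table names are hypothetical:

# sketch only: expose filesystem tables as duckdb views and query them
fs_client = pipeline.destination_client()  # assumed here to be a filesystem client (FSClientBase)
con = fs_client.get_duckdb(tables=["customers", "orders"])
print(con.sql("SELECT count(*) FROM customers").fetchall())

# or attach the tables to an existing duckdb connection instead
import duckdb
existing = duckdb.connect("analysis.duckdb")
fs_client.get_duckdb(tables=["orders"], db=existing, table_type="table")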