add generated files
AstrakhantsevaAA committed Nov 7, 2023
1 parent 8ad09f3 commit 5ef7dc2
Showing 6 changed files with 160 additions and 12 deletions.
Empty file.
2 changes: 2 additions & 0 deletions docs/examples/nested_data/.dlt/secrets.toml
@@ -0,0 +1,2 @@
[mongodb_pipeline.sources]
connection_url=""
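The TOML above leaves `connection_url` empty on purpose; each user fills in their own value. As a side note (not part of this commit), dlt can also resolve the same secret from an environment variable, with config sections joined by double underscores. A minimal sketch, using a placeholder connection string:

```py
import os

# Equivalent to setting connection_url under [mongodb_pipeline.sources]
# in .dlt/secrets.toml; sections are joined with double underscores.
# The URL below is a placeholder, not a real deployment.
os.environ["MONGODB_PIPELINE__SOURCES__CONNECTION_URL"] = (
    "mongodb+srv://user:password@cluster.example.net/sample_mflix"
)
```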
Empty file.
138 changes: 138 additions & 0 deletions docs/examples/nested_data/nested_data.py
@@ -0,0 +1,138 @@
from itertools import islice
from typing import Any, Dict, Iterator, Optional

from bson.decimal128 import Decimal128
from bson.objectid import ObjectId
from pendulum import _datetime
from pymongo import MongoClient

import dlt
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.typing import TDataItem
from dlt.common.utils import map_nested_in_place

CHUNK_SIZE = 10000

# You can limit how deep dlt goes when generating child tables.
# By default, the library will descend and generate child tables
# for all nested lists, without a limit.
# In this example, we specify that we only want to generate child tables up to level 2,
# so there will be only one level of child tables within child tables.
@dlt.source(max_table_nesting=2)
def mongodb_collection(
    connection_url: str = dlt.secrets.value,
    database: Optional[str] = dlt.config.value,
    collection: str = dlt.config.value,
    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
    write_disposition: Optional[str] = dlt.config.value,
) -> Any:
    # set up mongo client
    client = MongoClient(connection_url, uuidRepresentation="standard", tz_aware=True)
    mongo_database = client.get_default_database() if not database else client[database]
    collection_obj = mongo_database[collection]

    def collection_documents(
        client,
        collection,
        incremental: Optional[dlt.sources.incremental[Any]] = None,
    ) -> Iterator[TDataItem]:
        LoaderClass = CollectionLoader

        loader = LoaderClass(client, collection, incremental=incremental)
        yield from loader.load_documents()

    return dlt.resource(  # type: ignore
        collection_documents,
        name=collection_obj.name,
        primary_key="_id",
        write_disposition=write_disposition,
    )(client, collection_obj, incremental=incremental)


class CollectionLoader:
    def __init__(
        self,
        client,
        collection,
        incremental: Optional[dlt.sources.incremental[Any]] = None,
    ) -> None:
        self.client = client
        self.collection = collection
        self.incremental = incremental
        if incremental:
            self.cursor_field = incremental.cursor_path
            self.last_value = incremental.last_value
        else:
            self.cursor_field = None
            self.last_value = None

    @property
    def _filter_op(self) -> Dict[str, Any]:
        # translate the incremental cursor into a MongoDB query filter
        if not self.incremental or not self.last_value:
            return {}
        if self.incremental.last_value_func is max:
            return {self.cursor_field: {"$gte": self.last_value}}
        elif self.incremental.last_value_func is min:
            return {self.cursor_field: {"$lt": self.last_value}}
        return {}

    def load_documents(self) -> Iterator[TDataItem]:
        # fetch documents in chunks of CHUNK_SIZE to keep memory usage bounded
        cursor = self.collection.find(self._filter_op)
        while docs_slice := list(islice(cursor, CHUNK_SIZE)):
            yield map_nested_in_place(convert_mongo_objs, docs_slice)

def convert_mongo_objs(value: Any) -> Any:
    # BSON ObjectId and Decimal128 are not JSON-serializable; store them as strings
    if isinstance(value, (ObjectId, Decimal128)):
        return str(value)
    if isinstance(value, _datetime.datetime):
        return ensure_pendulum_datetime(value)
    return value


if __name__ == "__main__":
    # When we created the source, we set max_table_nesting to 2.
    # This ensures that the generated tables do not have more than two
    # levels of nesting, even if the original data structure is more deeply nested.
    pipeline = dlt.pipeline(
        pipeline_name="mongodb_pipeline",
        destination="duckdb",
        dataset_name="unpacked_data",
    )
    source_data = mongodb_collection(
        collection="movies", write_disposition="replace"
    )
    load_info = pipeline.run(source_data)
    print(load_info)

    # The second method is to set the max_table_nesting attribute directly
    # on the source object. This allows dynamic control over the maximum
    # nesting level for a specific source; here it is adjusted before running
    # the pipeline. Setting it to 0 disables child tables entirely, so nested
    # data is kept as JSON in the parent table.
    pipeline = dlt.pipeline(
        pipeline_name="mongodb_pipeline",
        destination="duckdb",
        dataset_name="not_unpacked_data",
    )
    source_data = mongodb_collection(
        collection="movies", write_disposition="replace"
    )
    source_data.max_table_nesting = 0
    load_info = pipeline.run(source_data)
    print(load_info)

    # The third method is to apply data type hints to specific columns.
    # Here we tell dlt that the 'cast' column (a list of actors) in the
    # 'movies' table should have the 'complex' data type, meaning it will be
    # loaded as JSON/struct and not as a child table.
    pipeline = dlt.pipeline(
        pipeline_name="mongodb_pipeline",
        destination="duckdb",
        dataset_name="unpacked_data_without_cast",
    )
    source_data = mongodb_collection(
        collection="movies", write_disposition="replace"
    )
    source_data.movies.apply_hints(columns={"cast": {"data_type": "complex"}})
    load_info = pipeline.run(source_data)
    print(load_info)
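A quick way to verify what each of the three runs produced is to list the tables in the resulting DuckDB file. A minimal sketch (not part of this commit), assuming the default database file `mongodb_pipeline.duckdb` that the duckdb destination derives from the pipeline name:

```py
import duckdb

# Assumes the default DuckDB file created by the runs above.
conn = duckdb.connect("mongodb_pipeline.duckdb")

# Expected (roughly): unpacked_data has child tables such as movies__cast,
# not_unpacked_data has a single movies table with nested data kept as JSON,
# unpacked_data_without_cast has child tables for everything except 'cast'.
for schema in ("unpacked_data", "not_unpacked_data", "unpacked_data_without_cast"):
    tables = conn.sql(
        "SELECT table_name FROM information_schema.tables "
        f"WHERE table_schema = '{schema}'"
    ).fetchall()
    print(schema, sorted(t[0] for t in tables))
```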
4 changes: 4 additions & 0 deletions docs/website/docs/examples/nested_data/code/.dlt/secrets.toml
@@ -0,0 +1,4 @@
# @@@DLT_SNIPPET_START example
[mongodb_pipeline.sources]
connection_url=""
# @@@DLT_SNIPPET_END example
28 changes: 16 additions & 12 deletions docs/website/docs/examples/nested_data/index.md
@@ -34,16 +34,17 @@ We'll learn how to:
 <!--@@@DLT_SNIPPET_START code/nested_data-snippets.py::nested_data-->
 ```py
 from itertools import islice
-from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, Optional
 
-import dlt
 from bson.decimal128 import Decimal128
 from bson.objectid import ObjectId
+from pendulum import _datetime
+from pymongo import MongoClient
+
+import dlt
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItem
 from dlt.common.utils import map_nested_in_place
-from pendulum import _datetime
-from pymongo import ASCENDING, DESCENDING, MongoClient
 
 CHUNK_SIZE = 10000
 
@@ -61,9 +62,7 @@ def mongodb_collection(
     write_disposition: Optional[str] = dlt.config.value,
 ) -> Any:
     # set up mongo client
-    client = MongoClient(
-        connection_url, uuidRepresentation="standard", tz_aware=True
-    )
+    client = MongoClient(connection_url, uuidRepresentation="standard", tz_aware=True)
     mongo_database = client.get_default_database() if not database else client[database]
     collection_obj = mongo_database[collection]
 
@@ -75,8 +74,7 @@ def mongodb_collection(
         LoaderClass = CollectionLoader
 
         loader = LoaderClass(client, collection, incremental=incremental)
-        for data in loader.load_documents():
-            yield data
+        yield from loader.load_documents()
 
     return dlt.resource(  # type: ignore
         collection_documents,
@@ -100,7 +98,9 @@ if __name__ == "__main__":
         destination="duckdb",
         dataset_name="unpacked_data",
     )
-    source_data = mongodb_collection(collection="movies", write_disposition="replace")
+    source_data = mongodb_collection(
+        collection="movies", write_disposition="replace"
+    )
     load_info = pipeline.run(source_data)
     print(load_info)
 
@@ -114,7 +114,9 @@ if __name__ == "__main__":
         destination="duckdb",
         dataset_name="not_unpacked_data",
     )
-    source_data = mongodb_collection(collection="movies", write_disposition="replace")
+    source_data = mongodb_collection(
+        collection="movies", write_disposition="replace"
+    )
     source_data.max_table_nesting = 0
     load_info = pipeline.run(source_data)
     print(load_info)
@@ -128,7 +130,9 @@ if __name__ == "__main__":
         destination="duckdb",
         dataset_name="unpacked_data_without_cast",
     )
-    source_data = mongodb_collection(collection="movies", write_disposition="replace")
+    source_data = mongodb_collection(
+        collection="movies", write_disposition="replace"
+    )
     source_data.movies.apply_hints(columns={"cast": {"data_type": "complex"}})
     load_info = pipeline.run(source_data)
     print(load_info)
