Commit 0601d14

improves data contract docs (#782)
Co-authored-by: Dave <[email protected]>
rudolfix and sh-rp authored Nov 23, 2023
1 parent 01998ef commit 0601d14
Showing 14 changed files with 244 additions and 111 deletions.
8 changes: 4 additions & 4 deletions dlt/common/schema/exceptions.py
@@ -108,14 +108,14 @@ def __init__(
         schema_name: str,
         table_name: str,
         column_name: str,
-        contract_entity: TSchemaContractEntities,
+        schema_entity: TSchemaContractEntities,
         contract_mode: TSchemaEvolutionMode,
         table_schema: Any,
         schema_contract: TSchemaContractDict,
         data_item: Any = None,
         extended_info: str = None,
     ) -> None:
-        """Raised when `data_item` violates `contract_mode` on a `contract_entity` as defined by `table_schema`
+        """Raised when `data_item` violates `contract_mode` on a `schema_entity` as defined by `table_schema`
         Schema, table and column names are given as a context and full `schema_contract` and causing `data_item` as an evidence.
         """
@@ -128,7 +128,7 @@ def __init__(
         msg = (
             "In "
             + msg
-            + f" . Contract on {contract_entity} with mode {contract_mode} is violated. "
+            + f" . Contract on {schema_entity} with mode {contract_mode} is violated. "
             + (extended_info or "")
         )
         super().__init__(msg)
@@ -137,7 +137,7 @@ def __init__(
         self.column_name = column_name

         # violated contract
-        self.contract_entity = contract_entity
+        self.schema_entity = schema_entity
        self.contract_mode = contract_mode

         # some evidence
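A hedged sketch of how downstream code might react to the renamed attribute — assuming the class edited here is `DataValidationError` (the file this diff touches defines it), and that `pipeline.run` may wrap the error depending on the dlt version:

```python
import dlt
# assumed import path — this diff edits dlt/common/schema/exceptions.py
from dlt.common.schema.exceptions import DataValidationError

@dlt.resource(schema_contract="freeze")
def users():
    # with a frozen contract, any new table/column in the incoming data raises
    yield {"id": 1, "name": "alice"}

pipeline = dlt.pipeline(pipeline_name="contracts_demo", destination="duckdb")
try:
    pipeline.run(users())
except Exception as outer:
    # pipeline.run may wrap the error, so walk the __cause__ chain
    err = outer
    while err is not None and not isinstance(err, DataValidationError):
        err = err.__cause__
    if err is None:
        raise
    # after this commit the violated entity is `schema_entity` (was `contract_entity`)
    print(err.schema_entity, err.contract_mode, err.table_name)
```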
12 changes: 5 additions & 7 deletions docs/examples/chess_production/chess.py
@@ -6,6 +6,7 @@
 from dlt.common.typing import StrAny, TDataItems
 from dlt.sources.helpers.requests import client

+
 @dlt.source
 def chess(
     chess_url: str = dlt.config.value,
@@ -59,6 +60,7 @@ def players_games(username: Any) -> Iterator[TDataItems]:

 MAX_PLAYERS = 5

+
 def load_data_with_retry(pipeline, data):
     try:
         for attempt in Retrying(
@@ -68,9 +70,7 @@ def load_data_with_retry(pipeline, data):
             reraise=True,
         ):
             with attempt:
-                logger.info(
-                    f"Running the pipeline, attempt={attempt.retry_state.attempt_number}"
-                )
+                logger.info(f"Running the pipeline, attempt={attempt.retry_state.attempt_number}")
                 load_info = pipeline.run(data)
                 logger.info(str(load_info))

@@ -92,9 +92,7 @@ def load_data_with_retry(pipeline, data):
     # print the information on the first load package and all jobs inside
     logger.info(f"First load package info: {load_info.load_packages[0]}")
     # print the information on the first completed job in first load package
-    logger.info(
-        f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}"
-    )
+    logger.info(f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}")

     # check for schema updates:
     schema_updates = [p.schema_update for p in load_info.load_packages]
@@ -152,4 +150,4 @@ def load_data_with_retry(pipeline, data):
     )
     # get data for a few famous players
     data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS)
-    load_data_with_retry(pipeline, data)
\ No newline at end of file
+    load_data_with_retry(pipeline, data)
8 changes: 4 additions & 4 deletions docs/examples/incremental_loading/zendesk.py
@@ -6,12 +6,11 @@
 from dlt.common.typing import TAnyDateTime
 from dlt.sources.helpers.requests import client

+
 @dlt.source(max_table_nesting=2)
 def zendesk_support(
     credentials: Dict[str, str] = dlt.secrets.value,
-    start_date: Optional[TAnyDateTime] = pendulum.datetime(  # noqa: B008
-        year=2000, month=1, day=1
-    ),
+    start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1),  # noqa: B008
     end_date: Optional[TAnyDateTime] = None,
 ):
     """
@@ -113,11 +112,12 @@ def get_pages(
         if not response_json["end_of_stream"]:
             get_url = response_json["next_page"]

+
 if __name__ == "__main__":
     # create dlt pipeline
     pipeline = dlt.pipeline(
         pipeline_name="zendesk", destination="duckdb", dataset_name="zendesk_data"
     )

     load_info = pipeline.run(zendesk_support())
-    print(load_info)
\ No newline at end of file
+    print(load_info)
2 changes: 2 additions & 0 deletions docs/examples/nested_data/nested_data.py
@@ -13,6 +13,7 @@

 CHUNK_SIZE = 10000

+
 # You can limit how deep dlt goes when generating child tables.
 # By default, the library will descend and generate child tables
 # for all nested lists, without a limit.
@@ -81,6 +82,7 @@ def load_documents(self) -> Iterator[TDataItem]:
         while docs_slice := list(islice(cursor, CHUNK_SIZE)):
             yield map_nested_in_place(convert_mongo_objs, docs_slice)

+
 def convert_mongo_objs(value: Any) -> Any:
     if isinstance(value, (ObjectId, Decimal128)):
         return str(value)
4 changes: 3 additions & 1 deletion docs/examples/transformers/pokemon.py
@@ -1,6 +1,7 @@
 import dlt
 from dlt.sources.helpers import requests

+
 @dlt.source(max_table_nesting=2)
 def source(pokemon_api_url: str):
     """"""
@@ -46,6 +47,7 @@ def species(pokemon_details):

     return (pokemon_list | pokemon, pokemon_list | pokemon | species)

+
 if __name__ == "__main__":
     # build duck db pipeline
     pipeline = dlt.pipeline(
@@ -54,4 +56,4 @@ def species(pokemon_details):

     # the pokemon_list resource does not need to be loaded
     load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon"))
-    print(load_info)
\ No newline at end of file
+    print(load_info)
3 changes: 1 addition & 2 deletions docs/website/docs/dlt-ecosystem/destinations/motherduck.md
@@ -93,5 +93,4 @@ We also see them.
 My observation is that if you write a lot of data into the database, then close the connection and open it again to write, there's a chance of such a timeout. Possibly a **WAL** file is being written to the remote duckdb database.

 ### Invalid Input Error: Initialization function "motherduck_init" from file
-Use `duckdb 0.8.1`
-
+Use `duckdb 0.8.1` or above.
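A quick way to verify the fix locally — a sketch that checks `duckdb.__version__` (the `packaging` dependency is an assumption):

```python
import duckdb
from packaging.version import Version  # assumed to be installed

# the troubleshooting note above requires duckdb 0.8.1 or newer
if Version(duckdb.__version__) < Version("0.8.1"):
    raise RuntimeError(f"duckdb {duckdb.__version__} is too old; upgrade to >= 0.8.1")
```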
@@ -210,7 +210,7 @@ You can pass destination credentials and ignore the default lookup:
 pipeline = dlt.pipeline(destination="postgres", credentials=dlt.secrets["postgres_dsn"])
 ```

-:::Note
+:::note
 **dlt.config** and **dlt.secrets** can also be used as setters. For example:
 ```python
 dlt.config["sheet_id"] = "23029402349032049"
81 changes: 0 additions & 81 deletions docs/website/docs/general-usage/data-contracts.md

This file was deleted.

8 changes: 7 additions & 1 deletion docs/website/docs/general-usage/resource.md
@@ -73,6 +73,9 @@ accepts following arguments:
 > 💡 You can mark some resource arguments as [configuration and credentials](credentials)
 > values so `dlt` can pass them automatically to your functions.
+### Put a contract on tables, columns and data
+Use the `schema_contract` argument to tell dlt how to [deal with new tables, data types and bad data types](schema-contracts.md). For example, if you set it to **freeze**, `dlt` will not allow any new tables, columns or data types to be introduced to the schema - it will raise an exception instead. Learn more about the available contract modes [here](schema-contracts.md#setting-up-the-contract).
+
 ### Define a schema with Pydantic

 You can alternatively use a [Pydantic](https://pydantic-docs.helpmanual.io/) model to define the schema.
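Read in context, the section added above maps to resource code like the sketch below (hypothetical resources; the per-entity modes follow the linked schema-contracts page):

```python
import dlt

@dlt.resource(
    schema_contract={
        "tables": "evolve",          # new tables may still be created
        "columns": "freeze",         # a new column raises an exception
        "data_type": "discard_row",  # rows with incompatible types are dropped
    }
)
def items():
    yield {"id": 1, "name": "alice"}

# a plain string applies a single mode to all three entities
@dlt.resource(schema_contract="freeze")
def orders():
    yield {"id": 1, "total": 100}
```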
@@ -106,6 +109,8 @@ def get_users():
 The data types of the table columns are inferred from the types of the pydantic fields. These use the same type conversions
 as when the schema is automatically generated from the data.

+Pydantic models integrate well with [schema contracts](schema-contracts.md) as data validators.
+
 Things to note:

 - Fields with an `Optional` type are marked as `nullable`
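A sketch of the integration mentioned in the line added above, assuming dlt's documented `columns=<pydantic model>` hint — the model supplies both the column schema and per-row validation, while the contract decides what happens to non-conforming rows:

```python
from typing import Optional

import dlt
from pydantic import BaseModel

class User(BaseModel):
    id: int
    name: str
    email: Optional[str] = None  # Optional fields become nullable columns

# the model validates each row; the contract says to drop rows that fail
@dlt.resource(columns=User, schema_contract={"data_type": "discard_row"})
def users():
    yield {"id": 1, "name": "alice"}
    yield {"id": "not-an-int", "name": "bob"}  # fails validation under this contract
```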
@@ -131,6 +136,7 @@ behaviour of creating child tables for these fields.

 We do not support `RootModel` that validate simple types. You can add such validator yourself, see [data filtering section](#filter-transform-and-pivot-data).

+
 ### Dispatch data to many tables

 You can load data to many tables from a single resource. The most common case is a stream of events
@@ -307,7 +313,7 @@ assert list(r) == list(range(10))
 > 💡 You cannot limit transformers. They should process all the data they receive fully to avoid
 > inconsistencies in generated datasets.
-### Set table and adjust schema
+### Set table name and adjust schema

 You can change the schema of a resource, be it standalone or as a part of a source. Look for method
 named `apply_hints` which takes the same arguments as resource decorator. Obviously you should call
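The section renamed above documents `apply_hints`; a minimal sketch (hypothetical resource and table name) of adjusting a resource after it is created:

```python
import dlt

@dlt.resource
def events():
    yield {"id": 1, "kind": "click"}

r = events()
# takes the same arguments as the @dlt.resource decorator
r.apply_hints(table_name="web_events", write_disposition="replace")
```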