
Commit

Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1789-scd2-incremental
jorritsandbrink committed Sep 19, 2024
2 parents ae7cd00 + acf1e36 commit 5888a07
Showing 152 changed files with 1,957 additions and 12,099 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/test_doc_snippets.yml
@@ -67,6 +67,11 @@ jobs:
        with:
          python-version: "3.10.x"

+      - name: Setup node 20
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
@@ -81,6 +86,9 @@ jobs:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}

+      - name: run docs preprocessor
+        run: make preprocess-docs
+
      - name: Install dependencies
        # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -4,7 +4,7 @@ Thank you for considering contributing to **dlt**! We appreciate your help in ma

## Table of Contents

-1. [Getting Started](#getting-started)
+1. [Getting Started](#intro)
2. [Submitting Changes](#submitting-changes)
3. [Adding or updating core dependencies](#adding-or-updating-core-dependencies)
4. [Linting](#linting)
6 changes: 4 additions & 2 deletions Makefile
@@ -44,7 +44,7 @@ has-poetry:
	poetry --version

dev: has-poetry
-	poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk
+	poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow

lint:
	./tools/check-package.sh
@@ -107,4 +107,6 @@ test-build-images: build-library
	docker build -f deploy/dlt/Dockerfile.airflow --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" .
	# docker build -f deploy/dlt/Dockerfile --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" .

+preprocess-docs:
+	# run docs preprocessing to run a few checks and ensure examples can be parsed
+	cd docs/website && npm i && npm run preprocess-docs
13 changes: 7 additions & 6 deletions README.md
@@ -94,15 +94,16 @@ You can find examples for various use cases in the [examples](docs/examples) fol

## Adding as dependency

-`dlt` follows the semantic versioning with the [`MAJOR.MINOR.PATCH`](https://peps.python.org/pep-0440/#semantic-versioning) pattern. Currently, we are using **pre-release versioning** with the major version being 0.
+`dlt` follows semantic versioning with the [`MAJOR.MINOR.PATCH`](https://peps.python.org/pep-0440/#semantic-versioning) pattern.

-- `minor` version change means breaking changes
-- `patch` version change means new features that should be backward compatible
-- any suffix change, e.g., `post10` -> `post11`, is considered a patch
+* `major` means breaking changes and removed deprecations
+* `minor` means new features, sometimes automatic migrations
+* `patch` means bug fixes

We suggest that you allow only `patch` level updates automatically:
-* Using the [Compatible Release Specifier](https://packaging.python.org/en/latest/specifications/version-specifiers/#compatible-release). For example **dlt~=0.3.10** allows only versions **>=0.3.10** and less than **<0.4**
-* Poetry [caret requirements](https://python-poetry.org/docs/dependency-specification/). For example **^0.3.10** allows only versions **>=0.3.10** to **<0.4**
+* Using the [Compatible Release Specifier](https://packaging.python.org/en/latest/specifications/version-specifiers/#compatible-release). For example **dlt~=1.0.0** allows only versions **>=1.0.0** and less than **<1.1**
+* Poetry [tilde requirements](https://python-poetry.org/docs/dependency-specification/#tilde-requirements). For example **~1.0.0** allows only versions **>=1.0.0** and less than **<1.1**

## Get Involved

The dlt project is quickly growing, and we're excited to have you join our community! Here's how you can get involved:
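To sanity-check what the version-specifier bullets above actually admit, you can test candidate versions with the `packaging` library. A minimal sketch (version numbers are illustrative):

```python
# pip install packaging
from packaging.specifiers import SpecifierSet
from packaging.version import Version

compatible = SpecifierSet("~=1.0.0")  # compatible release: >=1.0.0, <1.1

print(Version("1.0.5") in compatible)  # True: patch updates are allowed
print(Version("1.1.0") in compatible)  # False: minor updates are excluded
```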
2 changes: 1 addition & 1 deletion dlt/common/normalizers/naming/snake_case.py
@@ -21,7 +21,7 @@ class NamingConvention(BaseNamingConvention):
    - Replaces all trailing `_` with `x`
    - Replaces `+` and `*` with `x`, `-` with `_`, `@` with `a` and `|` with `l`
-    Uses __ as patent-child separator for tables and flattened column names.
+    Uses __ as parent-child separator for tables and flattened column names.
    """

    RE_UNDERSCORES: ClassVar[REPattern] = RE_UNDERSCORES
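A rough approximation of the substitutions the docstring above describes, using only the standard library; this is an illustrative sketch, not dlt's actual `NamingConvention` implementation:

```python
import re

def snake_case_sketch(name: str) -> str:
    # insert underscores at camelCase boundaries, then lowercase
    name = re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", name).lower()
    # replace special characters as documented: + and * -> x, - -> _, @ -> a, | -> l
    name = name.translate(str.maketrans({"+": "x", "*": "x", "-": "_", "@": "a", "|": "l"}))
    # replace trailing underscores with x
    name = re.sub(r"_+$", lambda m: "x" * len(m.group(0)), name)
    return name

print(snake_case_sketch("orderItems+"))  # 'order_itemsx'
```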
3 changes: 2 additions & 1 deletion dlt/common/schema/migrations.py
@@ -34,7 +34,8 @@ def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) ->
    # current version of the schema
    current = cast(TStoredSchema, schema_dict)
    # add default normalizers and root hash propagation
-    normalizers = explicit_normalizers()
+    # use explicit None to get default settings, ignore any naming conventions
+    normalizers = explicit_normalizers(naming=None, json_normalizer=None)
    current["normalizers"], _, _ = import_normalizers(normalizers, normalizers)
    current["normalizers"]["json"]["config"] = {
        "propagation": {"root": {"_dlt_id": "_dlt_root_id"}}
2 changes: 1 addition & 1 deletion dlt/common/schema/typing.py
@@ -138,6 +138,7 @@ class TColumnPropInfo(NamedTuple):

class TColumnType(TypedDict, total=False):
    data_type: Optional[TDataType]
+    nullable: Optional[bool]
    precision: Optional[int]
    scale: Optional[int]
    timezone: Optional[bool]
@@ -147,7 +148,6 @@ class TColumnSchemaBase(TColumnType, total=False):
    """TypedDict that defines basic properties of a column: name, data type and nullable"""

    name: Optional[str]
-    nullable: Optional[bool]


class TColumnSchema(TColumnSchemaBase, total=False):
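Moving `nullable` from `TColumnSchemaBase` into `TColumnType` means nullability now travels with the type information, e.g. when mapping column types between destinations. A simplified stand-in (field set trimmed; illustrative only):

```python
from typing import Optional, TypedDict

class ColumnType(TypedDict, total=False):  # simplified stand-in for TColumnType
    data_type: Optional[str]
    nullable: Optional[bool]
    precision: Optional[int]
    scale: Optional[int]

# nullability can now round-trip together with the data type
col: ColumnType = {"data_type": "decimal", "precision": 38, "scale": 9, "nullable": False}
```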
7 changes: 5 additions & 2 deletions dlt/destinations/impl/filesystem/filesystem.py
@@ -164,9 +164,12 @@ def _storage_options(self) -> Dict[str, str]:
        return _deltalake_storage_options(self._job_client.config)

    def _delta_table(self) -> Optional["DeltaTable"]:  # type: ignore[name-defined] # noqa: F821
-        from dlt.common.libs.deltalake import try_get_deltatable
+        from dlt.common.libs.deltalake import DeltaTable

-        return try_get_deltatable(self.make_remote_url(), storage_options=self._storage_options)
+        if DeltaTable.is_deltatable(self.make_remote_url(), storage_options=self._storage_options):
+            return DeltaTable(self.make_remote_url(), storage_options=self._storage_options)
+        else:
+            return None

    @property
    def _partition_columns(self) -> List[str]:
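The same check-then-open pattern can be used with the `deltalake` package directly; a hedged sketch where the URI and storage options are placeholders:

```python
from deltalake import DeltaTable

table_uri = "s3://my-bucket/my_table"             # hypothetical location
storage_options = {"AWS_REGION": "eu-central-1"}  # hypothetical config

# is_deltatable avoids raising when no Delta table exists at the location
if DeltaTable.is_deltatable(table_uri, storage_options=storage_options):
    dt = DeltaTable(table_uri, storage_options=storage_options)
    print(dt.version())
else:
    print("no Delta table at this location yet")
```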
2 changes: 1 addition & 1 deletion dlt/destinations/sql_jobs.py
@@ -802,7 +802,7 @@ def gen_scd2_sql(
    # insert list elements for new active records in nested tables
    nested_tables = table_chain[1:]
    if nested_tables:
-        # TODO: - based on deterministic child hashes (OK)
+        # TODO: - based on deterministic nested hashes (OK)
        # - if row hash changes all is right
        # - if it does not we only capture new records, while we should replace existing with those in stage
        # - this write disposition is way more similar to regular merge (how root tables are handled is different, other tables handled same)
2 changes: 1 addition & 1 deletion dlt/extract/source.py
@@ -232,7 +232,7 @@ def max_table_nesting(self, value: int) -> None:

    @property
    def root_key(self) -> bool:
-        """Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge"""
+        """Enables merging on all resources by propagating root foreign key to nested tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge"""
        # this also checks the normalizer type
        config = RelationalNormalizer.get_normalizer_config(self._schema).get("propagation")
        data_normalizer = self._schema.data_item_normalizer
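A minimal sketch of toggling `root_key` on a source, as the docstring above suggests, before relying on `merge`; `demo_source` is a toy stand-in:

```python
import dlt

@dlt.source
def demo_source():
    @dlt.resource(primary_key="id", write_disposition="merge")
    def items():
        yield {"id": 1, "children": [{"name": "a"}, {"name": "b"}]}
    return items

source = demo_source()
source.root_key = True  # propagate the root foreign key into nested tables
dlt.pipeline(pipeline_name="root_key_demo", destination="duckdb").run(source)
```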
8 changes: 4 additions & 4 deletions dlt/load/utils.py
@@ -27,7 +27,7 @@ def get_completed_table_chain(
    For append and merge write disposition, tables without jobs will be included, providing they have seen data (and were created in the destination)
    Optionally `being_completed_job_id` can be passed that is considered to be completed before job itself moves in storage
    """
-    # returns ordered list of tables from parent to child leaf tables
+    # returns ordered list of tables from parent to nested leaf tables
    table_chain: List[TTableSchema] = []
    # allow for jobless tables for those write disposition
    skip_jobless_table = top_merged_table["write_disposition"] not in (
@@ -99,7 +99,7 @@ def init_client(
    # get all tables that actually have load jobs with data
    tables_with_jobs = set(job.table_name for job in new_jobs) - tables_no_data

-    # get tables to truncate by extending tables with jobs with all their child tables
+    # get tables to truncate by extending tables with jobs with all their nested tables
    initial_truncate_names = set(t["name"] for t in truncate_tables) if truncate_tables else set()
    truncate_table_names = set(
        _extend_tables_with_table_chain(
@@ -198,13 +198,13 @@ def _extend_tables_with_table_chain(
    haven't seen data or are not included by `include_table_filter`.
    Note that for root tables with replace and merge, the filter for tables that do not have jobs
-    Returns an unordered set of table names and their child tables
+    Returns an unordered set of table names and their nested tables
    """
    result: Set[str] = set()
    for table_name in tables:
        top_job_table = get_root_table(schema.tables, table_name)
        # for replace and merge write dispositions we should include tables
-        # without jobs in the table chain, because child tables may need
+        # without jobs in the table chain, because nested tables may need
        # processing due to changes in the root table
        skip_jobless_table = top_job_table["write_disposition"] not in (
            "replace",
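A generic illustration of the root-to-nested ordering these comments describe (not dlt's code); it assumes the mapping lists parents before their children:

```python
from typing import Dict, List, Optional

# child -> parent mapping; None marks the root table
parents: Dict[str, Optional[str]] = {
    "orders": None,
    "orders__items": "orders",
    "orders__items__taxes": "orders__items",
}

def table_chain(root: str) -> List[str]:
    """Return the table chain ordered from the root to nested leaf tables."""
    chain = [root]
    for table, parent in parents.items():
        if parent in chain and table not in chain:
            chain.append(table)
    return chain

print(table_chain("orders"))  # ['orders', 'orders__items', 'orders__items__taxes']
```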
10 changes: 5 additions & 5 deletions docs/examples/nested_data/nested_data.py
@@ -35,11 +35,11 @@
CHUNK_SIZE = 10000


-# You can limit how deep dlt goes when generating child tables.
-# By default, the library will descend and generate child tables
+# You can limit how deep dlt goes when generating nested tables.
+# By default, the library will descend and generate nested tables
# for all nested lists, without a limit.
-# In this example, we specify that we only want to generate child tables up to level 2,
-# so there will be only one level of child tables within child tables.
+# In this example, we specify that we only want to generate nested tables up to level 2,
+# so there will be only one level of nested tables within nested tables.
@dlt.source(max_table_nesting=2)
def mongodb_collection(
    connection_url: str = dlt.secrets.value,
@@ -149,7 +149,7 @@ def convert_mongo_objs(value: Any) -> Any:
# The third method involves applying data type hints to specific columns in the data.
# In this case, we tell dlt that column 'cast' (containing a list of actors)
# in 'movies' table should have type 'json' which means
-# that it will be loaded as JSON/struct and not as child table.
+# that it will be loaded as JSON/struct and not as nested table.
pipeline = dlt.pipeline(
    pipeline_name="mongodb_pipeline",
    destination="duckdb",
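A hedged sketch of the column-hint method the comments above describe: hinting a column as `json` keeps the list inline instead of generating a nested table. `movies` here is a toy resource, not the MongoDB source from the example:

```python
import dlt

@dlt.resource(columns={"cast": {"data_type": "json"}})
def movies():
    yield {"title": "Arrival", "cast": ["Amy Adams", "Jeremy Renner"]}

# 'cast' loads as a JSON/struct column instead of a movies__cast table
pipeline = dlt.pipeline(pipeline_name="mongodb_pipeline", destination="duckdb")
pipeline.run(movies())
```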

This file was deleted.

File renamed without changes.
@@ -1,12 +1,11 @@
"""
---
-title: Load parent table records into child table
-description: Learn how to integrate custom parent keys into child records
-keywords: [parent child relationship, parent key]
+title: Propagate primary_key from root to nested tables
+description: Learn how to propagate any column to nested tables
+keywords: [root table, nested reference, parent key]
---
This example demonstrates handling data with parent-child relationships using the `dlt` library.
-You learn how to integrate specific fields (e.g., primary, foreign keys) from a parent record into each child record.
+You learn how to propagate specific fields (e.g., primary, foreign keys) from a parent record into each child record.
In this example, we'll explore how to:
60 changes: 0 additions & 60 deletions docs/technical/customization_and_hacking.md

This file was deleted.
