From d2e504533c14fc133c5c941a928f4a51bd92a48c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 26 Sep 2024 18:17:18 +0200 Subject: [PATCH 01/29] shows sqlalchemy docs --- docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md | 2 +- docs/website/sidebars.js | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md b/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md index a3b19377da..b9014e0564 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md +++ b/docs/website/docs/dlt-ecosystem/destinations/sqlalchemy.md @@ -1,5 +1,5 @@ --- -title: SQL databases (powered by SQLAlchemy) +title: 30+ SQL databases (powered by SQLAlchemy) description: SQLAlchemy destination keywords: [sql, sqlalchemy, database, destination] --- diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 23c8d192ba..7e6000a2ca 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -171,6 +171,7 @@ const sidebars = { 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', 'dlt-ecosystem/destinations/athena', + 'dlt-ecosystem/destinations/sqlalchemy', 'dlt-ecosystem/destinations/weaviate', 'dlt-ecosystem/destinations/lancedb', 'dlt-ecosystem/destinations/qdrant', From 873f6befe99e833c5c3e9e590885a5457eb73c1e Mon Sep 17 00:00:00 2001 From: David Scharf Date: Fri, 27 Sep 2024 11:10:45 +0200 Subject: [PATCH 02/29] Fix config sections for synching destinations and accessing destination clients (#1887) * add config section for getting pipeline clients * add config section for sync_destination * prefers existing sections in pipeline --------- Co-authored-by: Marcin Rudolf --- dlt/pipeline/pipeline.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index fa10f5ac89..54e576b5fc 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -257,13 +257,17 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: return decorator -def with_config_section(sections: Tuple[str, ...]) -> Callable[[TFun], TFun]: +def with_config_section( + sections: Tuple[str, ...], merge_func: ConfigSectionContext.TMergeFunc = None +) -> Callable[[TFun], TFun]: def decorator(f: TFun) -> TFun: @wraps(f) def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # add section context to the container to be used by all configuration without explicit sections resolution with inject_section( - ConfigSectionContext(pipeline_name=self.pipeline_name, sections=sections) + ConfigSectionContext( + pipeline_name=self.pipeline_name, sections=sections, merge_style=merge_func + ) ): return f(self, *args, **kwargs) @@ -678,7 +682,7 @@ def run( and not self._state_restored and (self.destination or destination) ): - self.sync_destination(destination, staging, dataset_name) + self._sync_destination(destination, staging, dataset_name) # sync only once self._state_restored = True # normalize and load pending data @@ -712,7 +716,7 @@ def run( else: return None - @with_schemas_sync + @with_config_section(sections=None, merge_func=ConfigSectionContext.prefer_existing) def sync_destination( self, destination: TDestinationReferenceArg = None, @@ -730,6 +734,17 @@ def sync_destination( Note: this method is executed by the `run` method before any operation on data. Use `restore_from_destination` configuration option to disable that behavior. 
""" + return self._sync_destination( + destination=destination, staging=staging, dataset_name=dataset_name + ) + + @with_schemas_sync + def _sync_destination( + self, + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, + dataset_name: str = None, + ) -> None: self._set_destinations(destination=destination, staging=staging) self._set_dataset_name(dataset_name) @@ -969,6 +984,7 @@ def get_local_state_val(self, key: str) -> Any: state = self._get_state() return state["_local"][key] # type: ignore + @with_config_section(sections=None, merge_func=ConfigSectionContext.prefer_existing) def sql_client(self, schema_name: str = None) -> SqlClientBase[Any]: """Returns a sql client configured to query/change the destination and dataset that were used to load the data. Use the client with `with` statement to manage opening and closing connection to the destination: @@ -1008,6 +1024,7 @@ def _fs_client(self, schema_name: str = None) -> FSClientBase: return client raise FSClientNotAvailable(self.pipeline_name, self.destination.destination_name) + @with_config_section(sections=None, merge_func=ConfigSectionContext.prefer_existing) def destination_client(self, schema_name: str = None) -> JobClientBase: """Get the destination job client for the configured destination Use the client with `with` statement to manage opening and closing connection to the destination: From 5bbf0192c8bd79c1eeb12cd02c4633957a123886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Fri, 27 Sep 2024 22:28:37 +0530 Subject: [PATCH 03/29] refactors Session mocking in tests to mocker.patch and mocker.spy API (#1891) --- tests/sources/helpers/rest_client/test_client.py | 13 +++---------- tests/sources/rest_api/integration/test_offline.py | 6 +----- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 5ec48e2972..488d7ef525 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -444,29 +444,22 @@ def test_configurable_timeout(self, mocker) -> None: import requests - original_send = requests.Session.send - requests.Session.send = mocker.Mock() # type: ignore[method-assign] + mocked_send = mocker.patch.object(requests.Session, "send") rest_client.get("/posts/1") - assert requests.Session.send.call_args[1] == { # type: ignore[attr-defined] + assert mocked_send.call_args[1] == { "timeout": 42, "proxies": ANY, "stream": ANY, "verify": ANY, "cert": ANY, } - # restore, otherwise side-effect on subsequent tests - requests.Session.send = original_send # type: ignore[method-assign] def test_request_kwargs(self, mocker) -> None: - def send_spy(*args, **kwargs): - return original_send(*args, **kwargs) - rest_client = RESTClient( base_url="https://api.example.com", session=Client().session, ) - original_send = rest_client.session.send - mocked_send = mocker.patch.object(rest_client.session, "send", side_effect=send_spy) + mocked_send = mocker.spy(rest_client.session, "send") rest_client.get( path="/posts/1", diff --git a/tests/sources/rest_api/integration/test_offline.py b/tests/sources/rest_api/integration/test_offline.py index 57cffc99d0..cb91e0d680 100644 --- a/tests/sources/rest_api/integration/test_offline.py +++ b/tests/sources/rest_api/integration/test_offline.py @@ -373,12 +373,8 @@ def test_multiple_response_actions_on_every_response(mock_api_server, mocker): class CustomSession(Session): pass - def 
send_spy(*args, **kwargs): - return original_send(*args, **kwargs) - my_session = CustomSession() - original_send = my_session.send - mocked_send = mocker.patch.object(my_session, "send", side_effect=send_spy) + mocked_send = mocker.spy(my_session, "send") source = rest_api_source( { From 3f1938aa0e260b2d859650a9670553e64719e7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Sun, 29 Sep 2024 23:05:55 +0530 Subject: [PATCH 04/29] corrects test suite (#1893) --- tests/cli/test_init_command.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cli/test_init_command.py b/tests/cli/test_init_command.py index e85c4593f6..f76dc2f053 100644 --- a/tests/cli/test_init_command.py +++ b/tests/cli/test_init_command.py @@ -63,7 +63,7 @@ TEMPLATES = ["debug", "default", "arrow", "requests", "dataframe", "intro"] # a few verified sources we know to exist -SOME_KNOWN_VERIFIED_SOURCES = ["chess", "sql_database", "google_sheets", "pipedrive"] +SOME_KNOWN_VERIFIED_SOURCES = ["chess", "google_sheets", "pipedrive"] def get_verified_source_candidates(repo_dir: str) -> List[str]: @@ -150,7 +150,7 @@ def check_results(items: Dict[str, SourceConfiguration]) -> None: check_results(core_sources) verified_sources = _list_verified_sources(DEFAULT_VERIFIED_SOURCES_REPO) - assert set(SOME_KNOWN_VERIFIED_SOURCES).issubset(verified_sources) + assert set(SOME_KNOWN_VERIFIED_SOURCES).issubset(verified_sources.keys()) check_results(verified_sources) assert len(verified_sources.keys()) > 10 From cb450466346761e7da0e7f67666e0933654a7b8e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Sun, 29 Sep 2024 23:48:41 +0400 Subject: [PATCH 05/29] incremental `scd2` with `merge_key` (#1818) * remove unused imports * add scd2 retire_if_absent option * rewrite scd2 retire logic * include new keys in typing * finetune scd2 typing * update typeddict validation test * rename to retire_absent_rows * add reinsert test case * replace natural_key with merge_key * rewrite natural key presence check * simplify scd2 test and remove redundancy * set constants once * document incremental scd2 * remove natural_key remnants * remove `retire_absent_rows` flag * add scd2 merge key partition test * fix typos * update incremental scd2 docs --- dlt/common/schema/typing.py | 9 +- dlt/destinations/impl/athena/athena.py | 6 + dlt/destinations/sql_jobs.py | 34 +- dlt/extract/hints.py | 38 +- .../docs/general-usage/incremental-loading.md | 127 +++++- tests/common/test_validation.py | 8 +- tests/load/pipeline/test_scd2.py | 376 +++++++++++++----- 7 files changed, 462 insertions(+), 136 deletions(-) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 2247358331..7174d1b5c7 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -232,15 +232,20 @@ class TWriteDispositionDict(TypedDict): disposition: TWriteDisposition -class TMergeDispositionDict(TWriteDispositionDict, total=False): +class TMergeDispositionDict(TWriteDispositionDict): strategy: Optional[TLoaderMergeStrategy] + + +class TScd2StrategyDict(TMergeDispositionDict, total=False): validity_column_names: Optional[List[str]] active_record_timestamp: Optional[TAnyDateTime] boundary_timestamp: Optional[TAnyDateTime] row_version_column_name: Optional[str] -TWriteDispositionConfig = Union[TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict] +TWriteDispositionConfig = Union[ + TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict, 
TScd2StrategyDict +] class _TTableSchemaBase(TTableProcessingHints, total=False): diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 04078dd510..72611a9568 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -149,6 +149,12 @@ def gen_delete_temp_table_sql( sql.insert(0, f"""DROP TABLE IF EXISTS {temp_table_name.replace('"', '`')};""") return sql, temp_table_name + @classmethod + def gen_concat_sql(cls, columns: Sequence[str]) -> str: + # Athena requires explicit casting + columns = [f"CAST({c} AS VARCHAR)" for c in columns] + return f"CONCAT({', '.join(columns)})" + @classmethod def requires_temp_table_for_delete(cls) -> bool: return True diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 2407d2db62..ae27213a7c 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -339,6 +339,10 @@ def gen_delete_from_sql( ); """ + @classmethod + def gen_concat_sql(cls, columns: Sequence[str]) -> str: + return f"CONCAT({', '.join(columns)})" + @classmethod def _shorten_table_name(cls, ident: str, sql_client: SqlClientBase[Any]) -> str: """Trims identifier to max length supported by sql_client. Used for dynamically constructed table names""" @@ -755,19 +759,35 @@ def gen_scd2_sql( active_record_timestamp = get_active_record_timestamp(root_table) if active_record_timestamp is None: active_record_literal = "NULL" - is_active_clause = f"{to} IS NULL" + is_active = f"{to} IS NULL" else: # it's a datetime active_record_literal = format_datetime_literal( active_record_timestamp, caps.timestamp_precision ) - is_active_clause = f"{to} = {active_record_literal}" + is_active = f"{to} = {active_record_literal}" - # retire updated and deleted records - sql.append(f""" + # retire records: + # - no `merge_key`: retire all absent records + # - yes `merge_key`: retire those absent records whose `merge_key` + # is present in staging data + retire_sql = f""" {cls.gen_update_table_prefix(root_table_name)} {to} = {boundary_literal} - WHERE {is_active_clause} + WHERE {is_active} AND {hash_} NOT IN (SELECT {hash_} FROM {staging_root_table_name}); - """) + """ + merge_keys = cls._escape_list( + get_columns_names_with_prop(root_table, "merge_key"), + escape_column_id, + ) + if len(merge_keys) > 0: + if len(merge_keys) == 1: + key = merge_keys[0] + else: + key = cls.gen_concat_sql(merge_keys) # compound key + key_present = f"{key} IN (SELECT {key} FROM {staging_root_table_name})" + retire_sql = retire_sql.rstrip()[:-1] # remove semicolon + retire_sql += f" AND {key_present};" + sql.append(retire_sql) # insert new active records in root table columns = map(escape_column_id, list(root_table["columns"].keys())) @@ -776,7 +796,7 @@ def gen_scd2_sql( INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) SELECT {col_str}, {boundary_literal} AS {from_}, {active_record_literal} AS {to} FROM {staging_root_table_name} AS s - WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name} WHERE {is_active_clause}); + WHERE {hash_} NOT IN (SELECT {hash_} FROM {root_table_name} WHERE {is_active}); """) # insert list elements for new active records in nested tables diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 037ebbddf9..2774e17353 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -12,6 +12,7 @@ TTableSchemaColumns, TWriteDispositionConfig, TMergeDispositionDict, + TScd2StrategyDict, TAnySchemaColumns, TTableFormat, TSchemaContract, @@ -352,7 +353,7 @@ def _set_hints( 
self, hints_template: TResourceHints, create_table_variant: bool = False ) -> None: DltResourceHints.validate_dynamic_hints(hints_template) - DltResourceHints.validate_write_disposition_hint(hints_template.get("write_disposition")) + DltResourceHints.validate_write_disposition_hint(hints_template) if create_table_variant: table_name: str = hints_template["name"] # type: ignore[assignment] # incremental cannot be specified in variant @@ -452,10 +453,11 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: md_dict: TMergeDispositionDict = dict_.pop("write_disposition") if merge_strategy := md_dict.get("strategy"): dict_["x-merge-strategy"] = merge_strategy - if "boundary_timestamp" in md_dict: - dict_["x-boundary-timestamp"] = md_dict["boundary_timestamp"] - # add columns for `scd2` merge strategy + if merge_strategy == "scd2": + md_dict = cast(TScd2StrategyDict, md_dict) + if "boundary_timestamp" in md_dict: + dict_["x-boundary-timestamp"] = md_dict["boundary_timestamp"] if md_dict.get("validity_column_names") is None: from_, to = DEFAULT_VALIDITY_COLUMN_NAMES else: @@ -514,7 +516,8 @@ def validate_dynamic_hints(template: TResourceHints) -> None: ) @staticmethod - def validate_write_disposition_hint(wd: TTableHintTemplate[TWriteDispositionConfig]) -> None: + def validate_write_disposition_hint(template: TResourceHints) -> None: + wd = template.get("write_disposition") if isinstance(wd, dict) and wd["disposition"] == "merge": wd = cast(TMergeDispositionDict, wd) if "strategy" in wd and wd["strategy"] not in MERGE_STRATEGIES: @@ -523,13 +526,18 @@ def validate_write_disposition_hint(wd: TTableHintTemplate[TWriteDispositionConf f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""" ) - for ts in ("active_record_timestamp", "boundary_timestamp"): - if ts == "active_record_timestamp" and wd.get("active_record_timestamp") is None: - continue # None is allowed for active_record_timestamp - if ts in wd: - try: - ensure_pendulum_datetime(wd[ts]) # type: ignore[literal-required] - except Exception: - raise ValueError( - f'could not parse `{ts}` value "{wd[ts]}"' # type: ignore[literal-required] - ) + if wd.get("strategy") == "scd2": + wd = cast(TScd2StrategyDict, wd) + for ts in ("active_record_timestamp", "boundary_timestamp"): + if ( + ts == "active_record_timestamp" + and wd.get("active_record_timestamp") is None + ): + continue # None is allowed for active_record_timestamp + if ts in wd: + try: + ensure_pendulum_datetime(wd[ts]) # type: ignore[literal-required] + except Exception: + raise ValueError( + f'could not parse `{ts}` value "{wd[ts]}"' # type: ignore[literal-required] + ) diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 88f009e3c2..c8f92cf154 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -223,7 +223,7 @@ info = pipeline.run(fb_ads.with_resources("ads"), write_disposition="merge") In the example above, we enforce the root key propagation with `fb_ads.root_key = True`. This ensures that the correct data is propagated on the initial `replace` load so the future `merge` load can be executed. You can achieve the same in the decorator `@dlt.source(root_key=True)`. ### `scd2` strategy -`dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. 
The resource is expected to provide a full extract of the source table each run. A row hash is stored in `_dlt_id` and used as a surrogate key to identify source records that have been inserted, updated, or deleted. A `NULL` value is used by default to indicate an active record, but it's possible to use a configurable high timestamp (e.g., 9999-12-31 00:00:00.000000) instead. +`dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. By default, the resource is expected to provide a full extract of the source table each run, but [incremental extracts](#example-incremental-scd2) are also possible. A row hash is stored in `_dlt_id` and used as surrogate key to identify source records that have been inserted, updated, or deleted. A `NULL` value is used by default to indicate an active record, but it's possible to use a configurable high timestamp (e.g. 9999-12-31 00:00:00.000000) instead. :::note The `unique` hint for `_dlt_id` in the root table is set to `false` when using `scd2`. This differs from [default behavior](./destination-tables.md#child-and-parent-tables). The reason is that the surrogate key stored in `_dlt_id` contains duplicates after an _insert-delete-reinsert_ pattern: @@ -300,6 +300,131 @@ pipeline.run(dim_customer()) # third run — 2024-04-10 06:45:22.847403 | 2024-04-09 18:27:53.734235 | **2024-04-10 06:45:22.847403** | 2 | bar | 2 | | 2024-04-09 22:13:07.943703 | NULL | 1 | foo_updated | 1 | +#### Example: incremental `scd2` +A `merge_key` can be provided to work with incremental extracts instead of full extracts. The `merge_key` lets you define which absent rows are considered "deleted". Compound natural keys are allowed and can be specified by providing a list of column names as `merge_key`. + +*Case 1: do not retire absent records* + +You can set the natural key as `merge_key` to prevent retirement of absent rows. In this case you don't consider any absent row deleted. Records are not retired in the destination if their corresponding natural keys are not present in the source extract. This allows for incremental extracts that only contain updated records. + +```py +@dlt.resource( + merge_key="customer_key", + write_disposition={"disposition": "merge", "strategy": "scd2"} +) +def dim_customer(): + # initial load + yield [ + {"customer_key": 1, "c1": "foo", "c2": 1}, + {"customer_key": 2, "c1": "bar", "c2": 2} + ] + +pipeline.run(dim_customer()) # first run — 2024-04-09 18:27:53.734235 +... +``` +*`dim_customer` destination table after first run:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `customer_key` | `c1` | `c2` | +| -- | -- | -- | -- | -- | +| 2024-04-09 18:27:53.734235 | NULL | 1 | foo | 1 | +| 2024-04-09 18:27:53.734235 | NULL | 2 | bar | 2 | + +```py +... 
+def dim_customer(): + # second load — record for customer_key 1 got updated, customer_key 2 absent + yield [ + {"customer_key": 1, "c1": "foo_updated", "c2": 1}, +] + +pipeline.run(dim_customer()) # second run — 2024-04-09 22:13:07.943703 +``` + +*`dim_customer` destination table after second run—customer key 2 was not retired:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `customer_key` | `c1` | `c2` | +| -- | -- | -- | -- | -- | +| 2024-04-09 18:27:53.734235 | **2024-04-09 22:13:07.943703** | 1 | foo | 1 | +| 2024-04-09 18:27:53.734235 | NULL | 2 | bar | 2 | +| **2024-04-09 22:13:07.943703** | **NULL** | **1** | **foo_updated** | **1** | + +*Case 2: only retire records for given partitions* + +:::note +Technically this is not SCD2 because the key used to merge records is not a natural key. +::: + +You can set a "partition" column as `merge_key` to retire absent rows for given partitions. In this case you only consider absent rows deleted if their partition value is present in the extract. Physical partitioning of the table is not required—the word "partition" is used conceptually here. + +```py +@dlt.resource( + merge_key="date", + write_disposition={"disposition": "merge", "strategy": "scd2"} +) +def some_data(): + # load 1 — "2024-01-01" partition + yield [ + {"date": "2024-01-01", "name": "a"}, + {"date": "2024-01-01", "name": "b"}, + ] + +pipeline.run(some_data()) # first run — 2024-01-02 03:03:35.854305 +... +``` + +*`some_data` destination table after first run:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `date` | `name` | +| -- | -- | -- | -- | +| 2024-01-02 03:03:35.854305 | NULL | 2024-01-01 | a | +| 2024-01-02 03:03:35.854305 | NULL | 2024-01-01 | b | + +```py +... +def some_data(): + # load 2 — "2024-01-02" partition + yield [ + {"date": "2024-01-02", "name": "c"}, + {"date": "2024-01-02", "name": "d"}, + ] + +pipeline.run(some_data()) # second run — 2024-01-03 03:01:11.943703 +... +``` + +*`some_data` destination table after second run—added 2024-01-02 records, did not touch 2024-01-01 records:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `date` | `name` | +| -- | -- | -- | -- | +| 2024-01-02 03:03:35.854305 | NULL | 2024-01-01 | a | +| 2024-01-02 03:03:35.854305 | NULL | 2024-01-01 | b | +| **2024-01-03 03:01:11.943703** | **NULL** | **2024-01-02** | **c** | +| **2024-01-03 03:01:11.943703** | **NULL** | **2024-01-02** | **d** | + +```py +... +def some_data(): + # load 3 — reload "2024-01-01" partition + yield [ + {"date": "2024-01-01", "name": "a"}, # unchanged + {"date": "2024-01-01", "name": "bb"}, # new + ] + +pipeline.run(some_data()) # third run — 2024-01-03 10:30:05.750356 +... +``` + +*`some_data` destination table after third run—retired b, added bb, did not touch 2024-01-02 partition:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `date` | `name` | +| -- | -- | -- | -- | +| 2024-01-02 03:03:35.854305 | NULL | 2024-01-01 | a | +| 2024-01-02 03:03:35.854305 | **2024-01-03 10:30:05.750356** | 2024-01-01 | b | +| 2024-01-03 03:01:11.943703 | NULL | 2024-01-02 | c | +| 2024-01-03 03:01:11.943703 | NULL | 2024-01-02 | d | +| **2024-01-03 10:30:05.750356** | **NULL** | **2024-01-01** | **bb** | + + #### Example: configure validity column names `_dlt_valid_from` and `_dlt_valid_to` are used by default as validity column names. 
Other names can be configured as follows: ```py diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 0ecbbea89d..3f8ccfc20f 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -334,8 +334,8 @@ def test_typeddict_friendly_exceptions() -> None: wrong_dict["write_disposition"] = {"strategy": "scd2"} validate_dict(EndpointResource, wrong_dict, ".") print(e.value) - # Union of 3 types and callable - assert len(e.value.nested_exceptions) == 4 + # Union of 4 types and callable + assert len(e.value.nested_exceptions) == 5 # this has wrong disposition string with pytest.raises(DictValidationException) as e: @@ -343,8 +343,8 @@ def test_typeddict_friendly_exceptions() -> None: wrong_dict["write_disposition"] = "unknown" # type: ignore[assignment] validate_dict(EndpointResource, wrong_dict, ".") print(e.value) - # Union of 3 types and callable - assert len(e.value.nested_exceptions) == 4 + # Union of 4 types and callable + assert len(e.value.nested_exceptions) == 5 # this has wrong nested type with pytest.raises(DictValidationException) as e: diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index c75ff4d3e6..3e08b792ed 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -9,13 +9,12 @@ from dlt.common.typing import TAnyDateTime from dlt.common.pendulum import pendulum from dlt.common.pipeline import LoadInfo -from dlt.common.schema.exceptions import ColumnNameConflictException +from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import DEFAULT_VALIDITY_COLUMN_NAMES from dlt.common.normalizers.json.relational import DataItemNormalizer from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision from dlt.extract.resource import DltResource -from dlt.pipeline.exceptions import PipelineStepFailed from tests.cases import arrow_table_all_data_types from tests.load.utils import ( @@ -32,6 +31,7 @@ from tests.utils import TPythonTableFormat get_row_hash = DataItemNormalizer.get_row_hash +FROM, TO = DEFAULT_VALIDITY_COLUMN_NAMES def get_load_package_created_at(pipeline: dlt.Pipeline, load_info: LoadInfo) -> datetime: @@ -74,40 +74,21 @@ def get_table( @pytest.mark.essential @pytest.mark.parametrize( - "destination_config,simple,validity_column_names,active_record_timestamp", - # test basic cases for alle SQL destinations supporting merge - [ - (dconf, True, None, None) - for dconf in destinations_configs(default_sql_configs=True, supports_merge=True) - ] - + [ - (dconf, True, None, pendulum.DateTime(2099, 12, 31, 22, 2, 59)) # arbitrary timestamp - for dconf in destinations_configs(default_sql_configs=True, supports_merge=True) - ] - + [ # test nested columns and validity column name configuration only for postgres and duckdb - (dconf, False, ["from", "to"], None) - for dconf in destinations_configs(default_sql_configs=True, subset=["postgres", "duckdb"]) - ] - + [ - (dconf, False, ["ValidFrom", "ValidTo"], None) - for dconf in destinations_configs(default_sql_configs=True, subset=["postgres", "duckdb"]) - ], - ids=lambda x: ( - x.name - if isinstance(x, DestinationTestConfiguration) - else (x[0] + "-" + x[1] if isinstance(x, list) else x) - ), + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "validity_column_names", + [None, 
["from", "to"], ["ValidFrom", "ValidTo"]], + ids=lambda x: x[0] + "-" + x[1] if isinstance(x, list) else x, ) def test_core_functionality( destination_config: DestinationTestConfiguration, - simple: bool, validity_column_names: List[str], - active_record_timestamp: Optional[pendulum.DateTime], ) -> None: - # somehow destination_config comes through as ParameterSet instead of - # DestinationTestConfiguration - destination_config = destination_config.values[0] # type: ignore[attr-defined] - + if validity_column_names is not None and destination_config.destination_type != "postgres": + pytest.skip("test `validity_column_names` configuration only for `postgres`") p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( @@ -116,7 +97,6 @@ def test_core_functionality( "disposition": "merge", "strategy": "scd2", "validity_column_names": validity_column_names, - "active_record_timestamp": active_record_timestamp, }, ) def r(data): @@ -131,8 +111,8 @@ def r(data): # load 1 — initial load dim_snap = [ - {"nk": 1, "c1": "foo", "c2": "foo" if simple else {"nc1": "foo"}}, - {"nk": 2, "c1": "bar", "c2": "bar" if simple else {"nc1": "bar"}}, + {"nk": 1, "c1": "foo", "c2": {"nc1": "foo"}}, + {"nk": 2, "c1": "bar", "c2": {"nc1": "bar"}}, ] info = p.run(r(dim_snap), **destination_config.run_kwargs) assert_load_info(info) @@ -148,93 +128,92 @@ def r(data): # assert load results ts_1 = get_load_package_created_at(p, info) assert_load_info(info) - cname = "c2" if simple else "c2__nc1" - assert get_table(p, "dim_test", cname) == [ + assert get_table(p, "dim_test", "c2__nc1") == [ { from_: ts_1, - to: active_record_timestamp, + to: None, "nk": 2, "c1": "bar", - cname: "bar", + "c2__nc1": "bar", }, { from_: ts_1, - to: active_record_timestamp, + to: None, "nk": 1, "c1": "foo", - cname: "foo", + "c2__nc1": "foo", }, ] # load 2 — update a record dim_snap = [ - {"nk": 1, "c1": "foo", "c2": "foo_updated" if simple else {"nc1": "foo_updated"}}, - {"nk": 2, "c1": "bar", "c2": "bar" if simple else {"nc1": "bar"}}, + {"nk": 1, "c1": "foo", "c2": {"nc1": "foo_updated"}}, + {"nk": 2, "c1": "bar", "c2": {"nc1": "bar"}}, ] info = p.run(r(dim_snap), **destination_config.run_kwargs) ts_2 = get_load_package_created_at(p, info) assert_load_info(info) - assert get_table(p, "dim_test", cname) == [ + assert get_table(p, "dim_test", "c2__nc1") == [ { from_: ts_1, - to: active_record_timestamp, + to: None, "nk": 2, "c1": "bar", - cname: "bar", + "c2__nc1": "bar", }, - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", cname: "foo"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", "c2__nc1": "foo"}, { from_: ts_2, - to: active_record_timestamp, + to: None, "nk": 1, "c1": "foo", - cname: "foo_updated", + "c2__nc1": "foo_updated", }, ] # load 3 — delete a record dim_snap = [ - {"nk": 1, "c1": "foo", "c2": "foo_updated" if simple else {"nc1": "foo_updated"}}, + {"nk": 1, "c1": "foo", "c2": {"nc1": "foo_updated"}}, ] info = p.run(r(dim_snap), **destination_config.run_kwargs) ts_3 = get_load_package_created_at(p, info) assert_load_info(info) - assert get_table(p, "dim_test", cname) == [ - {from_: ts_1, to: ts_3, "nk": 2, "c1": "bar", cname: "bar"}, - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", cname: "foo"}, + assert get_table(p, "dim_test", "c2__nc1") == [ + {from_: ts_1, to: ts_3, "nk": 2, "c1": "bar", "c2__nc1": "bar"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", "c2__nc1": "foo"}, { from_: ts_2, - to: active_record_timestamp, + to: None, "nk": 1, "c1": "foo", - cname: "foo_updated", + "c2__nc1": "foo_updated", 
}, ] # load 4 — insert a record dim_snap = [ - {"nk": 1, "c1": "foo", "c2": "foo_updated" if simple else {"nc1": "foo_updated"}}, - {"nk": 3, "c1": "baz", "c2": "baz" if simple else {"nc1": "baz"}}, + {"nk": 1, "c1": "foo", "c2": {"nc1": "foo_updated"}}, + {"nk": 3, "c1": "baz", "c2": {"nc1": "baz"}}, ] info = p.run(r(dim_snap), **destination_config.run_kwargs) ts_4 = get_load_package_created_at(p, info) assert_load_info(info) - assert get_table(p, "dim_test", cname) == [ - {from_: ts_1, to: ts_3, "nk": 2, "c1": "bar", cname: "bar"}, + assert get_table(p, "dim_test", "c2__nc1") == [ + {from_: ts_1, to: ts_3, "nk": 2, "c1": "bar", "c2__nc1": "bar"}, { from_: ts_4, - to: active_record_timestamp, + to: None, "nk": 3, "c1": "baz", - cname: "baz", + "c2__nc1": "baz", }, - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", cname: "foo"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", "c2__nc1": "foo"}, { from_: ts_2, - to: active_record_timestamp, + to: None, "nk": 1, "c1": "foo", - cname: "foo_updated", + "c2__nc1": "foo_updated", }, ] @@ -255,9 +234,6 @@ def test_child_table(destination_config: DestinationTestConfiguration, simple: b def r(data): yield data - # get validity column names - from_, to = DEFAULT_VALIDITY_COLUMN_NAMES - # load 1 — initial load dim_snap: List[Dict[str, Any]] = [ l1_1 := {"nk": 1, "c1": "foo", "c2": [1] if simple else [{"cc1": 1}]}, @@ -267,8 +243,8 @@ def r(data): ts_1 = get_load_package_created_at(p, info) assert_load_info(info) assert get_table(p, "dim_test", "c1") == [ - {from_: ts_1, to: None, "nk": 2, "c1": "bar"}, - {from_: ts_1, to: None, "nk": 1, "c1": "foo"}, + {FROM: ts_1, TO: None, "nk": 2, "c1": "bar"}, + {FROM: ts_1, TO: None, "nk": 1, "c1": "foo"}, ] cname = "value" if simple else "cc1" assert get_table(p, "dim_test__c2", cname) == [ @@ -286,9 +262,9 @@ def r(data): ts_2 = get_load_package_created_at(p, info) assert_load_info(info) assert get_table(p, "dim_test", "c1") == [ - {from_: ts_1, to: None, "nk": 2, "c1": "bar"}, - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, # updated - {from_: ts_2, to: None, "nk": 1, "c1": "foo_updated"}, # new + {FROM: ts_1, TO: None, "nk": 2, "c1": "bar"}, + {FROM: ts_1, TO: ts_2, "nk": 1, "c1": "foo"}, # updated + {FROM: ts_2, TO: None, "nk": 1, "c1": "foo_updated"}, # new ] assert_records_as_set( get_table(p, "dim_test__c2"), @@ -315,10 +291,10 @@ def r(data): assert_records_as_set( get_table(p, "dim_test"), [ - {from_: ts_1, to: None, "nk": 2, "c1": "bar"}, - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, - {from_: ts_2, to: ts_3, "nk": 1, "c1": "foo_updated"}, # updated - {from_: ts_3, to: None, "nk": 1, "c1": "foo_updated"}, # new + {FROM: ts_1, TO: None, "nk": 2, "c1": "bar"}, + {FROM: ts_1, TO: ts_2, "nk": 1, "c1": "foo"}, + {FROM: ts_2, TO: ts_3, "nk": 1, "c1": "foo_updated"}, # updated + {FROM: ts_3, TO: None, "nk": 1, "c1": "foo_updated"}, # new ], ) exp_3 = [ @@ -341,10 +317,10 @@ def r(data): assert_records_as_set( get_table(p, "dim_test"), [ - {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, # updated - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, - {from_: ts_2, to: ts_3, "nk": 1, "c1": "foo_updated"}, - {from_: ts_3, to: None, "nk": 1, "c1": "foo_updated"}, + {FROM: ts_1, TO: ts_4, "nk": 2, "c1": "bar"}, # updated + {FROM: ts_1, TO: ts_2, "nk": 1, "c1": "foo"}, + {FROM: ts_2, TO: ts_3, "nk": 1, "c1": "foo_updated"}, + {FROM: ts_3, TO: None, "nk": 1, "c1": "foo_updated"}, ], ) assert_records_as_set( @@ -362,11 +338,11 @@ def r(data): assert_records_as_set( get_table(p, "dim_test"), [ - {from_: ts_1, to: ts_4, 
"nk": 2, "c1": "bar"}, - {from_: ts_5, to: None, "nk": 3, "c1": "baz"}, # new - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, - {from_: ts_2, to: ts_3, "nk": 1, "c1": "foo_updated"}, - {from_: ts_3, to: None, "nk": 1, "c1": "foo_updated"}, + {FROM: ts_1, TO: ts_4, "nk": 2, "c1": "bar"}, + {FROM: ts_5, TO: None, "nk": 3, "c1": "baz"}, # new + {FROM: ts_1, TO: ts_2, "nk": 1, "c1": "foo"}, + {FROM: ts_2, TO: ts_3, "nk": 1, "c1": "foo_updated"}, + {FROM: ts_3, TO: None, "nk": 1, "c1": "foo_updated"}, ], ) assert_records_as_set( @@ -519,13 +495,12 @@ def r(data): ts_3 = get_load_package_created_at(p, info) # assert parent records - from_, to = DEFAULT_VALIDITY_COLUMN_NAMES r1_no_child = {k: v for k, v in r1.items() if k != "child"} r2_no_child = {k: v for k, v in r2.items() if k != "child"} expected = [ - {**{from_: ts_1, to: ts_2}, **r1_no_child}, - {**{from_: ts_3, to: None}, **r1_no_child}, - {**{from_: ts_1, to: None}, **r2_no_child}, + {**{FROM: ts_1, TO: ts_2}, **r1_no_child}, + {**{FROM: ts_3, TO: None}, **r1_no_child}, + {**{FROM: ts_1, TO: None}, **r2_no_child}, ] assert_records_as_set(get_table(p, "dim_test"), expected) @@ -653,10 +628,9 @@ def r(data): info = p.run(r(dim_snap), **destination_config.run_kwargs) assert_load_info(info) assert load_table_counts(p, "dim_test")["dim_test"] == 2 - from_, to = DEFAULT_VALIDITY_COLUMN_NAMES expected = [ - {**{from_: strip_timezone(ts1), to: None}, **l1_1}, - {**{from_: strip_timezone(ts1), to: None}, **l1_2}, + {**{FROM: strip_timezone(ts1), TO: None}, **l1_1}, + {**{FROM: strip_timezone(ts1), TO: None}, **l1_2}, ] assert get_table(p, "dim_test", "nk") == expected @@ -677,10 +651,10 @@ def r(data): assert_load_info(info) assert load_table_counts(p, "dim_test")["dim_test"] == 4 expected = [ - {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_1}, # retired - {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_2}, # retired - {**{from_: strip_timezone(ts2), to: None}, **l2_1}, # new - {**{from_: strip_timezone(ts2), to: None}, **l2_3}, # new + {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1}, # retired + {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2}, # retired + {**{FROM: strip_timezone(ts2), TO: None}, **l2_1}, # new + {**{FROM: strip_timezone(ts2), TO: None}, **l2_3}, # new ] assert_records_as_set(get_table(p, "dim_test"), expected) @@ -699,10 +673,10 @@ def r(data): assert_load_info(info) assert load_table_counts(p, "dim_test")["dim_test"] == 4 expected = [ - {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_1}, # unchanged - {**{from_: strip_timezone(ts1), to: strip_timezone(ts2)}, **l1_2}, # unchanged - {**{from_: strip_timezone(ts2), to: None}, **l2_1}, # unchanged - {**{from_: strip_timezone(ts2), to: strip_timezone(ts3)}, **l2_3}, # retired + {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1}, # unchanged + {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2}, # unchanged + {**{FROM: strip_timezone(ts2), TO: None}, **l2_1}, # unchanged + {**{FROM: strip_timezone(ts2), TO: strip_timezone(ts3)}, **l2_3}, # retired ] assert_records_as_set(get_table(p, "dim_test"), expected) @@ -717,6 +691,196 @@ def r(data): ) +@pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +def test_merge_key_natural_key( + destination_config: DestinationTestConfiguration, +) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + 
+ @dlt.resource( + merge_key="nk", + write_disposition={"disposition": "merge", "strategy": "scd2"}, + ) + def dim_test(data): + yield data + + # load 1 — initial load + dim_snap = [ + {"nk": 1, "foo": "foo"}, + {"nk": 2, "foo": "foo"}, + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + # both records should be active (i.e. not retired) + assert [row[TO] for row in get_table(p, "dim_test")] == [None, None] + + # load 2 — natural key 2 is absent, natural key 1 is unchanged + dim_snap = [ + {"nk": 1, "foo": "foo"}, + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + # both records should still be active + assert [row[TO] for row in get_table(p, "dim_test")] == [None, None] + + # load 3 — natural key 2 is absent, natural key 1 has changed + dim_snap = [ + {"nk": 1, "foo": "bar"}, + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 3 + ts3 = get_load_package_created_at(p, info) + # natural key 1 should now have two records (one retired, one active) + actual = [{k: v for k, v in row.items() if k in ("nk", TO)} for row in get_table(p, "dim_test")] + expected = [{"nk": 1, TO: ts3}, {"nk": 1, TO: None}, {"nk": 2, TO: None}] + assert_records_as_set(actual, expected) # type: ignore[arg-type] + + # load 4 — natural key 2 is absent, natural key 1 has changed back to + # initial version + dim_snap = [ + {"nk": 1, "foo": "foo"}, + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 4 + ts4 = get_load_package_created_at(p, info) + # natural key 1 should now have three records (two retired, one active) + actual = [{k: v for k, v in row.items() if k in ("nk", TO)} for row in get_table(p, "dim_test")] + expected = [{"nk": 1, TO: ts3}, {"nk": 1, TO: ts4}, {"nk": 1, TO: None}, {"nk": 2, TO: None}] + assert_records_as_set(actual, expected) # type: ignore[arg-type] + + +@pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("key_type", ("text", "bigint")) +def test_merge_key_compound_natural_key( + destination_config: DestinationTestConfiguration, + key_type: TDataType, +) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + + @dlt.resource( + merge_key=["first_name", "last_name"], + write_disposition={"disposition": "merge", "strategy": "scd2"}, + ) + def dim_test_compound(data): + yield data + + # vary `first_name` type to test mixed compound `merge_key` + if key_type == "text": + first_name = "John" + elif key_type == "bigint": + first_name = 1 # type: ignore[assignment] + # load 1 — initial load + dim_snap = [ + {"first_name": first_name, "last_name": "Doe", "age": 20}, + {"first_name": first_name, "last_name": "Dodo", "age": 20}, + ] + info = p.run(dim_test_compound(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test_compound")["dim_test_compound"] == 2 + # both records should be active (i.e. 
not retired) + assert [row[TO] for row in get_table(p, "dim_test_compound")] == [None, None] + + # load 2 — "Dodo" is absent, "Doe" has changed + dim_snap = [ + {"first_name": first_name, "last_name": "Doe", "age": 30}, + ] + info = p.run(dim_test_compound(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test_compound")["dim_test_compound"] == 3 + ts3 = get_load_package_created_at(p, info) + # "Doe" should now have two records (one retired, one active) + actual = [ + {k: v for k, v in row.items() if k in ("first_name", "last_name", TO)} + for row in get_table(p, "dim_test_compound") + ] + expected = [ + {"first_name": first_name, "last_name": "Doe", TO: ts3}, + {"first_name": first_name, "last_name": "Doe", TO: None}, + {"first_name": first_name, "last_name": "Dodo", TO: None}, + ] + assert_records_as_set(actual, expected) # type: ignore[arg-type] + + +@pytest.mark.essential +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +def test_merge_key_partition( + destination_config: DestinationTestConfiguration, +) -> None: + p = destination_config.setup_pipeline("abstract", dev_mode=True) + + @dlt.resource( + merge_key="date", + write_disposition={"disposition": "merge", "strategy": "scd2"}, + ) + def dim_test(data): + yield data + + # load 1 — "2024-01-01" partition + dim_snap = [ + {"date": "2024-01-01", "name": "a"}, + {"date": "2024-01-01", "name": "b"}, + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 2 + # both records should be active (i.e. not retired) + assert [row[TO] for row in get_table(p, "dim_test")] == [None, None] + + # load 2 — "2024-01-02" partition + dim_snap = [ + {"date": "2024-01-02", "name": "c"}, + {"date": "2024-01-02", "name": "d"}, + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + assert load_table_counts(p, "dim_test")["dim_test"] == 4 + # two "2024-01-01" records should be untouched, two "2024-01-02" records should + # be added + assert [row[TO] for row in get_table(p, "dim_test")] == [None, None, None, None] + + # load 3 — reload "2024-01-01" partition + dim_snap = [ + {"date": "2024-01-01", "name": "a"}, # unchanged + {"date": "2024-01-01", "name": "bb"}, # new + ] + info = p.run(dim_test(dim_snap), **destination_config.run_kwargs) + assert_load_info(info) + # "b" should be retired, "bb" should be added, "2024-01-02" partition + # should be untouched + assert load_table_counts(p, "dim_test")["dim_test"] == 5 + ts2 = get_load_package_created_at(p, info) + actual = [ + {k: v for k, v in row.items() if k in ("date", "name", TO)} + for row in get_table(p, "dim_test") + ] + expected = [ + {"date": "2024-01-01", "name": "a", TO: None}, + {"date": "2024-01-01", "name": "b", TO: ts2}, + {"date": "2024-01-01", "name": "bb", TO: None}, + {"date": "2024-01-02", "name": "c", TO: None}, + {"date": "2024-01-02", "name": "d", TO: None}, + ] + assert_records_as_set(actual, expected) # type: ignore[arg-type] + + @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), @@ -750,9 +914,8 @@ def _make_scd2_r(table_: Any) -> DltResource: # make sure we have scd2 columns in schema table_schema = p.default_schema.get_table("tabular") assert table_schema["x-merge-strategy"] == "scd2" # type: ignore[typeddict-item] - from_, to = 
DEFAULT_VALIDITY_COLUMN_NAMES - assert table_schema["columns"][from_]["x-valid-from"] # type: ignore[typeddict-item] - assert table_schema["columns"][to]["x-valid-to"] # type: ignore[typeddict-item] + assert table_schema["columns"][FROM]["x-valid-from"] # type: ignore[typeddict-item] + assert table_schema["columns"][TO]["x-valid-to"] # type: ignore[typeddict-item] assert table_schema["columns"]["row_hash"]["x-row-version"] # type: ignore[typeddict-item] # 100 items in destination assert load_table_counts(p, "tabular")["tabular"] == 100 @@ -816,13 +979,12 @@ def r(data): ts_2 = get_load_package_created_at(p, info) # assert load results - from_, to = DEFAULT_VALIDITY_COLUMN_NAMES assert get_table(p, "dim_test", "c1") == [ - {from_: ts_1, to: ts_2, "nk": 2, "c1": "bar", "row_hash": "mocked_hash_2"}, - {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", "row_hash": "mocked_hash_1"}, + {FROM: ts_1, TO: ts_2, "nk": 2, "c1": "bar", "row_hash": "mocked_hash_2"}, + {FROM: ts_1, TO: ts_2, "nk": 1, "c1": "foo", "row_hash": "mocked_hash_1"}, { - from_: ts_2, - to: None, + FROM: ts_2, + TO: None, "nk": 1, "c1": "foo_upd", "row_hash": "mocked_hash_1_upd", From 93cd5a6aae1b86c96e18aba5081694a2bc98133a Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Mon, 30 Sep 2024 20:05:28 +0300 Subject: [PATCH 06/29] Update weaviate reference (#1896) Signed-off-by: Emmanuel Ferdman --- docs/website/docs/dlt-ecosystem/destinations/weaviate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index cce54654b8..214cc3aa4b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -305,7 +305,7 @@ Below is an example that configures the **contextionary** vectorizer. You can pu vectorizer="text2vec-contextionary" module_config={text2vec-contextionary = { vectorizeClassName = false, vectorizePropertyName = true}} ``` -You can find Docker Compose with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/weaviate/README.md). +You can find Docker Compose with the instructions to run [here](https://github.com/dlt-hub/dlt/tree/devel/dlt/destinations/impl/weaviate/README.md). ### dbt support From 854905fb56576bc608b01b6b047208df888160a7 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Mon, 30 Sep 2024 19:12:08 +0200 Subject: [PATCH 07/29] Expand ENV abbreviation in the docs (#1846) --- .../dlt-ecosystem/destinations/destination.md | 2 +- .../file-formats/_set_the_format.mdx | 2 +- .../docs/general-usage/credentials/setup.md | 30 ++++++++--------- docs/website/docs/tutorial/filesystem.md | 4 +-- .../deploy-a-pipeline/deploy-with-dagster.md | 32 +++++++++---------- 5 files changed, 35 insertions(+), 35 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md index 7b1e1b23a4..a7f7c5fe16 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/destination.md +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -145,7 +145,7 @@ There are multiple ways to pass the custom destination function to the `dlt` pip ) ) ``` -- Via a fully qualified string to function location (can be used from `config.toml` or ENV vars). The destination function should be located in another file. 
+- Via a fully qualified string to function location (this can be set in `config.toml` or through environment variables). The destination function should be located in another file. ```py # File my_pipeline.py diff --git a/docs/website/docs/dlt-ecosystem/file-formats/_set_the_format.mdx b/docs/website/docs/dlt-ecosystem/file-formats/_set_the_format.mdx index e2cce374a2..285f9b0264 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/_set_the_format.mdx +++ b/docs/website/docs/dlt-ecosystem/file-formats/_set_the_format.mdx @@ -16,7 +16,7 @@ info = pipeline.run(some_source(), loader_file_format="{props.file_type}") loader_file_format="{props.file_type}" -3. You can set the `loader_file_format` via ENV variable: +3. You can set the `loader_file_format` via environment variable:
 export NORMALIZE__LOADER_FILE_FORMAT="{props.file_type}"
diff --git a/docs/website/docs/general-usage/credentials/setup.md b/docs/website/docs/general-usage/credentials/setup.md
index 5f05e68b6d..4210ab5422 100644
--- a/docs/website/docs/general-usage/credentials/setup.md
+++ b/docs/website/docs/general-usage/credentials/setup.md
@@ -45,8 +45,8 @@ The most specific possible path for **sources** looks like:
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
   
@@ -78,8 +78,8 @@ The most specific possible path for **destinations** looks like:
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
   
@@ -285,8 +285,8 @@ Let's assume we have a [notion](../../dlt-ecosystem/verified-sources/notion) sou
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
 
@@ -319,7 +319,7 @@ aws_secret_access_key = "1234567890_access_key" # copy the secret access key her
   
 
 ```sh
-# ENV vars are set up the same way both for configs and secrets
+# Environment variables are set up the same way both for configs and secrets
 export RUNTIME__LOG_LEVEL="INFO"
 export DESTINATION__FILESYSTEM__BUCKET_URL="s3://[your_bucket_name]"
 export NORMALIZE__DATA_WRITER__DISABLE_COMPRESSION="true"
@@ -376,8 +376,8 @@ Let's assume we use the `bigquery` destination and the `google_sheets` source. T
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
 
@@ -424,8 +424,8 @@ os.environ["CREDENTIALS__PROJECT_ID"] = os.environ.get("GOOGLE_PROJECT_ID")
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
 
@@ -506,8 +506,8 @@ Let's assume we have several different Google sources and destinations. We can u
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
 
@@ -590,8 +590,8 @@ Let's assume we have several sources of the same type. How can we separate them
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
 
diff --git a/docs/website/docs/tutorial/filesystem.md b/docs/website/docs/tutorial/filesystem.md
index 6d30eed3e6..b2555db39b 100644
--- a/docs/website/docs/tutorial/filesystem.md
+++ b/docs/website/docs/tutorial/filesystem.md
@@ -112,8 +112,8 @@ Let's specify the bucket URL and credentials. We can do this using the following
   groupId="config-provider-type"
   defaultValue="toml"
   values={[
-    {"label": "Toml config provider", "value": "toml"},
-    {"label": "ENV variables", "value": "env"},
+    {"label": "TOML config provider", "value": "toml"},
+    {"label": "Environment variables", "value": "env"},
     {"label": "In the code", "value": "code"},
 ]}>
 
diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md
index 14ac18b3e7..e27bb2966a 100644
--- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-dagster.md
@@ -184,17 +184,17 @@ For a complete picture of Dagster's integration with dlt, please refer to their
 ### Frequently Asked Questions
 - **Can I remove the generated `.dlt` folder with `secrets.toml` and `config.toml` files?**
 
-  Yes. Since dlt is compatible with ENV variables, you can use this for secrets required by both Dagster and dlt.
-  
+  Yes. Since dlt is compatible with environment variables, you can use this for secrets required by both Dagster and dlt.
+
 - **I'm working with several sources – how can I best group these assets?**
 
   To effectively group assets in Dagster when working with multiple sources, use the `group_name` parameter in your `@dlt_assets` decorator. This helps organize and visualize assets related to a particular source or theme in the Dagster UI. Here’s a simplified example:
-  
+
   ```py
   import dlt
   from dagster_embedded_elt.dlt import dlt_assets
   from dlt_sources.google_analytics import google_analytics
-  
+
   # Define assets for the first Google Analytics source
   @dlt_assets(
       dlt_source=google_analytics(),
@@ -207,7 +207,7 @@ For a complete picture of Dagster's integration with dlt, please refer to their
   )
   def google_analytics_assets_1(context, dlt):
       yield from dlt.run(context=context)
-  
+
   # Define assets for the second Google Analytics source
   @dlt_assets(
       dlt_source=google_analytics(),
@@ -222,18 +222,18 @@ For a complete picture of Dagster's integration with dlt, please refer to their
       yield from dlt.run(context=context)
   ```
 
- 
-  
+
+
 - **How can I use `bigquery_adapter` with `@dlt_assets` in Dagster for partitioned tables?**
-   
-  To use `bigquery_adapter` with `@dlt_assets` in Dagster for partitioned tables, modify your resource setup to include `bigquery_adapter` with the partition parameter. Here's a quick example:  
-  
+
+  To use `bigquery_adapter` with `@dlt_assets` in Dagster for partitioned tables, modify your resource setup to include `bigquery_adapter` with the partition parameter. Here's a quick example:
+
   ```py
   import dlt
   from google.analytics import BetaAnalyticsDataClient
   from dlt.destinations.adapters import bigquery_adapter
   from dagster import dlt_asset
-  
+
   @dlt_asset
   def google_analytics_asset(context):
       # Configuration (replace with your actual values or parameters)
@@ -244,20 +244,20 @@ For a complete picture of Dagster's integration with dlt, please refer to their
       start_date = "2024-01-01"
       rows_per_page = 1000
       credentials = your_credentials
-  
+
       # Initialize Google Analytics client
       client = BetaAnalyticsDataClient(credentials=credentials.to_native_credentials())
-  
+
       # Fetch metadata
       metadata = get_metadata(client=client, property_id=property_id)
       resource_list = [metadata | metrics_table, metadata | dimensions_table]
-  
+
       # Configure and add resources to the list
       for query in queries:
           dimensions = query["dimensions"]
           if "date" not in dimensions:
               dimensions.append("date")
-  
+
           resource_name = query["resource_name"]
           resource_list.append(
               bigquery_adapter(
@@ -274,7 +274,7 @@ For a complete picture of Dagster's integration with dlt, please refer to their
                   partition="date"
               )
           )
-  
+
       return resource_list
   ```
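Returning to the first question in this FAQ (dropping the generated `.dlt` folder): as a minimal, hypothetical sketch of how a `secrets.toml` entry maps to an environment variable, assume a `postgres` destination — the names below are illustrative and not taken from this patch. dlt resolves `[destination.postgres.credentials] password` from `DESTINATION__POSTGRES__CREDENTIALS__PASSWORD` (keys upper-cased, dots between sections replaced with double underscores), so the value can come from Dagster's runtime environment instead of a checked-in file:

```py
import os

import dlt

# Hypothetical example: this mirrors a secrets.toml entry such as
#   [destination.postgres.credentials]
#   password = "..."
# In a real Dagster deployment the variable would be set on the code location /
# run environment rather than assigned in Python code.
os.environ["DESTINATION__POSTGRES__CREDENTIALS__PASSWORD"] = "example-password"

pipeline = dlt.pipeline(
    pipeline_name="dagster_example",
    destination="postgres",
    dataset_name="example_data",
)
```

The same naming scheme covers source secrets, so both Dagster and dlt can read their configuration from one set of environment variables.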
 

From 2eb8cfe54eb4d6d3a595373ce2551f0ae6089d06 Mon Sep 17 00:00:00 2001
From: Violetta Mishechkina 
Date: Tue, 1 Oct 2024 15:00:58 +0200
Subject: [PATCH 08/29] Docs: Add sftp option for filesystem source (#1845)

---
 .../dlt-ecosystem/destinations/filesystem.md  |  5 ++-
 .../verified-sources/filesystem/basic.md      | 36 +++++++++++++++++--
 .../verified-sources/filesystem/index.md      |  7 ++--
 docs/website/docs/tutorial/filesystem.md      |  2 +-
 docs/website/sidebars.js                      |  2 +-
 5 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
index a456fa6e7d..2be382c326 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -302,7 +302,10 @@ sftp_gss_deleg_creds  # Delegate credentials with GSS-API, defaults to True
 sftp_gss_host         # Host for GSS-API, defaults to None
 sftp_gss_trust_dns    # Trust DNS for GSS-API, defaults to True
 ```
-> For more information about credentials parameters: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+
+:::info
+For more information about these credential parameters, see the paramiko documentation: https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+:::
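Because these options are passed through to `paramiko.SSHClient.connect` (linked above), one way to sanity-check a credential set before configuring the destination is to open the same connection with paramiko directly. This is only an illustrative sketch — the host name, username, and key path are placeholders, not values from this page:

```py
import paramiko

# Placeholder connection details - replace with your own SFTP server and key.
HOST = "sftp.example.com"
USERNAME = "foo"
KEY_FILENAME = "/path/to/id_rsa"

client = paramiko.SSHClient()
# Auto-accepting unknown host keys keeps the sketch short; verify host keys in real use.
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(HOST, username=USERNAME, key_filename=KEY_FILENAME)

# Listing the remote directory confirms that authentication works.
sftp = client.open_sftp()
print(sftp.listdir("."))
client.close()
```

If this connects and lists files, the same values should work in the corresponding `sftp_*` credential fields.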
 
 ### Authentication methods
 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
index 847ff64bf1..6eb02b4edf 100644
--- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
+++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md
@@ -6,7 +6,7 @@ keywords: [readers source and filesystem, files, filesystem, readers source, clo
 import Header from '../_source-info-header.md';
 
-Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. +Filesystem source allows loading files from remote locations (AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage, SFTP server) or the local filesystem seamlessly. Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. To load unstructured data (`.pdf`, `.txt`, e-mail), please refer to the [unstructured data source](https://github.com/dlt-hub/verified-sources/tree/master/sources/unstructured_data). @@ -75,6 +75,7 @@ To get started with your data pipeline, follow these steps: {"label": "AWS S3", "value": "aws"}, {"label": "GCS/GDrive", "value": "gcp"}, {"label": "Azure", "value": "azure"}, + {"label": "SFTP", "value": "sftp"}, {"label": "Local filesystem", "value": "local"}, ]}> @@ -122,6 +123,18 @@ For more info, see + + +dlt supports several authentication methods: + +1. Key-based authentication +2. SSH Agent-based authentication +3. Username/Password authentication +4. GSS-API authentication + +Learn more about SFTP authentication options in [SFTP section](../../destinations/filesystem#sftp). To obtain credentials, contact your server administrator. + + You don't need any credentials for the local filesystem. @@ -143,6 +156,7 @@ a bucket, can be specified in `config.toml`. {"label": "AWS S3", "value": "aws"}, {"label": "GCS/GDrive", "value": "gcp"}, {"label": "Azure", "value": "azure"}, + {"label": "SFTP", "value": "sftp"}, {"label": "Local filesystem", "value": "local"}, ]}> @@ -195,6 +209,24 @@ bucket_url="gs:////" ``` + + +Learn how to set up SFTP credentials for each authentication method in the [SFTP section](../../destinations/filesystem#sftp). +For example, in case of key-based authentication, you can configure the source the following way: + +```toml +# secrets.toml +[sources.filesystem.credentials] +sftp_username = "foo" +sftp_key_filename = "/path/to/id_rsa" # Replace with the path to your private key file +sftp_key_passphrase = "your_passphrase" # Optional: passphrase for your private key + +# config.toml +[sources.filesystem] # use [sources.readers.credentials] for the "readers" source +bucket_url = "sftp://[hostname]/[path]" +``` + + You can use both native local filesystem paths and `file://` URI. Absolute, relative, and UNC Windows paths are supported. @@ -219,7 +251,7 @@ bucket_url='~\Documents\csv_files\' You can also specify the credentials using Environment variables. The name of the corresponding environment -variable should be slightly different than the corresponding name in the `toml` file. Simply replace dots `.` with double +variable should be slightly different from the corresponding name in the `toml` file. 
Simply replace dots `.` with double underscores `__`: ```sh diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md index 32e0df77c2..1441931340 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md @@ -1,6 +1,6 @@ --- -title: Filesystem & Buckets -description: dlt-verified source for Filesystem & Buckets +title: Filesystem & cloud storage +description: dlt-verified source for Filesystem & cloud storage keywords: [readers source and filesystem, files, filesystem, readers source, cloud storage] --- @@ -8,7 +8,8 @@ The Filesystem source allows seamless loading of files from the following locati * AWS S3 * Google Cloud Storage * Google Drive -* Azure +* Azure Blob Storage +* remote filesystem (via SFTP) * local filesystem The Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. diff --git a/docs/website/docs/tutorial/filesystem.md b/docs/website/docs/tutorial/filesystem.md index b2555db39b..f939cc1f4f 100644 --- a/docs/website/docs/tutorial/filesystem.md +++ b/docs/website/docs/tutorial/filesystem.md @@ -4,7 +4,7 @@ description: Learn how to load data files like JSON, JSONL, CSV, and Parquet fro keywords: [dlt, tutorial, filesystem, cloud storage, file system, python, data pipeline, incremental loading, json, jsonl, csv, parquet, duckdb] --- -This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage) or a local file system. +This tutorial is for you if you need to load data files like JSONL, CSV, and Parquet from either Cloud Storage (e.g., AWS S3, Google Cloud Storage, Google Drive, Azure Blob Storage), a remote (SFTP), or a local file system. ## What you will learn diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 7e6000a2ca..32bb554842 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -67,7 +67,7 @@ const sidebars = { { type: 'category', label: 'Filesystem & cloud storage', - description: 'AWS S3, Google Cloud Storage, Azure Blob Storage, local file system', + description: 'AWS S3, Google Cloud Storage, Azure, SFTP, local file system', link: { type: 'doc', id: 'dlt-ecosystem/verified-sources/filesystem/index', From e21ab01ba463b983c96937dafc78b4c783969083 Mon Sep 17 00:00:00 2001 From: erik james mason Date: Tue, 1 Oct 2024 06:06:42 -0700 Subject: [PATCH 09/29] Fix a typo in installation.md (#1899) --- docs/website/docs/reference/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/reference/installation.md b/docs/website/docs/reference/installation.md index a19e01ae80..e64e691c20 100644 --- a/docs/website/docs/reference/installation.md +++ b/docs/website/docs/reference/installation.md @@ -109,7 +109,7 @@ C:\> .\env\Scripts\activate You can now install `dlt` in your virtual environment by running: ```sh -# install the newest dlt version or upgrade the exisint version to the newest one +# install the newest dlt version or upgrade the existing version to the newest one pip install -U dlt ``` @@ -143,4 +143,4 @@ You are now ready to build your first pipeline with `dlt`. 
Check out these tutor - [Load data from a SQL database](../tutorial/sql-database) - [Load data from a cloud storage or a file system](../tutorial/filesystem) -Or read a more detailed tutorial on how to build a [custom data pipeline with dlt](../tutorial/load-data-from-an-api.md). \ No newline at end of file +Or read a more detailed tutorial on how to build a [custom data pipeline with dlt](../tutorial/load-data-from-an-api.md). From cb9bbd96adc0fae8a88cef93e0f92bb5686613d6 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 1 Oct 2024 16:25:57 +0200 Subject: [PATCH 10/29] fix grammar pages 80-100 (#1906) * fix grammar pages 80-100 * Update docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md Co-authored-by: Violetta Mishechkina * Update docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md Co-authored-by: Violetta Mishechkina --------- Co-authored-by: Violetta Mishechkina --- .../verified-sources/airtable.md | 33 ++-- .../dlt-ecosystem/verified-sources/asana.md | 54 ++---- .../dlt-ecosystem/verified-sources/chess.md | 41 ++-- .../verified-sources/facebook_ads.md | 181 +++++++----------- .../verified-sources/freshdesk.md | 32 ++-- .../verified-sources/google_ads.md | 49 ++--- .../dlt-ecosystem/verified-sources/hubspot.md | 74 +++---- .../dlt-ecosystem/verified-sources/index.md | 7 +- .../dlt-ecosystem/verified-sources/mongodb.md | 55 +++--- .../dlt-ecosystem/verified-sources/mux.md | 19 +- .../verified-sources/pg_replication.md | 26 +-- .../verified-sources/salesforce.md | 103 ++++------ .../dlt-ecosystem/verified-sources/shopify.md | 86 ++++----- .../verified-sources/sql_database/index.md | 12 +- .../verified-sources/sql_database/setup.md | 13 +- .../sql_database/troubleshooting.md | 31 +-- .../verified-sources/sql_database/usage.md | 16 +- .../dlt-ecosystem/verified-sources/strapi.md | 9 +- .../verified-sources/workable.md | 76 ++++---- .../dlt-ecosystem/verified-sources/zendesk.md | 73 +++---- 20 files changed, 433 insertions(+), 557 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md index 43d99a02fd..a2e15bfd75 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/airtable.md @@ -14,20 +14,20 @@ data management and collaboration. This Airtable `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/airtable_pipeline.py) -loads data using “Airtable API” to the destination of your choice. +loads data using the “Airtable API” to the destination of your choice. Sources and resources that can be loaded using this verified source are: | Name | Description | | ----------------- |--------------------------------------------| | airtable_source | Retrieves tables from an Airtable base | -| airtable_resource | Retrives data from a single Airtable table | +| airtable_resource | Retrieves data from a single Airtable table | -## Setup Guide +## Setup guide ### Grab Airtable personal access tokens -1. Click your account icon top-right. +1. Click your account icon in the top-right. 1. Choose "Developer Hub" from the dropdown. 1. Select "Personal access token" on the left, then "Create new token". 1. Name it appropriately. @@ -90,16 +90,16 @@ For more information, read the guide on [how to add a verified source.](../../wa ```toml [sources.airtable] - access_token = "Please set me up!" # please set me up! + access_token = "Please set me up!" # Please set me up! 
``` 1. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). -1. Next you need to configure ".dlt/config.toml", which looks like: +1. Next, you need to configure ".dlt/config.toml", which looks like: ```toml [sources.airtable] - base_id = "Please set me up!" # The id of the base. + base_id = "Please set me up!" # The ID of the base. table_names = ["Table1","Table2"] # A list of table IDs or table names to load. ``` @@ -142,7 +142,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `airtable_source` -This function retrieves tables from given Airtable base. +This function retrieves tables from a given Airtable base. ```py @dlt.source @@ -178,10 +178,11 @@ def airtable_resource( ## Customization + + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -196,7 +197,7 @@ verified source. 1. To load the entire base: ```py - base_id = "Please set me up!" # The id of the base. + base_id = "Please set me up!" # The ID of the base. airtables = airtable_source(base_id=base_id) load_info = pipeline.run(load_data, write_disposition="replace") @@ -205,8 +206,8 @@ verified source. 1. To load selected tables from a base table: ```py - base_id = "Please set me up!" # The id of the base. - table_names = ["Table1","Table2"] # A list of table IDs or table names to load. + base_id = "Please set me up!" # The ID of the base. + table_names = ["Table1", "Table2"] # A list of table IDs or table names to load. airtables = airtable_source( base_id = base_id, @@ -221,14 +222,14 @@ verified source. 1. To load data and apply hints to a specific column: ```py - base_id = "Please set me up!" # The id of the base. - table_names = ["Table1","Table2"] # A list of table IDs or table names to load. + base_id = "Please set me up!" # The ID of the base. + table_names = ["Table1", "Table2"] # A list of table IDs or table names to load. resource_name = "Please set me up!" # The table name we want to apply hints. field_name = "Please set me up!" # The table field name for which we want to apply hints. 
airtables = airtable_source( base_id="Please set me up!", - table_names=["Table1","Table2"], + table_names=["Table1", "Table2"], ) airtables.resources[resource_name].apply_hints( diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md index 173cc42b8a..67e52596b2 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/asana.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/asana.md @@ -20,18 +20,18 @@ Resources that can be loaded using this verified source are: | Name | Description | | ---------- | ----------------------------------------------------------------------------------------------------- | -| workspaces | people, materials, or assets required to complete a task or project successfully | -| projects | collections of tasks and related information | -| sections | used to organize tasks within a project into smaller groups or categories | -| tags | labels that can be attached to tasks, projects, or conversations to help categorize and organize them | -| stories | updates or comments that team members can add to a task or project | -| teams | groups of individuals who work together to complete projects and tasks | -| users | individuals who have access to the Asana platform | +| workspaces | People, materials, or assets required to complete a task or project successfully | +| projects | Collections of tasks and related information | +| sections | Used to organize tasks within a project into smaller groups or categories | +| tags | Labels that can be attached to tasks, projects, or conversations to help categorize and organize them | +| stories | Updates or comments that team members can add to a task or project | +| teams | Groups of individuals who work together to complete projects and tasks | +| users | Individuals who have access to the Asana platform | To get a complete list of sub-endpoints that can be loaded, see [asana_dlt/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/asana_dlt/settings.py) -## Setup Guide +## Setup guide ### Grab credentials @@ -161,12 +161,9 @@ workspace from the iterator obtained. This enables the workspaces to be consumed ### Resource-transformer `projects` -In addition to these source and resource functions, there are seven transformer functions. For -various endpoints like “projects”, “sections”, “tags”, “tasks”, “stories”, “teams” and “users”. The -transformer functions transform or process data from one or more resources. +In addition to these source and resource functions, there are seven transformer functions for various endpoints like "projects", "sections", "tags", "tasks", "stories", "teams", and "users". The transformer functions transform or process data from one or more resources. -The transformer function `projects` process data from the `workspaces` resource. It -fetches and returns a list of projects for a given workspace from Asana. +The transformer function `projects` processes data from the `workspaces` resource. It fetches and returns a list of projects for a given workspace from Asana. ```py @dlt.transformer( @@ -184,18 +181,15 @@ def projects( `workspace`: The data item from the 'workspaces' resource. -`access_token`: Token required to authenticate the Asana API. This token is defined in the -`.dlt/secret.toml` file. +`access_token`: Token required to authenticate the Asana API. This token is defined in the `.dlt/secret.toml` file. -`fields`: A list of workspace fields to be fetched from `asana_dlt/settings.py`. 
For example, -"name", "members", "completed", etc. +`fields`: A list of workspace fields to be fetched from `asana_dlt/settings.py`. For example, "name", "members", "completed", etc. -It uses `@dlt.defer` decorator to enable parallel run in thread pool. +It uses the `@dlt.defer` decorator to enable parallel run in a thread pool. ### Resource-transformer `tasks` -This [incremental](../../general-usage/incremental-loading.md) resource-transformer fetches all -tasks for a given project from Asana. +This [incremental](../../general-usage/incremental-loading.md) resource-transformer fetches all tasks for a given project from Asana. ```py @dlt.transformer(data_from=projects, write_disposition="merge", primary_key="gid") @@ -212,23 +206,19 @@ def tasks( `workspace`: The data item from the 'projects' resource. -`access_token`: Token required to authenticate the Asana API. This token is defined in the -`.dlt/secret.toml` file. +`access_token`: Token required to authenticate the Asana API. This token is defined in the `.dlt/secret.toml` file. `modified_at`: The date from which to fetch modified tasks. -`fields`: A list of workspace fields to be fetched from `asana_dlt/settings.py`. For example, -"name", "assignee", "completed", etc. +`fields`: A list of workspace fields to be fetched from `asana_dlt/settings.py`. For example, "name", "assignee", "completed", etc. ## Customization ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. -To create your data pipeline using single loading for “workspaces” and “projects” endpoints, follow -these steps: +To create your data pipeline using single loading for the "workspaces" and "projects" endpoints, follow these steps: 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -240,10 +230,9 @@ these steps: ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load the data from all the fields, you can utilise the `asana_source` method as follows: +1. To load the data from all the fields, you can utilize the `asana_source` method as follows: ```py load_data = asana_source() @@ -257,8 +246,7 @@ these steps: print(load_info) ``` -1. To use the method `pipeline.run()` to load custom endpoints “workspaces” and “projects”, the - above script may be modified as: +1. 
To use the method `pipeline.run()` to load custom endpoints "workspaces" and "projects", the above script may be modified as: ```py load_info = pipeline.run(load_data.with_resources("workspaces", "projects")) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md index 663dda7259..378eedaf62 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/chess.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/chess.md @@ -16,11 +16,11 @@ Resources that can be loaded using this verified source are: | Name | Description | | ---------------- | ---------------------------------------------------------------------- | -| players_profiles | retrives player profiles for a list of player usernames | -| players_archives | retrives url to game archives for specified players | -| players_games | retrives players games that happened between start_month and end_month | +| players_profiles | retrieves player profiles for a list of player usernames | +| players_archives | retrieves URL to game archives for specified players | +| players_games | retrieves players' games that happened between start_month and end_month | -## Setup Guide +## Setup guide ### Grab credentials @@ -93,7 +93,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `source` This is a `dlt.source` function for the Chess.com API named "chess", which returns a sequence of -DltResource objects. That we'll discuss in subsequent sections as resources. +DltResource objects. We'll discuss these in subsequent sections as resources. ```py dlt.source(name="chess") @@ -129,13 +129,13 @@ def players_profiles(players: List[str]) -> Iterator[TDataItem]: yield _get_profile(username) ``` -`players`: Is a list of player usernames for which you want to fetch profile data. +`players`: This is a list of player usernames for which you want to fetch profile data. -It uses `@dlt.defer` decorator to enable parallel run in thread pool. +It uses the `@dlt.defer` decorator to enable parallel run in a thread pool. ### Resource `players_archives` -This is a `dlt.resource` function, which returns url to game archives for specified players. +This is a `dlt.resource` function, which returns a URL to game archives for specified players. ```py @dlt.resource(write_disposition="replace", selected=False) @@ -143,9 +143,9 @@ def players_archives(players: List[str]) -> Iterator[List[TDataItem]]: ... ``` -`players`: Is a list of player usernames for which you want to fetch archives. +`players`: This is a list of player usernames for which you want to fetch archives. -`selected=False`: Parameter means that this resource is not selected by default when the pipeline +`selected=False`: This parameter means that this resource is not selected by default when the pipeline runs. ### Resource `players_games` @@ -158,28 +158,29 @@ specified otherwise. def players_games( players: List[str], start_month: str = None, end_month: str = None ) -> Iterator[TDataItems]: - # gets a list of already checked(loaded) archives. + # gets a list of already checked (loaded) archives. checked_archives = dlt.current.resource_state().setdefault("archives", []) yield {} # return your retrieved data here ``` -`players`: Is a list of player usernames for which you want to fetch games. +`players`: This is a list of player usernames for which you want to fetch games. -List `checked_archives` is used to load new archives and skip the ones already loaded. 
It uses state +The list `checked_archives` is used to load new archives and skip the ones already loaded. It uses state to initialize a list called "checked_archives" from the current resource [state](../../general-usage/state). ### Resource `players_online_status` -The `players_online_status` is a `dlt.resource` function checks current online status of multiple chess players. It +The `players_online_status` is a `dlt.resource` function that checks the current online status of multiple chess players. It retrieves their username, status, last login date, and check time. ## Customization + + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. To create your data loading pipeline for players and load data, follow these steps: @@ -193,10 +194,9 @@ To create your data loading pipeline for players and load data, follow these ste ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load the data from all the resources for specific players (e.g. for November), you can utilise the `source` method as follows: +1. To load the data from all the resources for specific players (e.g., for November), you can utilize the `source` method as follows: ```py # Loads games for Nov 2022 @@ -215,8 +215,7 @@ To create your data loading pipeline for players and load data, follow these ste print(info) ``` -1. To load data from specific resources like "players_games" and "player_profiles", modify the above - code as: +1. To load data from specific resources like "players_games" and "player_profiles", modify the above code as: ```py info = pipeline.run(data.with_resources("players_games", "players_profiles")) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md index c9b1ee5e34..e559922c6d 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/facebook_ads.md @@ -5,7 +5,7 @@ keywords: [facebook ads api, verified source, facebook ads] --- import Header from './_source-info-header.md'; -# Facebook Ads +# Facebook ads
@@ -14,23 +14,23 @@ Facebook and its affiliated apps like Instagram and Messenger. This Facebook `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads_pipeline.py) -loads data using [Facebook Marketing API](https://developers.facebook.com/products/marketing-api/) to the destination of your choice. +loads data using the [Facebook Marketing API](https://developers.facebook.com/products/marketing-api/) to the destination of your choice. The endpoints that this verified source supports are: | Name | Description | | ----------------- | ------------------------------------------------------------------------------ | -| campaigns | a structured marketing initiative that focuses on a specific objective or goal | -| ad_sets | a subset or group of ads within a campaign | -| ads | individual advertisement that is created and displayed within an ad set | -| creatives | visual and textual elements that make up an advertisement | -| ad_leads | information collected from users who have interacted with lead generation ads | -| facebook_insights | data on audience demographics, post reach, and engagement metrics | +| campaigns | A structured marketing initiative that focuses on a specific objective or goal | +| ad_sets | A subset or group of ads within a campaign | +| ads | An individual advertisement that is created and displayed within an ad set | +| creatives | Visual and textual elements that make up an advertisement | +| ad_leads | Information collected from users who have interacted with lead generation ads | +| facebook_insights | Data on audience demographics, post reach, and engagement metrics | To get a complete list of sub-endpoints that can be loaded, see [facebook_ads/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) -## Setup Guide +## Setup guide ### Grab credentials @@ -38,7 +38,7 @@ To get a complete list of sub-endpoints that can be loaded, see 1. Ensure that you have Ads Manager active for your Facebook account. 1. Find your account ID, which is a long number. You can locate it by clicking on the Account - Overview dropdown in Ads Manager or by checking the link address. For example + Overview dropdown in Ads Manager or by checking the link address. For example, https://adsmanager.facebook.com/adsmanager/manage/accounts?act=10150974068878324. 1. Note this account ID as it will further be used in configuring dlt. @@ -57,31 +57,25 @@ To get a complete list of sub-endpoints that can be loaded, see short-lived access token. 1. Copy the access token and update it in the `.dlt/secrets.toml` file. -#### Exchange _short-lived token_ for a _long-lived token_: +#### Exchange short-lived token for a long-lived token -By default, Facebook access tokens have a short lifespan of one hour. To exchange a short-lived -Facebook access token for a long-lived token, update the `.dlt/secrets.toml` with client_id, and -client_secret and execute the provided Python code. +By default, Facebook access tokens have a short lifespan of one hour. To exchange a short-lived Facebook access token for a long-lived token, update the `.dlt/secrets.toml` with client_id and client_secret, and execute the provided Python code. ```py from facebook_ads import get_long_lived_token print(get_long_lived_token("your short-lived token")) ``` -Replace the `access_token` in the `.dlt/secrets.toml` file with the long-lived token obtained from -the above code snippet. 
+Replace the `access_token` in the `.dlt/secrets.toml` file with the long-lived token obtained from the above code snippet. -To retrieve the expiry date and the associated scopes of the token, you can use the following -command: +To retrieve the expiry date and the associated scopes of the token, you can use the following command: ```py from facebook_ads import debug_access_token debug_access_token() ``` -We highly recommend you to add the token expiration timestamp to get notified a week before token -expiration that you need to rotate it. Right now the notifications are sent to logger with error -level. In `config.toml` / `secrets.toml`: +We highly recommend you add the token expiration timestamp to get notified a week before token expiration that you need to rotate it. Right now, the notifications are sent to the logger with error level. In `config.toml` / `secrets.toml`: ```toml [sources.facebook_ads] @@ -91,7 +85,6 @@ access_token_expires_at=1688821881 > Note: The Facebook UI, which is described here, might change. The full guide is available at [this link.](https://developers.facebook.com/docs/marketing-apis/overview/authentication) - ### Initialize the verified source To get started with your data pipeline, follow these steps: @@ -102,24 +95,17 @@ To get started with your data pipeline, follow these steps: dlt init facebook_ads duckdb ``` - [This command](../../reference/command-line-interface) will initialize - [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads_pipeline.py) - with Facebook Ads as the [source](../../general-usage/source) and - [duckdb](../destinations/duckdb.md) as the [destination](../destinations). + [This command](../../reference/command-line-interface) will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads_pipeline.py) with Facebook Ads as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your - preferred [destination](../destinations). +1. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and - configuration settings to get started. +1. After running this command, a new directory will be created with the necessary files and configuration settings to get started. For more information, read the guide on [how to add a verified source](../../walkthroughs/add-a-verified-source). ### Add credential -1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can - securely store your access tokens and other sensitive information. It's important to handle this - file with care and keep it safe. Here's what the file looks like: +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. Here's what the file looks like: ```toml # put your secret values and credentials here @@ -128,15 +114,11 @@ For more information, read the guide on [how to add a verified source](../../wal access_token="set me up!" ``` -1. 
Replace the access_token value with the [previously copied one](facebook_ads.md#grab-credentials) - to ensure secure access to your Facebook Ads resources. +1. Replace the access_token value with the [previously copied one](facebook_ads.md#grab-credentials) to ensure secure access to your Facebook Ads resources. -1. Next, Follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to - add credentials for your chosen destination, ensuring proper routing of your data to the final - destination. +1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. -1. It is strongly recommended to add the token expiration timestamp to your `config.toml` or - `secrets.toml` file. +1. It is strongly recommended to add the token expiration timestamp to your `config.toml` or `secrets.toml` file. 1. Next, store your pipeline configuration details in the `.dlt/config.toml`. @@ -158,16 +140,16 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```sh python facebook_ads_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `facebook_ads`, you may also + For example, the `pipeline_name` for the above pipeline example is `facebook_ads`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -181,12 +163,12 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug You can write your own pipelines to load data to a destination using this verified source. However, it is important to note the complete list of the default endpoints given in -[facebook_ads/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads_dlt/settings.py) +[facebook_ads/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) ### Source `facebook_ads_source` This function returns a list of resources to load campaigns, ad sets, ads, creatives, and ad leads -data from Facebook Marketing API. +data from the Facebook Marketing API. ```py @dlt.source(name="facebook_ads") @@ -200,18 +182,18 @@ def facebook_ads_source( ... ``` -`account_id`: Account id associated with add manager, configured in "config.toml". +`account_id`: Account ID associated with the ad manager, configured in "config.toml". `access_token`: Access token associated with the Business Facebook App, configured in "secrets.toml". -`chunk_size`: A size of the page and batch request. You may need to decrease it if you request a lot +`chunk_size`: The size of the page and batch request. You may need to decrease it if you request a lot of fields. Defaults to 50. `request_timeout`: Connection timeout. Defaults to 300.0. -`app_api_version`: A version of the facebook api required by the app for which the access tokens -were issued i.e. 'v17.0'. Defaults to the _facebook_business_ library default version. 
+`app_api_version`: A version of the Facebook API required by the app for which the access tokens +were issued, e.g., 'v17.0'. Defaults to the _facebook_business_ library default version. ### Resource `ads` @@ -228,7 +210,7 @@ def ads( yield get_data_chunked(account.get_ads, fields, states, chunk_size) ``` -`fields`: Retrives fields for each ad. For example, “id”, “name”, “adset_id” etc. +`fields`: Retrieves fields for each ad. For example, “id”, “name”, “adset_id”, etc. `states`: The possible states include "Active," "Paused," "Pending Review," "Disapproved," "Completed," and "Archived." @@ -240,15 +222,15 @@ Similar to resource `ads`, the following resources have been defined in the `__i | Resource | Description | | ------------ | -------------------------------------------------------------------- | -| campaigns | fetches all `DEFAULT_CAMPAIGN_FIELDS` | -| ad_sets | fetches all `DEFAULT_ADSET_FIELDS` | -| leads | fetches all `DEFAULT_LEAD_FIELDS`, uses `@dlt.transformer` decorator | -| ad_creatives | fetches all `DEFAULT_ADCREATIVE_FIELDS` | +| campaigns | Fetches all `DEFAULT_CAMPAIGN_FIELDS` | +| ad_sets | Fetches all `DEFAULT_ADSET_FIELDS` | +| leads | Fetches all `DEFAULT_LEAD_FIELDS`, uses `@dlt.transformer` decorator | +| ad_creatives | Fetches all `DEFAULT_ADCREATIVE_FIELDS` | The default fields are defined in [facebook_ads/settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) -### Source `facebook_insights_source`: +### Source `facebook_insights_source` This function returns a list of resources to load facebook_insights. @@ -272,47 +254,35 @@ def facebook_insights_source( ... ``` -`account_id`: Account id associated with ads manager, configured in _config.toml_. +`account_id`: Account ID associated with ads manager, configured in _config.toml_. -`access_token`: Access token associated with the Business Facebook App, configured in -_secrets.toml_. +`access_token`: Access token associated with the Business Facebook App, configured in _secrets.toml_. -`initial_load_past_days`: How many past days (starting from today) to initially load. Defaults to -30\. +`initial_load_past_days`: How many past days (starting from today) to initially load. Defaults to 30. -`fields`: A list of fields to include in each report. Note that the “breakdowns” option adds fields -automatically. Defaults to DEFAULT_INSIGHT_FIELDS. +`fields`: A list of fields to include in each report. Note that the “breakdowns” option adds fields automatically. Defaults to DEFAULT_INSIGHT_FIELDS. -`attribution_window_days_lag`: Attribution window in days. The reports in the attribution window are -refreshed on each run. Defaults to 7. +`attribution_window_days_lag`: Attribution window in days. The reports in the attribution window are refreshed on each run. Defaults to 7. -`time_increment_days`: The report aggregation window in days. use 7 for weekly aggregation. Defaults -to 1. +`time_increment_days`: The report aggregation window in days. Use 7 for weekly aggregation. Defaults to 1. -`breakdowns`: A presents with common aggregations. See -[settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) -for details. Defaults to "ads_insights_age_and_gender". +`breakdowns`: Presents with common aggregations. See [settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) for details. Defaults to "ads_insights_age_and_gender". -`action_breakdowns`: Action aggregation types. 
See -[settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) -for details. Defaults to ALL_ACTION_BREAKDOWNS. +`action_breakdowns`: Action aggregation types. See [settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/facebook_ads/settings.py) for details. Defaults to ALL_ACTION_BREAKDOWNS. `level`: The granularity level. Defaults to "ad". -`action_attribution_windows`: Attribution windows for actions. Defaults to -ALL_ACTION_ATTRIBUTION_WINDOWS. +`action_attribution_windows`: Attribution windows for actions. Defaults to ALL_ACTION_ATTRIBUTION_WINDOWS. `batch_size`: Page size when reading data from a particular report. Defaults to 50. `request_timeout`: Connection timeout. Defaults to 300. -`app_api_version`: A version of the Facebook API required by the app for which the access tokens -were issued i.e. 'v17.0'. Defaults to the facebook_business library default version. +`app_api_version`: A version of the Facebook API required by the app for which the access tokens were issued, e.g., 'v17.0'. Defaults to the facebook_business library default version. ### Resource `facebook_insights` -This function fetches Facebook insights data incrementally from a specified start date until the -current date, in day steps. +This function fetches Facebook insights data incrementally from a specified start date until the current date, in day steps. ```py @dlt.resource(primary_key=INSIGHTS_PRIMARY_KEY, write_disposition="merge") @@ -324,16 +294,13 @@ def facebook_insights( ... ``` -`date_start`: Parameter sets the initial value for the "date_start" parameter in -dlt.sources.incremental. It is based on the last pipeline run or defaults to today's date minus the -specified number of days in the "initial_load_past_days" parameter. +`date_start`: Parameter sets the initial value for the "date_start" parameter in dlt.sources.incremental. It is based on the last pipeline run or defaults to today's date minus the specified number of days in the "initial_load_past_days" parameter. ## Customization ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -345,10 +312,9 @@ verified source. ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load all the data from, campaigns, ad sets, ads, ad creatives and leads. +1. To load all the data from campaigns, ad sets, ads, ad creatives, and leads: ```py load_data = facebook_ads_source() @@ -356,15 +322,14 @@ verified source. print(load_info) ``` -1. To merge the Facebook Ads with the state “DISAPPROVED” and with ads state “PAUSED” you can do the - following: +1. To merge the Facebook Ads with the state "DISAPPROVED" and with ads state "PAUSED", you can do the following: ```py load_data = facebook_ads_source() - # It is recommended to enable root key propagation on a source that is not a merge one by default. this is not required if you always use merge but below we start with replace + # It is recommended to enable root key propagation on a source that is not a merge one by default. 
This is not required if you always use merge but below we start with replace load_data.root_key = True - # load only disapproved ads + # Load only disapproved ads load_data.ads.bind(states=("DISAPPROVED",)) load_info = pipeline.run(load_data.with_resources("ads"), write_disposition="replace") print(load_info) @@ -376,24 +341,19 @@ verified source. print(load_info) ``` - In the above steps, we first load the “ads” data with the “DISAPPROVED” state in _replace_ mode - and then merge the ads data with the “PAUSED” state on that. + In the above steps, we first load the "ads" data with the "DISAPPROVED" state in _replace_ mode and then merge the ads data with the "PAUSED" state on that. -1. To load data with a custom field, for example, to load only “id” from Facebook ads, you can do - the following: +1. To load data with a custom field, for example, to load only "id" from Facebook ads, you can do the following: ```py load_data = facebook_ads_source() - # Only loads add ids, works the same for campaigns, leads etc. + # Only loads ad ids, works the same for campaigns, leads, etc. load_data.ads.bind(fields=("id",)) load_info = pipeline.run(load_data.with_resources("ads")) print(load_info) ``` -1. This pipeline includes an enrichment transformation called `enrich_ad_objects` that you can apply - to any resource to obtain additional data per object using `object.get_api`. The following code - demonstrates how to enrich objects by adding an enrichment transformation that includes - additional fields. +1. This pipeline includes an enrichment transformation called `enrich_ad_objects` that you can apply to any resource to obtain additional data per object using `object.get_api`. The following code demonstrates how to enrich objects by adding an enrichment transformation that includes additional fields. ```py # You can reduce the chunk size for smaller requests @@ -414,34 +374,25 @@ verified source. print(load_info) ``` - In the above code, the "load_data" object represents the Facebook Ads source, and we specify the - desired chunk size for the requests. We then bind the "id" field for the "ad_creatives" resource - using the "bind()" method. + In the above code, the "load_data" object represents the Facebook Ads source, and we specify the desired chunk size for the requests. We then bind the "id" field for the "ad_creatives" resource using the "bind()" method. - To enrich the ad_creatives objects, we add a transformation using the "add_step()" method. The - "enrich_ad_objects" function is used to specify the AdCreative object type and request the fields - defined in _DEFAULT_ADCREATIVE_FIELDS_. + To enrich the ad_creatives objects, we add a transformation using the "add_step()" method. The "enrich_ad_objects" function is used to specify the AdCreative object type and request the fields defined in _DEFAULT_ADCREATIVE_FIELDS_. - Finally, we run the pipeline with the ad_creatives resource and store the load information in the - `load_info`. + Finally, we run the pipeline with the ad_creatives resource and store the load information in the `load_info`. -1. You can also load insights reports incrementally with defined granularity levels, fields, - breakdowns, etc. As defined in the `facebook_insights_source`. This function generates daily - reports for a specified number of past days. +1. You can also load insights reports incrementally with defined granularity levels, fields, breakdowns, etc., as defined in the `facebook_insights_source`. 
This function generates daily reports for a specified number of past days. ```py load_data = facebook_insights_source( initial_load_past_days=30, - attribution_window_days_lag= 7, + attribution_window_days_lag=7, time_increment_days=1 ) load_info = pipeline.run(load_data) print(load_info) ``` -> By default, daily reports are generated from `initial_load_past_days` ago to today. On subsequent -> runs, only new reports are loaded, with the past `attribution_window_days_lag` days (default is 7) -> being refreshed to accommodate any changes. You can adjust `time_increment_days` to change report -> frequency (default set to one). +> By default, daily reports are generated from `initial_load_past_days` ago to today. On subsequent runs, only new reports are loaded, with the past `attribution_window_days_lag` days (default is 7) being refreshed to accommodate any changes. You can adjust `time_increment_days` to change report frequency (default set to one). + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md index 8990af83cb..63c26de670 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/freshdesk.md @@ -23,14 +23,14 @@ Resources that can be loaded using this verified source are: | S.No. | Name | Description | | ----- | --------- | ----------------------------------------------------------------------------------------- | -| 1. | agents | Users responsible for managing and resolving customer inquiries and support tickets. | -| 2. | companies | Customer organizations or groups that agents support. | -| 3. | contacts | Individuals or customers who reach out for support. | -| 4. | groups | Agents organized based on specific criteria. | -| 5. | roles | Predefined sets of permissions that determine what actions an agent can perform. | -| 6. | tickets | Customer inquiries or issues submitted via various channels like email, chat, phone, etc. | +| 1. | agents | Users responsible for managing and resolving customer inquiries and support tickets. | +| 2. | companies | Customer organizations or groups that agents support. | +| 3. | contacts | Individuals or customers who reach out for support. | +| 4. | groups | Agents organized based on specific criteria. | +| 5. | roles | Predefined sets of permissions that determine what actions an agent can perform. | +| 6. | tickets | Customer inquiries or issues submitted via various channels like email, chat, phone, etc. | -## Setup Guide +## Setup guide ### Grab credentials @@ -76,8 +76,8 @@ For more information, read the guide on [how to add a verified source](../../wal # Put your secret values and credentials here # Github access token (must be classic for reactions source) [sources.freshdesk] - domain = "please set me up!" # Enter the freshdesk domain here - api_secret_key = "please set me up!" # Enter the freshdesk API key here + domain = "please set me up!" # Enter the Freshdesk domain here + api_secret_key = "please set me up!" # Enter the Freshdesk API key here ``` 1. In the `domain`, enter the domain of your Freshdesk account. @@ -90,17 +90,17 @@ For more information, read the guide on [how to add a verified source](../../wal ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```sh python freshdesk_pipeline.py ``` -1. 
Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` For example, the `pipeline_name` for the above pipeline example is - `freshdesk_pipeline`, you may also use any custom name instead. + `freshdesk_pipeline`. You may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). @@ -111,7 +111,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `freshdesk_source` -This function retrives the data from specified Freshdesk API endpoints. +This function retrieves the data from specified Freshdesk API endpoints. ```py @dlt.source() @@ -160,7 +160,7 @@ def freshdesk_source( `write_disposition`: Specifies the write disposition to load data. -`primary_key`: Specifies "id" as primary key of the resource. +`primary_key`: Specifies "id" as the primary key of the resource. ## Customization ### Create your own pipeline @@ -180,7 +180,7 @@ verified source. To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load data from all the endpoints, specified in ["settings.py".](https://github.com/dlt-hub/verified-sources/blob/master/sources/freshdesk/settings.py) +2. To load data from all the endpoints, specified in ["settings.py".](https://github.com/dlt-hub/verified-sources/blob/master/sources/freshdesk/settings.py) ```py load_data = freshdesk_source() # Run the pipeline @@ -189,7 +189,7 @@ verified source. print(load_info) ``` -1. To load the data from "agents", "contacts", and "tickets": +3. To load the data from "agents", "contacts", and "tickets": ```py load_data = freshdesk_source().with_resources("agents", "contacts", "tickets") # Run the pipeline diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md index ae6df133ef..5e8b247ffd 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_ads.md @@ -5,20 +5,15 @@ keywords: [google ads api, google ads verified source, google ads] --- import Header from './_source-info-header.md'; -# Google Ads +# Google ads -[Google Ads](https://ads.google.com/home/) is a digital advertising service by Google that allows advertisers -to display ads across Google's search results, websites, and other platforms. +[Google Ads](https://ads.google.com/home/) is a digital advertising service by Google that allows advertisers to display ads across Google's search results, websites, and other platforms. :::warning Alert! -Please note that we are unable to conduct regular testing on the specified source due to difficulties -in obtaining the necessary credentials. We confirmed this source works at creation, and it is being used by the community. -We anticipate that the source should operate smoothly over time given Google's best pratices in versioning apis. +Please note that we are unable to conduct regular testing on the specified source due to difficulties in obtaining the necessary credentials. We confirmed this source works at creation, and it is being used by the community. We anticipate that the source should operate smoothly over time given Google's best practices in versioning APIs. 
::: -This Google Ads `dlt` verified source and -[pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_ads_pipeline.py) -loads data using the "Google Ads API" to the destination of your choice. +This Google Ads `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/google_ads_pipeline.py) loads data using the "Google Ads API" to the destination of your choice. Resources that can be loaded using this verified source are: @@ -29,42 +24,33 @@ Resources that can be loaded using this verified source are: | change_events | Modifications made to an account's ads, campaigns, and related settings | | customer_clients | Accounts that are managed by a given account | -## Setup Guide +## Setup guide ### Grab credentials -To access Google Ads verified sources, you'll need a developer token. For instructions on obtaining -one, you can search online or ask GPT. +To access Google Ads verified sources, you'll need a developer token. For instructions on obtaining one, you can search online or ask GPT. Next, there are two methods to get authenticated for using this verified source: - OAuth credentials - Service account credentials -Let's go over how to set up both OAuth tokens and service account credentials. In general, OAuth -tokens are preferred when user consent is required, while service account credentials are better -suited for server-to-server interactions. You can choose the method of authentication as per your -requirement. +Let's go over how to set up both OAuth tokens and service account credentials. In general, OAuth tokens are preferred when user consent is required, while service account credentials are better suited for server-to-server interactions. You can choose the method of authentication as per your requirement. ### Grab Google service account credentials -You need to create a GCP service account to get API credentials if you don't have one. To create -one, follow these steps: +You need to create a GCP service account to get API credentials if you don't have one. To create one, follow these steps: 1. Sign in to [console.cloud.google.com](http://console.cloud.google.com/). -1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#creating) if - needed. +1. [Create a service account](https://cloud.google.com/iam/docs/service-accounts-create#creating) if needed. -1. Enable the "Google Ads API". Refer to the - [Google documentation](https://support.google.com/googleapi/answer/6158841?hl=en) for - comprehensive instructions on this process. +1. Enable the "Google Ads API". Refer to the [Google documentation](https://support.google.com/googleapi/answer/6158841?hl=en) for comprehensive instructions on this process. 1. Generate credentials: 1. Navigate to IAM & Admin in the console's left panel, and then select Service Accounts. - 1. Identify the service account you intend to use, and click on the three-dot menu under the - "Actions" column next to it. + 1. Identify the service account you intend to use, and click on the three-dot menu under the "Actions" column next to it. 1. Create a new JSON key by selecting "Manage Keys" > "ADD KEY" > "CREATE". 1. You can download the ".json" file containing the necessary credentials for future use. @@ -110,7 +96,7 @@ python google_ads/setup_script_gcp_oauth.py Once you have executed the script and completed the authentication, you will receive a "refresh token" that can be used to set up the "secrets.toml". 
-### Share the Google Ads Account with the API: +### Share the Google Ads account with the API: :::note For service account authentication, use the client_email. For OAuth authentication, use the @@ -162,8 +148,8 @@ For more information, read the guide on [how to add a verified source](../../wal ### Add credentials 1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive - information securely, like access tokens. Keep this file safe. In this file setup the "developer - token", "customer ID" and "impersonated_email" as follows: + information securely, like access tokens. Keep this file safe. In this file, set up the "developer + token", "customer ID", and "impersonated_email" as follows: ```toml [sources.google_ads] dev_token = "please set me up!" @@ -275,11 +261,11 @@ def customers( """ ``` -`client`: refers to a Google API Resource object used to interact with Google services. +`client`: Refers to a Google API Resource object used to interact with Google services. -`customer_id`: Individual identifier for google ads account. +`customer_id`: Individual identifier for a Google Ads account. -Similarly, there are resource functions called `campaigns`, `change_events` and `customer_clients` that populate +Similarly, there are resource functions called `campaigns`, `change_events`, and `customer_clients` that populate respective dimensions. ## Customization @@ -318,3 +304,4 @@ verified source. ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md index 83077270c7..02c651a603 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/hubspot.md @@ -10,18 +10,18 @@ import Header from './_source-info-header.md';
HubSpot is a customer relationship management (CRM) software and inbound marketing platform that -helps businesses to attract visitors, engage customers, and close leads. +helps businesses attract visitors, engage customers, and close leads. -This Hubspot `dlt` verified source and +This HubSpot `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot_pipeline.py) -loads data using “Hubspot API” to the destination of your choice. +loads data using the “HubSpot API” to the destination of your choice. | Name | Description | | -------------------------- | ---------------------------------------------------------------------- | | contacts | visitors, potential customers, leads | | companies | information about organizations | | deals | deal records, deal tracking | -| tickets | request for help from customers or users | +| tickets | requests for help from customers or users | | products | pricing information of a product | | quotes | price proposals that salespeople can create and send to their contacts | | hubspot_events_for_objects | web analytics events for a given object type and object ids | @@ -29,12 +29,12 @@ loads data using “Hubspot API” to the destination of your choice. To get details about endpoints that can be loaded, see [hubspot/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot/settings.py) -## Setup Guide +## Setup guide ### Grab credentials > Note: As of November 30, 2022, HubSpot API Keys are being deprecated and are no longer supported. -Instead, we recommend to authenticate using a private app access token or OAuth access token. +Instead, we recommend authenticating using a private app access token or OAuth access token. Create a private app and get an authentication token before running the [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot_pipeline.py). @@ -61,7 +61,7 @@ Follow these steps: 1. Click "Show token" and store it for ".dlt/secrets.toml". -> Note: The Hubspot UI, which is described here, might change. +> Note: The HubSpot UI, which is described here, might change. The full guide is available at [this link.](https://knowledge.hubspot.com/integrations/how-do-i-get-my-hubspot-api-key) @@ -77,7 +77,7 @@ To get started with your data pipeline, follow these steps: [This command](../../reference/command-line-interface) will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot_pipeline.py) - with Hubspot as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) + with HubSpot as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). 1. If you'd like to use a different destination, simply replace `duckdb` with the name of your @@ -90,19 +90,16 @@ For more information, read the guide on [how to add a verified source](../../wal ### Add credentials -1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can - securely store your access tokens and other sensitive information. It's important to handle this - file with care and keep it safe. Here's what the file looks like: +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. 
Here's what the file looks like: ```toml # put your secret values and credentials here - # do not share this file and do not push it to github + # do not share this file and do not push it to GitHub [sources.hubspot] api_key = "api_key" # please set me up! ``` -1. Replace the access_token value with the [previously copied one](hubspot.md#grab-credentials) to - ensure secure access to your Hubspot resources. +1. Replace the access_token value with the [previously copied one](hubspot.md#grab-credentials) to ensure secure access to your Hubspot resources. 1. Enter credentials for your chosen destination as per the [docs](../destinations/). @@ -110,8 +107,7 @@ For more information, read the [General Usage: Credentials.](../../general-usage ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt ``` @@ -119,31 +115,25 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh python hubspot_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may - also use any custom name instead. + For example, the `pipeline_name` for the above pipeline example is `hubspot_pipeline`, you may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Default endpoints -You can write your own pipelines to load data to a destination using this verified source. However, -it is important to note the complete list of the default endpoints given in -[hubspot/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot/settings.py) +You can write your own pipelines to load data to a destination using this verified source. However, it is important to note the complete list of the default endpoints given in [hubspot/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot/settings.py) ### Source `hubspot` -This function returns a list of resources to load companies, contacts, deals, tickets, products, and -web analytics events data into the destination. +This function returns a list of resources to load companies, contacts, deals, tickets, products, and web analytics events data into the destination. ```py @dlt.source(name="hubspot") @@ -156,13 +146,11 @@ def hubspot( `api_key`: The key used to authenticate with the HubSpot API. Configured in "secrets.toml". -`include_history`: This parameter, when set to "True", loads the history of property changes for the -specified entities. +`include_history`: This parameter, when set to "True", loads the history of property changes for the specified entities. 
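Before going through the individual resources below, here is a minimal usage sketch of this source. It assumes the verified source was scaffolded with `dlt init hubspot duckdb` and that `api_key` is already configured in `.dlt/secrets.toml`:

```py
import dlt

from hubspot import hubspot  # module created by `dlt init hubspot duckdb`

pipeline = dlt.pipeline(
    pipeline_name="hubspot_pipeline",
    destination="duckdb",
    dataset_name="hubspot_data",
)

# api_key is read from .dlt/secrets.toml; include_history=True additionally loads
# property change history as separate tables
load_info = pipeline.run(hubspot(include_history=True))
print(load_info)
```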
### Resource `companies` -This resource function fetches data from the "companies" endpoint and loads it to -the destination, replacing any existing data. +This resource function fetches data from the "companies" endpoint and loads it to the destination, replacing any existing data. ```py @dlt.resource(name="companies", write_disposition="replace") @@ -182,16 +170,11 @@ def companies( ) ``` -This resource function takes the same arguments, `api_key` and `include_history` as the "husbpot" -source described [above](hubspot.md#source-hubspot), but also supports two additional. -`include_custom_props` - indicates if all the properties of CRM objects, except Hubspot driven -(prefixed with `hs_`), are to be extracted. `props` - the list of properties to extract -in addition to the custom properties. Similar to this, resource functions "contacts", -"deals", "tickets", "products", and "quotes" retrieve data from the Hubspot API. +This resource function takes the same arguments, `api_key` and `include_history` as the "hubspot" source described [above](hubspot.md#source-hubspot), but also supports two additional parameters. `include_custom_props` - indicates if all the properties of CRM objects, except Hubspot driven (prefixed with `hs_`), are to be extracted. `props` - the list of properties to extract in addition to the custom properties. Similar to this, resource functions "contacts", "deals", "tickets", "products", and "quotes" retrieve data from the Hubspot API. ### Resource `hubspot_events_for_objects` -This function loads web analytics events for specific objects from Hubspot API into the destination. +This function loads web analytics events for specific objects from the Hubspot API into the destination. ```py @dlt.resource @@ -207,11 +190,11 @@ def hubspot_events_for_objects( `object_type`: One of the Hubspot object types as defined in [hubspot/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot/settings.py). -`object_ids`: List of object ids to track events. +`object_ids`: List of object IDs to track events. `api_key`: The key used to authenticate with the HubSpot API. Configured in "secrets.toml". -`start_date`: The initial date time from which start getting events, default to "01-01-2000", +`start_date`: The initial date time from which to start getting events, default to "01-01-2000", configured in [hubspot/settings.py.](https://github.com/dlt-hub/verified-sources/blob/master/sources/hubspot/settings.py). @@ -243,14 +226,14 @@ verified source. print(load_info) ``` -1. To load data from contacts and companies, with time history using "with_resources" method. +1. To load data from contacts and companies, with time history using the "with_resources" method. ```py load_data = hubspot(include_history=True).with_resources("companies","contacts") load_info = pipeline.run(load_data) print(load_info) ``` - 1. `include_history` loads property change history and entities as separate tables. By default set as False. + 1. `include_history` loads property change history and entities as separate tables. By default, it is set as False. 1. By default, all the custom properties of a CRM object are extracted. If you want only particular fields, set the flag `include_custom_props=False` and add a list of properties with the `props` arg. @@ -261,7 +244,7 @@ verified source. load_info = pipeline.run(load_data.with_resources("contacts")) ``` -1. If you want to read all the custom properties of CRM objects and some additional (e.g. Hubspot driven) properties. +1. 
If you want to read all the custom properties of CRM objects and some additional (e.g., Hubspot driven) properties. ```py load_data = hubspot() @@ -274,7 +257,7 @@ verified source. ```py resource = hubspot_events_for_objects("company", ["7086461639", "7086464459"]) - # Here, object type : company, and object ids : 7086461639 and 7086464459 + # Here, object type: company, and object IDs: 7086461639 and 7086464459 load_info = pipeline.run([resource]) print(load_info) ``` @@ -286,7 +269,7 @@ verified source. ### Additional info If you encounter the following error while processing your request: :::warning ERROR -Your request to HubSpot is too long to process. Maximum allowed query length is 2000 symbols, ... while your list is +Your request to HubSpot is too long to process. The maximum allowed query length is 2000 symbols, while your list is 2125 symbols long. ::: @@ -305,3 +288,4 @@ info = p.run(hubspot(include_custom_props=False)) Or, if you wish to include them, you can modify `settings.py`. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/index.md index a3d2ba00a7..d37b4393d4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/index.md @@ -20,7 +20,7 @@ item => item.label === '30+ SQL Databases' || item.label === 'REST APIs' || item Choose from our collection of verified sources, developed and maintained by the `dlt` team and community. Each source is rigorously tested against a real API and provided as Python code for easy customization. :::tip -If you couldn't find a source implementation, you can easily create your own, check out the [resource page](../../general-usage/resource) to learn how! +If you couldn't find a source implementation, you can easily create your own. Check out the [resource page](../../general-usage/resource) to learn how! ::: item.label !== '30+ SQL Databases' && item.label !== 'REST APIs' && item The main difference between the [core sources](#core-sources) and [verified sources](#verified-sources) lies in their structure. Core sources are generic collections, meaning they can connect to a variety of systems. For example, the [SQL Database source](sql_database) can connect to any -database which supports SQLAlchemy. +database that supports SQLAlchemy. According to our telemetry, core sources are the most widely used among our users! @@ -45,4 +45,5 @@ your working directory. * Source missing? [Request a new verified source.](https://github.com/dlt-hub/verified-sources/issues/new?template=source-request.md) * Missing endpoint or a feature? [Request or contribute](https://github.com/dlt-hub/verified-sources/issues/new?template=extend-a-source.md) -* [Join our Slack community](https://dlthub.com/community) and ask in the technical-help channel. \ No newline at end of file +* [Join our Slack community](https://dlthub.com/community) and ask in the technical-help channel. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index 0a6ba8c632..9225797773 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -14,16 +14,16 @@ documents. This MongoDB `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/mongodb_pipeline.py) -loads data using "MongoDB" source to the destination of your choice. 
+loads data using the "MongoDB" source to the destination of your choice. Sources and resources that can be loaded using this verified source are: | Name | Description | |--------------------|--------------------------------------------| -| mongodb | loads a specific MongoDB database | -| mongodb_collection | loads a collection from a MongoDB database | +| mongodb | Loads a specific MongoDB database | +| mongodb_collection | Loads a collection from a MongoDB database | -## Setup Guide +## Setup guide ### Grab credentials @@ -42,22 +42,22 @@ Here are the typical ways to configure MongoDB and their connection URLs: | Name | Description | Connection URL Example | |---------------------|---------------------------------------------------------------------------------------|---------------------------------------------------| -| Local Installation | Install on Windows, macOS, Linux using official packages. | "mongodb://dbuser:passwd@host.or.ip:27017" | +| Local installation | Install on Windows, macOS, Linux using official packages. | "mongodb://dbuser:passwd@host.or.ip:27017" | | Docker | Deploy using the MongoDB Docker image. | "mongodb://dbuser:passwd@docker.host:27017" | | MongoDB Atlas | MongoDB’s managed service on AWS, Azure, and Google Cloud. | "mongodb+srv://dbuser:passwd@cluster.mongodb.net" | -| Managed Cloud | AWS DocumentDB, Azure Cosmos DB, and others offer MongoDB as a managed database. | "mongodb://dbuser:passwd@managed.cloud:27017" | -| Configuration Tools | Use Ansible, Chef, or Puppet for automation of setup and configuration. | "mongodb://dbuser:passwd@config.tool:27017" | -| Replica Set | Set up for high availability with data replication across multiple MongoDB instances. | "mongodb://dbuser:passwd@replica.set:27017" | -| Sharded Cluster | Scalable distribution of datasets across multiple MongoDB instances. | "mongodb://dbuser:passwd@shard.cluster:27017" | +| Managed cloud | AWS DocumentDB, Azure Cosmos DB, and others offer MongoDB as a managed database. | "mongodb://dbuser:passwd@managed.cloud:27017" | +| Configuration tools | Use Ansible, Chef, or Puppet for automation of setup and configuration. | "mongodb://dbuser:passwd@config.tool:27017" | +| Replica set | Set up for high availability with data replication across multiple MongoDB instances. | "mongodb://dbuser:passwd@replica.set:27017" | +| Sharded cluster | Scalable distribution of datasets across multiple MongoDB instances. | "mongodb://dbuser:passwd@shard.cluster:27017" | | Kubernetes | Deploy on Kubernetes using Helm charts or operators. | "mongodb://dbuser:passwd@k8s.cluster:27017" | -| Manual Tarball | Install directly from the official MongoDB tarball, typically on Linux. | "mongodb://dbuser:passwd@tarball.host:27017" | +| Manual tarball | Install directly from the official MongoDB tarball, typically on Linux. | "mongodb://dbuser:passwd@tarball.host:27017" | > Note: The provided URLs are example formats; adjust as needed for your specific setup. #### Grab `database and collections` -1. To grab "database and collections" you must have MongoDB shell installed. For installation - guidance, refer to [documentation here.](https://www.mongodb.com/docs/mongodb-shell/install/) +1. To grab "database and collections," you must have the MongoDB shell installed. For installation + guidance, refer to [the documentation here.](https://www.mongodb.com/docs/mongodb-shell/install/) 1. Modify the example URLs with your credentials (dbuser & passwd) and host details. 
@@ -67,19 +67,19 @@ Here are the typical ways to configure MongoDB and their connection URLs: mongo "mongodb://dbuser:passwd@your_host:27017" ``` -1. List all Databases: +1. List all databases: ```sh show dbs ``` -1. View Collections in a Database: +1. View collections in a database: - 1. Switch to Database: + 1. Switch to the database: ```sh use your_database_name ``` - 1. Display its Collections: + 1. Display its collections: ```sh show collections ``` @@ -90,7 +90,7 @@ Here are the typical ways to configure MongoDB and their connection URLs: exit ``` ->Note the database and collection names for future source configuration. +> Note the database and collection names for future source configuration. ### Prepare your data @@ -239,10 +239,10 @@ def mongodb_collection( ## Customization + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -262,7 +262,7 @@ verified source. print(load_info) ``` -1. To load a specific collections from the database: +1. To load specific collections from the database: ```py load_data = mongodb().with_resources("collection_1", "collection_2") @@ -274,12 +274,12 @@ verified source. ```py load_data = mongodb(incremental=dlt.sources.incremental("date")).with_resources("collection_1") - load_info = pipeline.run(load_data, write_disposition = "merge") + load_info = pipeline.run(load_data, write_disposition="merge") print(load_info) ``` - > Data is loaded incrementally based on "date" field. + > Data is loaded incrementally based on the "date" field. -1. To load data from a particular collection say "movies" incrementally: +1. To load data from a particular collection, say "movies," incrementally: ```py load_data = mongodb_collection( @@ -293,7 +293,7 @@ verified source. ``` > The source function "mongodb_collection" loads data from a particular single - > collection, where as source "mongodb" can load data from multiple collections. + > collection, whereas the source "mongodb" can load data from multiple collections. > This script configures incremental loading from the "movies" collection based on the > "lastupdated" field, starting from midnight on September 10, 2020. @@ -311,7 +311,7 @@ verified source. ``` - > It applies hint for incremental loading based on the "last_scraped" field, ideal for tables + > It applies a hint for incremental loading based on the "last_scraped" field, ideal for tables > with additions but no updates. 1. To load a selected collection and rename it in the destination: @@ -330,15 +330,16 @@ verified source. 1. 
To load a selected collection, using Apache Arrow for data conversion: ```py - # Load collection "movies", using Apache Arrow for converion + # Load collection "movies", using Apache Arrow for conversion movies = mongodb_collection( collection="movies", data_item_format="arrow", ) # Run the pipeline - info = pipeline.run(source) + info = pipeline.run(movies) print(info) ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md index 37368110e4..2ae14de2dc 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mux.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mux.md @@ -14,7 +14,7 @@ import Header from './_source-info-header.md'; This Mux `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/mux_pipeline.py) -loads data using “Mux API” to the destination of your choice. +loads data using the “Mux API” to the destination of your choice. | Name | Description | @@ -24,7 +24,7 @@ loads data using “Mux API” to the destination of your choice. > Note: The source `mux_source` loads all video assets, but each video view is for yesterday only! -## Setup Guide +## Setup guide ### Grab credentials @@ -74,7 +74,7 @@ For more information, read the guide on [how to add a verified source.](../../wa Here's what the file looks like: ```toml - # Put your secret values and credentials here. Do not share this file and do not push it to github + # Put your secret values and credentials here. Do not share this file and do not push it to GitHub [sources.mux] mux_api_access_token = "please set me up" # Mux API access token mux_api_secret_key = "please set me up!" # Mux API secret key @@ -94,11 +94,11 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh pip install -r requirements.txt ``` -1. You're now ready to run the pipeline! To get started, run the following command: +2. You're now ready to run the pipeline! To get started, run the following command: ```sh python mux_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show @@ -159,7 +159,7 @@ def views_resource( ... ``` -The arguments `mux_api_access_token`, `mux_api_secret_key` and `limit` are the same as described [above](#resource-assets_resource) in "asset_resource". +The arguments `mux_api_access_token`, `mux_api_secret_key`, and `limit` are the same as described [above](#resource-assets_resource) in "asset_resource". ## Customization @@ -178,21 +178,21 @@ verified source. ) ``` -1. To load metadata about every asset to be loaded: +2. To load metadata about every asset to be loaded: ```py load_info = pipeline.run(mux_source().with_resources("assets_resource")) print(load_info) ``` -1. To load data for each video view from yesterday: +3. To load data for each video view from yesterday: ```py load_info = pipeline.run(mux_source().with_resources("views_resource")) print(load_info) ``` -1. To load both metadata about assets and video views from yesterday: +4. To load both metadata about assets and video views from yesterday: ```py load_info = pipeline.run(mux_source()) @@ -200,3 +200,4 @@ verified source. 
``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md b/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md index 7934dd0067..d3ed47905f 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md @@ -18,13 +18,13 @@ Resources that can be loaded using this verified source are: | replication_resource | Load published messages from a replication slot | :::info -The postgres replication source currently **does not** suppport the [scd2 merge strategy](../../general-usage/incremental-loading#scd2-strategy). +The Postgres replication source currently **does not** support the [scd2 merge strategy](../../general-usage/incremental-loading#scd2-strategy). ::: -## Setup Guide +## Setup guide ### Setup user -To setup a Postgres user follow these steps: +To set up a Postgres user, follow these steps: 1. The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigned: @@ -40,17 +40,17 @@ To setup a Postgres user follow these steps: ### Set up RDS -To setup a Postgres user on RDS follow these steps: +To set up a Postgres user on RDS, follow these steps: -1. You must enable replication for RDS Postgres instance via [Parameter Group](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PostgreSQL.Replication.ReadReplicas.html) +1. You must enable replication for the RDS Postgres instance via [Parameter Group](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PostgreSQL.Replication.ReadReplicas.html). -2. `WITH LOGIN REPLICATION;` does not work on RDS, instead do: +2. `WITH LOGIN REPLICATION;` does not work on RDS; instead, do: ```sql GRANT rds_replication TO replication_user; ``` -3. Do not fallback to non-SSL connection by setting connection parameters: +3. Do not fallback to a non-SSL connection by setting connection parameters: ```toml sources.pg_replication.credentials="postgresql://loader:password@host.rds.amazonaws.com:5432/dlt_data?sslmode=require&connect_timeout=300" @@ -70,13 +70,13 @@ To get started with your data pipeline, follow these steps: 2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../../dlt-ecosystem/destinations). -3. This source uses `sql_database` source, you can init it as follows: +3. This source uses the `sql_database` source; you can initialize it as follows: ```sh dlt init sql_database duckdb ``` :::note - It is important to note that It is now only required if a user performs an initial load, specifically when `persist_snapshots` is set to `True`. + It is important to note that it is now only required if a user performs an initial load, specifically when `persist_snapshots` is set to `True`. ::: 4. After running these two commands, a new directory will be created with the necessary files and configuration settings to get started. @@ -87,6 +87,7 @@ To get started with your data pipeline, follow these steps: You can omit the `[sql.sources.credentials]` section in `secrets.toml` as it is not required. ::: + ### Add credentials 1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive information securely, like access tokens. Keep this file safe. @@ -162,9 +163,9 @@ def replication_resource( `pub_name`: Publication slot name to publish messages. -`include_columns`: Maps table name(s) to sequence of names of columns to include in the generated data items. Any column not in the sequence is excluded. 
If not provided, all columns are included +`include_columns`: Maps table name(s) to a sequence of names of columns to include in the generated data items. Any column not in the sequence is excluded. If not provided, all columns are included. -`columns`: Maps table name(s) to column hints to apply on the replicated table(s) +`columns`: Maps table name(s) to column hints to apply on the replicated table(s). `target_batch_size`: Desired number of data items yielded in a batch. Can be used to limit the data items in memory. @@ -256,7 +257,7 @@ If you wish to create your own pipelines, you can leverage source and resource m dest_pl.run(changes) ``` -8. To replicate tables with selected columns you can use the `include_columns` argument as follows: +8. To replicate tables with selected columns, you can use the `include_columns` argument as follows: ```py # requires the Postgres user to have the REPLICATION attribute assigned @@ -273,3 +274,4 @@ If you wish to create your own pipelines, you can leverage source and resource m ``` Similarly, to replicate changes from selected columns, you can use the `table_names` and `include_columns` arguments in the `replication_resource` function. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index 85216f3206..8864b3b629 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -14,36 +14,38 @@ and customer relationship management, encompassing sales, marketing, and custome This Salesforce `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/salesforce_pipeline.py) -loads data using “Salesforce API” to the destination of your choice. +loads data using the “Salesforce API” to the destination of your choice. The resources that this verified source supports are: | Name | Mode | Description | |----------------|---------|---------------------------------------------------------------------------------------------------| -| User | replace | refers to an individual who has access to a Salesforce org or instance | -| UserRole | replace | a standard object that represents a role within the organization's hierarchy | -| Lead | replace | prospective customer/individual/org. that has shown interest in a company's products/services | -| Contact | replace | an individual person associated with an account or organization | -| Campaign | replace | marketing initiative or project designed to achieve specific goals, such as generating leads etc. 
| -| Product2 | replace | for managing and organizing your product-related data within the Salesforce ecosystem | -| Pricebook2 | replace | used to manage product pricing and create price books | -| PricebookEntry | replace | an object that represents a specific price for a product in a price book | -| Opportunity | merge | represents a sales opportunity for a specific account or contact | -| OpportunityLineItem | merge | represents individual line items or products associated with an opportunity | -| OpportunityContactRole | merge | represents the association between an Opportunity and a contact | -| Account | merge | individual or organization that interacts with your business | -| CampaignMember | merge | association between a contact or lead and a campaign | -| Task | merge | used to track and manage various activities and tasks within the salesforce platform | -| Event | merge | used to track and manage calendar-based events, such as meetings, appointments calls, or any other time-specific activities | - -* Note that formula fields are included - these function like Views in salesforce and will not be back-updated when their definitions change in Salesforce! The recommended handling is to ignore these fields and reproduce yourself any calculations from the base data fields. - -## Setup Guide +| User | replace | Refers to an individual who has access to a Salesforce org or instance | +| UserRole | replace | A standard object that represents a role within the organization's hierarchy | +| Lead | replace | Prospective customer/individual/org. that has shown interest in a company's products/services | +| Contact | replace | An individual person associated with an account or organization | +| Campaign | replace | Marketing initiative or project designed to achieve specific goals, such as generating leads etc. | +| Product2 | replace | For managing and organizing your product-related data within the Salesforce ecosystem | +| Pricebook2 | replace | Used to manage product pricing and create price books | +| PricebookEntry | replace | An object that represents a specific price for a product in a price book | +| Opportunity | merge | Represents a sales opportunity for a specific account or contact | +| OpportunityLineItem | merge | Represents individual line items or products associated with an opportunity | +| OpportunityContactRole | merge | Represents the association between an Opportunity and a contact | +| Account | merge | Individual or organization that interacts with your business | +| CampaignMember | merge | Association between a contact or lead and a campaign | +| Task | merge | Used to track and manage various activities and tasks within the Salesforce platform | +| Event | merge | Used to track and manage calendar-based events, such as meetings, appointments, calls, or any other time-specific activities | + +* Note that formula fields are included - these function like views in Salesforce and will not be back-updated when their definitions change in Salesforce! The recommended handling is to ignore these fields and reproduce yourself any calculations from the base data fields. + +## Setup guide + + ### Grab credentials To set up your pipeline, you'll need your Salesforce `user_name`, `password`, and `security_token`. -Use your login credentials for user_name and password. +Use your login credentials for `user_name` and `password`. To obtain the `security_token`, follow these steps: @@ -101,9 +103,9 @@ For more information, read the guide on security_token = "please set me up!" 
# Salesforce security token ``` -1. In `secrets.toml`, replace username and password with your Salesforce credentials. +1. In `secrets.toml`, replace `user_name` and `password` with your Salesforce credentials. -1. Update the security_token value with the token you +1. Update the `security_token` value with the token you [copied earlier](salesforce.md#grab-credentials) for secure Salesforce access. 1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to @@ -112,7 +114,7 @@ For more information, read the guide on For more information, read the [General Usage: Credentials.](../../general-usage/credentials) -## Run the pipeline +### Run the pipeline 1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: @@ -141,7 +143,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `salesforce_source`: This function returns a list of resources to load users, user_role, opportunity, -opportunity_line_item, account etc. data from Salesforce API. +opportunity_line_item, account, etc., data from the Salesforce API. ```py @dlt.source(name="salesforce") @@ -149,7 +151,7 @@ def salesforce_source( user_name: str = dlt.secrets.value, password: str = dlt.secrets.value, security_token: str = dlt.secrets.value, -) ->Iterable[DltResource]: +) -> Iterable[DltResource]: ... ``` @@ -175,7 +177,7 @@ destination. | user_role() | contact() | lead() | campaign() | product_2() | pricebook_2() | pricebook_entry() | |-------------|-----------|--------|------------|-------------|---------------|-------------------| -The described functions fetch records from endpoints based on their names, e.g. user_role() accesses +The described functions fetch records from endpoints based on their names, e.g., user_role() accesses the "user_role" endpoint. ### Resource `opportunity` (incremental loading): @@ -197,9 +199,9 @@ def opportunity( ``` `last_timestamp`: Argument that will receive [incremental](../../general-usage/incremental-loading) -state, initialized with "initial_value". It is configured to track "SystemModstamp" field in data -item returned by "get_records" and then yielded. It will store the newest "SystemModstamp" value in -dlt state and make it available in "last_timestamp.last_value" on next pipeline run. +state, initialized with "initial_value". It is configured to track the "SystemModstamp" field in data +items returned by "get_records" and then yielded. It will store the newest "SystemModstamp" value in +dlt state and make it available in "last_timestamp.last_value" on the next pipeline run. Besides "opportunity", there are several resources that use replace mode for data writing to the destination. @@ -214,11 +216,9 @@ opportunity_line_item() accesses the "opportunity_line_item" endpoint. ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods as discussed -above. +If you wish to create your own pipelines, you can leverage source and resource methods as discussed above. -To create your data pipeline using single loading and -[incremental data loading](../../general-usage/incremental-loading), follow these steps: +To create your data pipeline using single loading and [incremental data loading](../../general-usage/incremental-loading), follow these steps: 1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -230,8 +230,7 @@ To create your data pipeline using single loading and ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). 1. To load data from all the endpoints, use the `salesforce_source` method as follows: @@ -243,8 +242,7 @@ To create your data pipeline using single loading and print(load_info) ``` - > A hint ensures that the id column is void of null values. During data loading, dlt will verify - > that the source's id column doesn't contain nulls. + > A hint ensures that the id column is void of null values. During data loading, dlt will verify that the source's id column doesn't contain nulls. 1. To use the method `pipeline.run()` to load custom endpoints “candidates” and “members”: @@ -254,16 +252,9 @@ To create your data pipeline using single loading and print(load_info) ``` - In the initial run, the "opportunity" and "contact" endpoints load all data using 'merge' mode - and 'last_timestamp' set to "None". In subsequent runs, only data after - 'last_timestamp.last_value' (from the previous run) is merged. Incremental loading is specific to - endpoints in merge mode with the “dlt.sources.incremental” parameter. + In the initial run, the "opportunity" and "contact" endpoints load all data using 'merge' mode and 'last_timestamp' set to "None". In subsequent runs, only data after 'last_timestamp.last_value' (from the previous run) is merged. Incremental loading is specific to endpoints in merge mode with the “dlt.sources.incremental” parameter. - > For incremental loading of endpoints, maintain the pipeline name and destination dataset name. - > The pipeline name is important for accessing the [state](../../general-usage/state) from the - > last run, including the end date for incremental data loads. Altering these names could trigger - > a [“dev-mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting - > the metadata tracking for [incremental data loading](../../general-usage/incremental-loading). + > For incremental loading of endpoints, maintain the pipeline name and destination dataset name. The pipeline name is important for accessing the [state](../../general-usage/state) from the last run, including the end date for incremental data loads. Altering these names could trigger a [“dev-mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting the metadata tracking for [incremental data loading](../../general-usage/incremental-loading). 1. To load data from the “contact” in replace mode and “task” incrementally merge mode endpoints: @@ -273,23 +264,13 @@ To create your data pipeline using single loading and print(load_info) ``` - > Note: In the referenced pipeline, the "contact" parameter is always loaded in "replace" mode, - > overwriting existing data. Conversely, the "task" endpoint supports "merge" mode for - > incremental loads, updating or adding data based on the 'last_timestamp' value without erasing - > previously loaded data. + > Note: In the referenced pipeline, the "contact" parameter is always loaded in "replace" mode, overwriting existing data. Conversely, the "task" endpoint supports "merge" mode for incremental loads, updating or adding data based on the 'last_timestamp' value without erasing previously loaded data. -1. 
Salesforce enforces specific limits on API data requests. These limits - vary based on the Salesforce edition and license type, as outlined in the [Salesforce API Request Limits documentation](https://developer.salesforce.com/docs/atlas.en-us.salesforce_app_limits_cheatsheet.meta/salesforce_app_limits_cheatsheet/salesforce_app_limits_platform_api.htm). +1. Salesforce enforces specific limits on API data requests. These limits vary based on the Salesforce edition and license type, as outlined in the [Salesforce API Request Limits documentation](https://developer.salesforce.com/docs/atlas.en-us.salesforce_app_limits_cheatsheet.meta/salesforce_app_limits_cheatsheet/salesforce_app_limits_platform_api.htm). - To limit the number of Salesforce API data requests, developers can control the environment for production or - development purposes. For development, you can set the `IS_PRODUCTION` variable - to `False` in "[salesforce/settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/salesforce/settings.py)", - which limits API call requests to 100. To modify this limit, you can update the query limit in - "[salesforce/helpers.py](https://github.com/dlt-hub/verified-sources/blob/756edaa00f56234cd06699178098f44c16d6d597/sources/salesforce/helpers.py#L56)" - as required. + To limit the number of Salesforce API data requests, developers can control the environment for production or development purposes. For development, you can set the `IS_PRODUCTION` variable to `False` in "[salesforce/settings.py](https://github.com/dlt-hub/verified-sources/blob/master/sources/salesforce/settings.py)", which limits API call requests to 100. To modify this limit, you can update the query limit in "[salesforce/helpers.py](https://github.com/dlt-hub/verified-sources/blob/756edaa00f56234cd06699178098f44c16d6d597/sources/salesforce/helpers.py#L56)" as required. - >To read more about Salesforce query limits, please refer to their official - >[documentation here](https://developer.salesforce.com/docs/atlas.en-us.soql_sosl.meta/soql_sosl/sforce_api_calls_soql_select_limit.htm). + > To read more about Salesforce query limits, please refer to their official [documentation here](https://developer.salesforce.com/docs/atlas.en-us.soql_sosl.meta/soql_sosl/sforce_api_calls_soql_select_limit.htm). diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md index ae526668f2..fe11491bd6 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/shopify.md @@ -16,7 +16,7 @@ referrals. This Shopify `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/shopify_dlt_pipeline.py) -loads data using 'Shopify API' or 'Shopify Partner API' to the destination of your choice. +loads data using the 'Shopify API' or 'Shopify Partner API' to the destination of your choice. 
The resources that this verified source supports are: @@ -25,14 +25,14 @@ The resources that this verified source supports are: | customers | Individuals or entities who have created accounts on a Shopify-powered online store | | orders | Transactions made by customers on an online store | | products | The individual items or goods that are available for sale | -| shopify_partner_query | To query data using GraphQL queries from Shopify partner API | +| shopify_partner_query | To query data using GraphQL queries from the Shopify partner API | -## Setup Guide +## Setup guide ### Grab credentials #### Grab Admin API access token -To load data using Shopify API, you need an Admin API access token. This token can be obtained by following +To load data using the Shopify API, you need an Admin API access token. This token can be obtained by following these steps: 1. Log in to Shopify. @@ -44,10 +44,10 @@ these steps: 1. Grant read access in “Admin API access scopes.” 1. Save the configuration. 1. Hit “Install app” and confirm. -1. Reveal and copy the Admin API token. Store safely; it's shown only once. +1. Reveal and copy the Admin API token. Store it safely; it's shown only once. #### Grab Partner API access token -To load data using Shopify Partner API, you need an Partner API access token. This token can be obtained by following +To load data using the Shopify Partner API, you need a Partner API access token. This token can be obtained by following these steps: 1. Log in to Shopify Partners and click the settings icon⚙️ at the bottom left. @@ -55,7 +55,7 @@ these steps: 1. Create an API client with a suitable name and assign necessary permissions. 1. Save and create the API client, then click to show and copy the access token securely. -> Note: The Shopify and Shopify Partner UI, described here might change. +> Note: The Shopify and Shopify Partner UI, described here, might change. The full guide is available at [this link.](https://www.shopify.com/partners/blog/17056443-how-to-generate-a-shopify-api-token) ### Initialize the verified source @@ -83,9 +83,7 @@ For more information, read the guide on [how to add a verified source](../../wal ### Add credential -1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can - securely store your access tokens and other sensitive information. It's important to handle this - file with care and keep it safe. +1. Inside the `.dlt` folder, you'll find a file called `secrets.toml`, which is where you can securely store your access tokens and other sensitive information. It's important to handle this file with care and keep it safe. Here's what the file looks like: @@ -93,14 +91,14 @@ For more information, read the guide on [how to add a verified source](../../wal #shopify [sources.shopify_dlt] private_app_password="Please set me up!" #Admin API access token - access_token=" Please set me up!" #Partner API acess token + access_token="Please set me up!" #Partner API access token ``` 1. Update `private_app_password` with the "Admin API access token". 1. Similarly, update the `access_token` with the "Partner API access token". - >To load data using Shopify API, update the `private_app_password`. - >To load data using Shopify partner API, update the `access_token`. + >To load data using the Shopify API, update the `private_app_password`. + >To load data using the Shopify partner API, update the `access_token`. 1. Next, store your pipeline configuration details in the `.dlt/config.toml`. 
@@ -108,26 +106,21 @@ For more information, read the guide on [how to add a verified source](../../wal ```toml [sources.shopify_dlt] - shop_url = "Please set me up !" + shop_url = "Please set me up!" organization_id = "Please set me up!" ``` -1. Update `shop_url` with the URL of your Shopify store. For example, - "https://shop-123.myshopify.com/%E2%80%9D". +1. Update `shop_url` with the URL of your Shopify store. For example, "https://shop-123.myshopify.com/". -1. Update `organization_id` with a code from your Shopify partner URL. For example in - "https://partners.shopify.com/1234567", the code '1234567' is the organization ID. +1. Update `organization_id` with a code from your Shopify partner URL. For example, in "https://partners.shopify.com/1234567", the code '1234567' is the organization ID. -1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to - add credentials for your chosen destination, ensuring proper routing of your data to the final - destination. +1. Next, follow the [destination documentation](../../dlt-ecosystem/destinations) instructions to add credentials for your chosen destination, ensuring proper routing of your data to the final destination. For more information, read the [General Usage: Credentials.](../../general-usage/credentials) ## Run the pipeline -1. Before running the pipeline, ensure that you have installed all the necessary dependencies by - running the command: +1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command: ```sh pip install -r requirements.txt ``` @@ -135,25 +128,21 @@ For more information, read the [General Usage: Credentials.](../../general-usage ```sh python shopify_dlt_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using - the following command: +1. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh dlt pipeline show ``` - For example, the `pipeline_name` for the above pipeline example is `shopify_data`, you may also - use any custom name instead. + For example, the `pipeline_name` for the above pipeline example is `shopify_data`, you may also use any custom name instead. For more information, read the guide on [how to run a pipeline](../../walkthroughs/run-a-pipeline). ## Sources and resources -`dlt` works on the principle of [sources](../../general-usage/source) and -[resources](../../general-usage/resource). +`dlt` works on the principle of [sources](../../general-usage/source) and [resources](../../general-usage/resource). ### Source `shopify_source`: -This function returns a list of resources to load products, orders, and customers data from Shopify -API. +This function returns a list of resources to load products, orders, and customers data from the Shopify API. ```py def shopify_source( @@ -177,11 +166,9 @@ def shopify_source( `items_per_page`: Max items fetched per page (Default: 250). -`start_date`: Imports items updated since this date (Default: 2000-01-01). Used for incremental -loading if end_time isn't specified. Accepts ISO 8601 date/datetime formats. +`start_date`: Imports items updated since this date (Default: 2000-01-01). Used for incremental loading if end_time isn't specified. Accepts ISO 8601 date/datetime formats. -`end_time`: Data load range end time. Paired with start_date for specified time range. Enables -incremental loading if unspecified. 
+`end_time`: Data load range end time. Paired with start_date for specified time range. Enables incremental loading if unspecified. `created_at_min`: Load items created since this date (Default: 2000-01-01). @@ -189,8 +176,7 @@ incremental loading if unspecified. ### Resource `products`: -This resource loads products from your Shopify shop into the destination. It supports incremental -loading and pagination. +This resource loads products from your Shopify shop into the destination. It supports incremental loading and pagination. ```py @dlt.resource(primary_key="id", write_disposition="merge") @@ -211,8 +197,7 @@ def products( `updated_at`: The saved [state](../../general-usage/state) of the last 'updated_at' value. -Similar to the mentioned resource, there are two more resources "orders" and "customers", both -support incremental loading and pagination. +Similar to the mentioned resource, there are two more resources "orders" and "customers", both support incremental loading and pagination. ### Resource `shopify_partner_query`: This resource can be used to run custom GraphQL queries to load paginated data. @@ -236,7 +221,7 @@ def shopify_partner_query( `data_items_path`: JSONPath to array items in query results. -`pagination_cursor_path`: The JSONPath to the pagination cursor in the query result, will be piped to the next query via variables. +`pagination_cursor_path`: The JSONPath to the pagination cursor in the query result, which will be piped to the next query via variables. `pagination_variable_name`: The name of the variable to pass the pagination cursor to. @@ -246,14 +231,15 @@ def shopify_partner_query( `organization_id`: Your Organization ID, found in the Partner Dashboard. -`api_version`: The API version to use (e.g. 2024-01). Use `unstable` for the latest version. +`api_version`: The API version to use (e.g., 2024-01). Use `unstable` for the latest version. ## Customization + + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. 1. Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -265,10 +251,9 @@ verified source. ) ``` - To read more about pipeline configuration, please refer to our - [documentation](../../general-usage/pipeline). + To read more about pipeline configuration, please refer to our [documentation](../../general-usage/pipeline). -1. To load data from "products", "orders" and "customers" from 1st Jan 2023. +1. To load data from "products", "orders", and "customers" from January 1, 2023: ```py # Add your desired resources to the list... @@ -280,9 +265,7 @@ verified source. print(load_info) ``` -1. To load past Shopify orders in weekly chunks using start_date and end_date parameters. This - minimizes potential failure during large data loads. Running chunks and incremental loads in - parallel accelerates the initial load. +1. To load past Shopify orders in weekly chunks using start_date and end_date parameters. This minimizes potential failure during large data loads. Running chunks and incremental loads in parallel accelerates the initial load. ```py # Load all orders from 2023-01-01 to now @@ -315,7 +298,7 @@ verified source. ) print(load_info) ``` -1. To load the first 10 transactions via GraphQL query from the Shopify Partner API. +1. 
To load the first 10 transactions via a GraphQL query from the Shopify Partner API. ```py # Construct query to load transactions 100 per page, the `$after` variable is used to paginate query = """query Transactions($after: String) { @@ -330,7 +313,7 @@ verified source. } """ - # Configure the resource with the query and json paths to extract the data and pagination cursor + # Configure the resource with the query and JSON paths to extract the data and pagination cursor resource = shopify_partner_query( query, # JSON path pointing to the data item in the results @@ -346,3 +329,4 @@ verified source. ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/index.md index a8146c75fe..22f9c23a06 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/index.md @@ -5,12 +5,11 @@ keywords: [sql connector, sql database pipeline, sql database] --- import Header from '../_source-info-header.md'; -# 30+ SQL Databases +# 30+ SQL databases
-SQL databases are management systems (DBMS) that store data in a structured format, commonly used -for efficient and reliable data retrieval. +SQL databases are management systems (DBMS) that store data in a structured format, commonly used for efficient and reliable data retrieval. The SQL Database verified source loads data to your specified destination using one of the following backends: SQLAlchemy, PyArrow, pandas, or ConnectorX. @@ -18,7 +17,7 @@ Sources and resources that can be loaded using this verified source are: | Name | Description | | ------------ | -------------------------------------------------------------------- | -| sql_database | Reflects the tables and views in SQL database and retrieves the data | +| sql_database | Reflects the tables and views in an SQL database and retrieves the data | | sql_table | Retrieves data from a particular SQL database table | | | | @@ -47,5 +46,6 @@ We support all [SQLAlchemy dialects](https://docs.sqlalchemy.org/en/20/dialects/ * Teradata Vantage :::note -Note that there many unofficial dialects, such as [DuckDB](https://duckdb.org/). -::: \ No newline at end of file +Note that there are many unofficial dialects, such as [DuckDB](https://duckdb.org/). +::: + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/setup.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/setup.md index a91ae40028..5af23570bb 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/setup.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/setup.md @@ -10,9 +10,9 @@ import Header from '../_source-info-header.md';
-To connect to your SQL database using `dlt` follow these steps: +To connect to your SQL database using `dlt`, follow these steps: -1. Initialize a `dlt` project in the current working directory by running the following command: +1. Initialize a `dlt` project in the current working directory by running the following command: ```sh dlt init sql_database duckdb @@ -43,11 +43,11 @@ If you'd like to use a different destination, simply replace `duckdb` with the n credentials="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ``` - To learn more about how to add credentials into your `sql_database` pipeline see [here](./configuration#configuring-the-connection). + To learn more about how to add credentials into your `sql_database` pipeline, see [here](./configuration#configuring-the-connection). 3. Add credentials for your destination (if necessary) - Depending on which [destination](../../destinations) you're loading into, you might also need to add your destination credentials. For more information read the [General Usage: Credentials.](../../../general-usage/credentials) + Depending on which [destination](../../destinations) you're loading into, you might also need to add your destination credentials. For more information, read the [General Usage: Credentials.](../../../general-usage/credentials) 4. Install any necessary dependencies @@ -61,7 +61,7 @@ If you'd like to use a different destination, simply replace `duckdb` with the n python sql_database_pipeline.py ``` - Executing this command will run the example script `sql_database_pipeline.py` created in step 1. In order for this to run successfully you will need to pass the names of the databases and/or tables you wish to load. + Executing this command will run the example script `sql_database_pipeline.py` created in step 1. In order for this to run successfully, you will need to pass the names of the databases and/or tables you wish to load. See the [section on configuring the sql_database source](./configuration#configuring-the-sql-database-source) for more details. @@ -73,4 +73,5 @@ If you'd like to use a different destination, simply replace `duckdb` with the n :::note The pipeline_name for the above example is `rfam`, you may also use any custom name instead. - ::: \ No newline at end of file + ::: + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md index 33986fb5a6..d0930716d8 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/troubleshooting.md @@ -15,7 +15,7 @@ import Header from '../_source-info-header.md'; #### Connecting to MySQL with SSL Here, we use the `mysql` and `pymysql` dialects to set up an SSL connection to a server, with all information taken from the [SQLAlchemy docs](https://docs.sqlalchemy.org/en/14/dialects/mysql.html#ssl-connections). -1. To enforce SSL on the client without a client certificate you may pass the following DSN: +1. To enforce SSL on the client without a client certificate, you may pass the following DSN: ```toml sources.sql_database.credentials="mysql+pymysql://root:@:3306/mysql?ssl_ca=" @@ -38,22 +38,22 @@ Here, we use the `mysql` and `pymysql` dialects to set up an SSL connection to a **To connect to an `mssql` server using Windows authentication**, include `trusted_connection=yes` in the connection string. 
```toml
-sources.sql_database.credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server"
+sources.sql_database.credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server"
```
 
-**To connect to a local sql server instance running without SSL** pass `encrypt=no` parameter:
+**To connect to a local SQL server instance running without SSL**, pass the `encrypt=no` parameter:
```toml
-sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?encrypt=no&driver=ODBC+Driver+17+for+SQL+Server"
+sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?encrypt=no&driver=ODBC+Driver+17+for+SQL+Server"
```
 
-**To allow self signed SSL certificate** when you are getting `certificate verify failed:unable to get local issuer certificate`:
+**To allow a self-signed SSL certificate** when you are getting `certificate verify failed: unable to get local issuer certificate`:
```toml
-sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?TrustServerCertificate=yes&driver=ODBC+Driver+17+for+SQL+Server"
+sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?TrustServerCertificate=yes&driver=ODBC+Driver+17+for+SQL+Server"
```
 
 **To use long strings (>8k) and avoid collation errors**:
```toml
-sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?LongAsMax=yes&driver=ODBC+Driver+17+for+SQL+Server"
+sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?LongAsMax=yes&driver=ODBC+Driver+17+for+SQL+Server"
```
 
 ## Troubleshooting backends
 
 ### Notes on specific databases
 
 #### Oracle
-1. When using the `oracledb` dialect in thin mode we are getting protocol errors. Use thick mode or `cx_oracle` (old) client.
+1. When using the `oracledb` dialect in thin mode, we are getting protocol errors. Use thick mode or the `cx_oracle` (old) client.
 2. Mind that `SQLAlchemy` translates Oracle identifiers into lower case! Keep the default `dlt` naming convention (`snake_case`) when loading data. We'll support more naming conventions soon.
 3. `Connectorx` is for some reason slower for Oracle than the `PyArrow` backend.
 
See [here](https://github.com/dlt-hub/sql_database_benchmarking/tree/main/oracle#installing-and-setting-up-oracle) for information and code on setting up and benchmarking on oracle.
 
 #### DB2
 1. Mind that `SQLAlchemy` translates DB2 identifiers into lower case! Keep the default `dlt` naming convention (`snake_case`) when loading data. We'll support more naming conventions soon.
-2. The DB2 type `DOUBLE` gets incorrectly mapped to the python type `float` (instead of the `SqlAlchemy` type `Numeric` with default precision). This requires `dlt` to perform additional casts. The cost of the cast, however, is minuscule compared to the cost of reading rows from database.
+2. The DB2 type `DOUBLE` gets incorrectly mapped to the Python type `float` (instead of the `SQLAlchemy` type `Numeric` with default precision). This requires `dlt` to perform additional casts. The cost of the cast, however, is minuscule compared to the cost of reading rows from the database.
 
-See [here](https://github.com/dlt-hub/sql_database_benchmarking/tree/main/db2#installing-and-setting-up-db2) for information and code on setting up and benchmarking on db2. 
+See [here](https://github.com/dlt-hub/sql_database_benchmarking/tree/main/db2#installing-and-setting-up-db2) for information and code on setting up and benchmarking on DB2. #### MySQL -1. The `SqlAlchemy` dialect converts doubles to decimals. (This can be disabled via the table adapter argument as shown in the code example [here](./configuration#pyarrow)) +1. The `SQLAlchemy` dialect converts doubles to decimals. (This can be disabled via the table adapter argument as shown in the code example [here](./configuration#pyarrow)) #### Postgres / MSSQL -No issues were found for these databases. Postgres is the only backend where we observed 2x speedup with `ConnectorX` (see [here](https://github.com/dlt-hub/sql_database_benchmarking/tree/main/postgres) for the benchmarking code). On other db systems it performs the same as (or some times worse than) the `PyArrow` backend. - +No issues were found for these databases. Postgres is the only backend where we observed a 2x speedup with `ConnectorX` (see [here](https://github.com/dlt-hub/sql_database_benchmarking/tree/main/postgres) for the benchmarking code). On other db systems, it performs the same as (or sometimes worse than) the `PyArrow` backend. + ### Notes on specific data types #### JSON -In the `SQLAlchemy` backend JSON data type is represented as a Python object, and in the `PyArrow` backend, it is represented as a JSON string. At present it does not work correctly with `pandas` and `ConnectorX`which cast Python objects to `str`, generating invalid JSON strings that cannot be loaded into destination. +In the `SQLAlchemy` backend, the JSON data type is represented as a Python object, and in the `PyArrow` backend, it is represented as a JSON string. At present, it does not work correctly with `pandas` and `ConnectorX`, which cast Python objects to `str`, generating invalid JSON strings that cannot be loaded into the destination. #### UUID -UUIDs are represented as string by default. You can switch this behavior by using `table_adapter_callback` to modify properties of the UUID type for a particular column. (See the code example [here](./configuration#pyarrow) for how to modify the data type properties of a particular column.) \ No newline at end of file +UUIDs are represented as strings by default. You can switch this behavior by using `table_adapter_callback` to modify properties of the UUID type for a particular column. (See the code example [here](./configuration#pyarrow) for how to modify the data type properties of a particular column.) + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md index bb2f39b007..bdc440630d 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/usage.md @@ -12,7 +12,7 @@ import Header from '../_source-info-header.md'; ## Applying column-wise filtering on the data being ingested -By default, the existing source and resource functions, `sql_database` and `sql_table`, ingest all of the records from the source table. But by using `query_adapter_callback`, it is possible to pass a `WHERE` clause inside the underlying `SELECT` statement using the [SQLAlchemy syntax](https://docs.sqlalchemy.org/en/14/core/selectable.html#). Thich enables filtering the data based on specific columns before extract. 
+By default, the existing source and resource functions, `sql_database` and `sql_table`, ingest all of the records from the source table. However, by using `query_adapter_callback`, it is possible to pass a `WHERE` clause inside the underlying `SELECT` statement using the [SQLAlchemy syntax](https://docs.sqlalchemy.org/en/14/core/selectable.html#). This enables filtering the data based on specific columns before extraction. The example below uses `query_adapter_callback` to filter on the column `customer_id` for the table `orders`: @@ -32,11 +32,10 @@ source = sql_database( ``` ## Transforming the data before load -You have direct access to the extracted data through the resource objects (`sql_table()` or `sql_database().with_resource())`), each of which represents a single SQL table. These objects are generators that yield -individual rows of the table which can be modified by using custom python functions. These functions can be applied to the resource using `add_map`. +You have direct access to the extracted data through the resource objects (`sql_table()` or `sql_database().with_resource())`), each of which represents a single SQL table. These objects are generators that yield individual rows of the table, which can be modified by using custom Python functions. These functions can be applied to the resource using `add_map`. :::note -The PyArrow backend does not yield individual rows rather loads chunks of data as `ndarray`. In this case, the transformation function that goes into `add_map` should be configured to expect an `ndarray` input. +The PyArrow backend does not yield individual rows but loads chunks of data as `ndarray`. In this case, the transformation function that goes into `add_map` should be configured to expect an `ndarray` input. ::: @@ -50,7 +49,7 @@ Examples: def pseudonymize_name(doc): ''' - Pseudonmyisation is a deterministic type of PII-obscuring + Pseudonymization is a deterministic type of PII-obscuring. Its role is to allow identifying users by their hash, without revealing the underlying info. ''' @@ -99,10 +98,11 @@ Examples: ## Deploying the sql_database pipeline -You can deploy the `sql_database` pipeline with any of the `dlt` deployment methods, such as [GitHub Actions](../../../walkthroughs/deploy-a-pipeline/deploy-with-github-actions), [Airflow](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [Dagster](../../../walkthroughs/deploy-a-pipeline/deploy-with-dagster) etc. See [here](../../../walkthroughs/deploy-a-pipeline) for a full list of deployment methods. +You can deploy the `sql_database` pipeline with any of the `dlt` deployment methods, such as [GitHub Actions](../../../walkthroughs/deploy-a-pipeline/deploy-with-github-actions), [Airflow](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer), [Dagster](../../../walkthroughs/deploy-a-pipeline/deploy-with-dagster), etc. See [here](../../../walkthroughs/deploy-a-pipeline) for a full list of deployment methods. ### Running on Airflow When running on Airflow: -1. Use the `dlt` [Airflow Helper](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) to create tasks from the `sql_database` source. (If you want to run table extraction in parallel, then you can do this by setting `decompose = "parallel-isolated"` when doing the source->DAG conversion. See [here](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file) for code example.) -2. Reflect tables at runtime with `defer_table_reflect` argument. +1. 
Use the `dlt` [Airflow Helper](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) to create tasks from the `sql_database` source. (If you want to run table extraction in parallel, you can do this by setting `decompose = "parallel-isolated"` when doing the source->DAG conversion. See [here](../../../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer#2-modify-dag-file) for a code example.) +2. Reflect tables at runtime with the `defer_table_reflect` argument. 3. Set `allow_external_schedulers` to load data using [Airflow intervals](../../../general-usage/incremental-loading.md#using-airflow-schedule-for-backfill-and-incremental-loading). + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index a9d70c338c..3718ab7110 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -17,7 +17,7 @@ you'll ingest to transfer data to your warehouse. This Strapi `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/strapi_pipeline.py) -loads data using “Strapi API” to the destination of your choice. +loads data using the “Strapi API” to the destination of your choice. Sources and resources that can be loaded using this verified source are: @@ -25,7 +25,7 @@ Sources and resources that can be loaded using this verified source are: | ------------- | -------------------------- | | strapi_source | Retrieves data from Strapi | -## Setup Guide +## Setup guide ### Grab API token @@ -86,7 +86,7 @@ For more information, read the guide on [how to add a verified source](../../wal 1. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). -For more information, read the [General Usage: Credentials.](../../general-usage/credentials) +For more information, read the [General usage: Credentials.](../../general-usage/credentials) ## Run the pipeline @@ -126,7 +126,7 @@ For more information, read the guide on [how to run a pipeline](../../walkthroug ### Source `strapi_source` -This function retrives data from Strapi. +This function retrieves data from Strapi. ```py @dlt.source @@ -176,3 +176,4 @@ verified source. > requirements. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md index 73565f7e94..1d6f59dd3e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/workable.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/workable.md @@ -10,12 +10,11 @@ import Header from './_source-info-header.md';
[Workable](https://www.workable.com/) is an online platform for posting jobs and managing the hiring process. With Workable, -employers can create job listings, receive applications, track candidates, collaborate with team -members, schedule interviews, and manage the overall hiring workflow. +employers can create job listings, receive applications, track candidates, collaborate with team members, schedule interviews, and manage the overall hiring workflow. This Workable `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/workable_pipeline.py) -loads data using “Workable API” to the destination of your choice. +loads data using the “Workable API” to the destination of your choice. ### Default endpoints @@ -23,32 +22,34 @@ This verified source loads data from the following default endpoints: | Name | Description | | ----------------- | ------------------------------------------------------------------------------------- | -| members | individuals who have access to your Workable account | -| recruiters | individuals who are responsible for managing the hiring and recruitment processes | -| stages | represent the different steps or phases in the hiring process for a job position | -| requisitions | formal request made by an organization to fill a specific job opening or position | -| jobs | individual job postings or job listings created by employers or recruiters | -| custom_attributes | additional fields or data points that you can define and assign to candidates or jobs | -| events | specific occurrences or actions related to the hiring and recruitment process | -| candidates | individuals who have applied for job positions within an organization | +| members | Individuals who have access to your Workable account | +| recruiters | Individuals who are responsible for managing the hiring and recruitment processes | +| stages | Represent the different steps or phases in the hiring process for a job position | +| requisitions | Formal requests made by an organization to fill a specific job opening or position | +| jobs | Individual job postings or job listings created by employers or recruiters | +| custom_attributes | Additional fields or data points that you can define and assign to candidates or jobs | +| events | Specific occurrences or actions related to the hiring and recruitment process | +| candidates | Individuals who have applied for job positions within an organization | ### Dependent endpoints -Besides the main endpoints, for "candidate" and "jobs" endpoints, the following are their dependent endpoints: +Besides the main endpoints, for the "candidate" and "jobs" endpoints, the following are their dependent endpoints: | Name | Dependent endpoints | | --------------------------------- | -------------------------------------------------------------------------------------------------- | -| candidates/:id/activities | retrieve activities or events related to the candidate's interaction with the hiring process. 
| -| candidates/:id/offer | a specific candidate's offer information | -| jobs/:shortcode/activities | activities associated with a particular job posting identified by its shortcode | -| jobs/:shortcode/application_form | application form details for a specified job | -| jobs/:shortcode/questions | retrieve the interview questions associated with a specific job posting | -| jobs/:shortcode/stages | retrieve information about the hiring stages associated with a particular job | -| jobs/:shortcode/custom_attributes | retrieve custom attributes associated with a particular job posting | -| jobs/:shortcode/members | retrieve information about the members associated with a particular job within the Workable system | -| jobs/:shortcode/recruiters | retrieve the list of recruiters associated with a particular job. | - -## Setup Guide +| candidates/:id/activities | Retrieve activities or events related to the candidate's interaction with the hiring process. | +| candidates/:id/offer | A specific candidate's offer information | +| jobs/:shortcode/activities | Activities associated with a particular job posting identified by its shortcode | +| jobs/:shortcode/application_form | Application form details for a specified job | +| jobs/:shortcode/questions | Retrieve the interview questions associated with a specific job posting | +| jobs/:shortcode/stages | Retrieve information about the hiring stages associated with a particular job | +| jobs/:shortcode/custom_attributes | Retrieve custom attributes associated with a particular job posting | +| jobs/:shortcode/members | Retrieve information about the members associated with a particular job within the Workable system | +| jobs/:shortcode/recruiters | Retrieve the list of recruiters associated with a particular job. | + +## Setup guide + + ### Grab API credentials @@ -101,7 +102,7 @@ For more information, read the guide on [how to add a verified source.](../../wa [you copied above](workable.md#grab-api-credentials). This will ensure that your data pipeline example can access your Workable resources securely. -1. Next you need to configure ".dlt/config.toml", which looks like: +1. Next, you need to configure ".dlt/config.toml", which looks like: ```toml [sources.workable] @@ -184,8 +185,8 @@ def workable_source( `start_date`: Optional. Sets a data retrieval start date; defaults to January 1, 2000. -`load_details`: A boolean parameter. Set to true to load dependent endpoints with main ones (”jobs” -& “candidates”). +`load_details`: A boolean parameter. Set to true to load dependent endpoints with main ones ("jobs" +& "candidates"). ### Resource `candidate_resource` @@ -205,14 +206,12 @@ def candidates_resource( 1, 2000 if undefined. ## Customization + ### Create your own pipeline -If you wish to create your own pipelines, you can leverage source and resource methods from this -verified source. +If you wish to create your own pipelines, you can leverage source and resource methods from this verified source. -To create your data pipeline using single loading and -[incremental data loading](../../general-usage/incremental-loading) (only for the -**Candidates** endpoint), follow these steps: +To create your data pipeline using single loading and [incremental data loading](../../general-usage/incremental-loading) (only for the **Candidates** endpoint), follow these steps: 1. 
Configure the pipeline by specifying the pipeline name, destination, and dataset as follows: @@ -232,8 +231,7 @@ To create your data pipeline using single loading and print(load_info) ``` - > Note: In the run, the "candidates" endpoint loads incrementally via 'merge' mode using - > 'updated_by'. All other endpoints load in 'replace' mode. + > Note: In the run, the "candidates" endpoint loads incrementally via 'merge' mode using 'updated_by'. All other endpoints load in 'replace' mode. 1. To load data from a specific date, including dependent endpoints: @@ -257,8 +255,7 @@ To create your data pipeline using single loading and > Note: "candidates" loads incrementally in merge mode, while "members" uses replace mode. -1. To load data from the “jobs” endpoint and its dependent endpoints like "activities" and - "application_form": +1. To load data from the “jobs” endpoint and its dependent endpoints like "activities" and "application_form": ```py load_data = workable_source(start_date=datetime(2022, 2, 1), load_details=True) @@ -268,12 +265,7 @@ To create your data pipeline using single loading and ``` > Note: "load_details" parameter is set to True. -1. To use incremental loading for the candidates endpoint, maintain the same pipeline and - destination dataset names. The pipeline name helps retrieve the - [state](../../general-usage/state) of the last run, essential for incremental - data loading. Changing these names might trigger a - [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), - disrupting metadata tracking for - [incremental data loading](../../general-usage/incremental-loading). +1. To use incremental loading for the candidates endpoint, maintain the same pipeline and destination dataset names. The pipeline name helps retrieve the [state](../../general-usage/state) of the last run, essential for incremental data loading. Changing these names might trigger a [“dev_mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting metadata tracking for [incremental data loading](../../general-usage/incremental-loading). + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index cfccf5d675..b34bc83087 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -15,7 +15,7 @@ analytics, and talks. This Zendesk `dlt` verified source and [pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/zendesk_pipeline.py) -loads data using “Zendesk Support API”, "Zendesk Chat API" and "Zendesk Talk API" to the destination +loads data using the “Zendesk Support API”, "Zendesk Chat API", and "Zendesk Talk API" to the destination of your choice. 
Endpoints that can be loaded using this verified source are: @@ -23,24 +23,24 @@ Endpoints that can be loaded using this verified source are: | Name | Description | | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | SUPPORT_ENDPOINTS | "users", "sla_policies", "groups", "organizations", "brands" | -| SUPPORT_EXTRA_ENDPOINTS | "activities", "automations", "custom_agent_roles", "dynamic_content", "group_memberships",
"job_status","macros", "organization_fields", "organization_memberships", "recipient_addresses",
"requests" , "satisfaction_ratings", "sharing_agreements", "skips", "suspended_tickets","targets",
"ticket_forms", "ticket_metrics", "triggers", "user_fields", "views", "tags" | +| SUPPORT_EXTRA_ENDPOINTS | "activities", "automations", "custom_agent_roles", "dynamic_content", "group memberships",
"job_status", "macros", "organization_fields", "organization memberships", "recipient_addresses",
"requests", "satisfaction_ratings", "sharing_agreements", "skips", "suspended_tickets", "targets",
"ticket_forms", "ticket_metrics", "triggers", "user_fields", "views", "tags" | | TALK_ENDPOINTS | "calls", "addresses", "greeting_categories", "greetings", "ivrs",
"phone_numbers", "settings", "lines", "agents_activity" | | INCREMENTAL_TALK_ENDPOINTS | "calls", "logs" | -> To the get the complete list of endpoints, please refer to +> To get the complete list of endpoints, please refer to > ["zendesk/settings.py".](https://github.com/dlt-hub/verified-sources/blob/master/sources/zendesk/settings.py) -## Setup Guide +## Setup guide ### Grab credentials -You can load data from three types of Zendesk services, that are : +You can load data from three types of Zendesk services, which are: - Zendesk Support - Zendesk Chat - Zendesk Talk -### Zendesk Support +### Zendesk support Zendesk support can be authenticated using one of the following three methods: @@ -52,7 +52,7 @@ The simplest way to authenticate is via subdomain + email address + password, si are already available and you don't have to generate any tokens. Alternatively, you can also use API tokens or OAuth tokens. -#### Grab Subdomain +#### Grab subdomain 1. Log into Zendesk to find your subdomain in the URL. E.g., for https://www.dlthub.zendesk.com, the subdomain is "dlthub". @@ -60,7 +60,7 @@ tokens or OAuth tokens. #### Grab Zendesk support API token 1. In Zendesk (top right), select Admin Center. -1. Choose "Apps and Integrations. +1. Choose "Apps and Integrations". 1. Navigate to APIs and select Zendesk API. Activate “Password access” & “Token access”. 1. Click “Add API token”, add a description, and note down the API token. 1. The token displays just once; note it safely. @@ -78,8 +78,8 @@ Here's a summarized version: 1. Alternatively, fetch client ID via OAuth [using this](https://developer.zendesk.com/documentation/ticketing/working-with-oauth/creating-and-using-oauth-tokens-with-the-api/#getting-an-oauth-clients-id). -1. To get full token using the client id obtained above, you can follow the [instructions - here.](https://developer.zendesk.com/documentation/ticketing/working-with-oauth/creating-and-using-oauth-tokens-with-the-api/#creating-the-access-token) +1. To get the full token using the client ID obtained above, you can follow the [instructions + here](https://developer.zendesk.com/documentation/ticketing/working-with-oauth/creating-and-using-oauth-tokens-with-the-api/#creating-the-access-token). ```sh curl https://{subdomain}.zendesk.com/api/v2/oauth/tokens.json \ @@ -98,10 +98,10 @@ Here's a summarized version: > We've set the scope as 'read', but you can customize the scope as needed. -1. In response to the above request you'll get a full token which can be used to configure Zendesk +1. In response to the above request, you'll get a full token which can be used to configure Zendesk support. -### Zendesk Chat +### Zendesk chat Zendesk chat can be authenticated using this method: @@ -116,13 +116,13 @@ subdomain is "dlthub". #### Grab Zendesk chat OAuth token -To generate Zendesk chat OAuth token, please refer to this +To generate a Zendesk chat OAuth token, please refer to this [documentation](https://support.zendesk.com/hc/en-us/articles/4408828740762-Chat-API-tutorial-Generating-an-OAuth-token-integrated-Chat-accounts-#:~:text=Create%20the%20OAuth%20API%20client,-First%20of%20all&text=Go%20to%20Zendesk%20Chat%20%3E%20Account,Client%20to%20finish%20the%20setup) . Below is a summary of the steps: 1. Access Zendesk Chat directly or through the top right "Chat" option in Zendesk product. 1. Navigate to "Settings" > "Account" > "API" > "Add API client". -1. Fill in client name, company, and redirect URLs (default: http://localhost:8080). +1. 
Fill in the client name, company, and redirect URLs (default: http://localhost:8080). 1. Record the "CLIENT_ID" and "SUBDOMAIN". 1. Format the below URL with your own CLIENT_ID and SUBDOMAIN, paste it into a new browser tab, and press Enter. @@ -136,12 +136,12 @@ To generate Zendesk chat OAuth token, please refer to this returned in the browser's URL field then it worked! ![Zendesk Chat](docs_images/Zendesk_chat_access_token.jpg) 1. Safely store the OAuth token to authenticate Zendesk Chat for retrieving data. -1. There are several other methods to obtain Zendesk chat token as given in the full +1. There are several other methods to obtain a Zendesk chat token as given in the full [documentation here.](https://support.zendesk.com/hc/en-us/articles/4408828740762-Chat-API-tutorial-Generating-an-OAuth-token-integrated-Chat-accounts-#:~:text=Create%20the%20OAuth%20API%20client,-First%20of%20all&text=Go%20to%20Zendesk%20Chat%20%3E%20Account,Client%20to%20finish%20the%20setup.) -### Zendesk Talk +### Zendesk talk -Zendesk talk fetches the data using Zendesk Tolk API. +Zendesk Talk fetches the data using the Zendesk Talk API. 1. Obtaining credentials for Zendesk Talk mirrors the process for [Zendesk support](#zendesk-support). @@ -197,7 +197,7 @@ For more information, read the guide on [how to add a verified source.](../../wa - Method 2 ([subdomain](#subdomain) + email address + [API token](#grab-zendesk-support-api-token)) - Method 3 ([subdomain](#subdomain) + [OAuth token](#zendesk-support-oauth-token)) - To load data from Zendesk Chat use the following method for authentication: + To load data from Zendesk Chat, use the following method for authentication: - Method 1 ([subdomain](#subdomain) + [OAuth token](#grab-zendesk-chat-oauth-token)) > Note: Use the Zendesk Support OAuth token for configuring Zendesk Support, and for @@ -262,7 +262,7 @@ run. ### Resource `talk_resource` -This function loads data from Zendesk talk endpoint. +This function loads data from the Zendesk Talk endpoint. ```py def talk_resource( @@ -276,14 +276,13 @@ def talk_resource( `zendesk_client`: An instance of ZendeskAPIClient for making API calls to Zendesk Talk. -`talk_endpoint_name`: The name of the talk_endpoint. +`talk_endpoint_name`: The name of the talk endpoint. `talk_endpoint`: The actual URL ending of the endpoint. -`pagination`: Type of pagination type used by endpoint. +`pagination_type`: Type of pagination used by the endpoint. - -Other functions similar to the source `zendesk_talk` and resources similar to `talk_endpoint` are: +Other functions similar to the source `zendesk_talk` and resources similar to `talk_endpoint` are: | Function Name | Type | Description | |---------------------------| --------- |---------------------------------------------------------------------------------------------------| @@ -292,11 +291,12 @@ Other functions similar to the source `zendesk_talk` and resources similar to `t | talk_incremental_resource | resource | Retrieves data incrementally from a Zendesk Talk endpoint. | | zendesk_support | source | Retrieves data from Zendesk Support for tickets, users, brands, organizations, and groups | | ticket_events | resource | Retrieves records of all changes made to a ticket, including state, etc. 
| -| tickets | resource | Retrieves the data for ticket table, the table can be pivoted and cols renamed | +| tickets | resource | Retrieves the data for the ticket table, which can be pivoted and columns renamed | | ticket_metric_events | resource | Retrieves ticket metric events from the start date, defaulting to January 1st of the current year | -| basic_resource | resource | Retrives basic loader for Zenpy endpoints with pagination support | +| basic_resource | resource | Retrieves basic loader for Zenpy endpoints with pagination support | ## Customization + ### Create your own pipeline If you wish to create your own pipelines, you can leverage source and resource methods from this @@ -312,27 +312,27 @@ verified source. ) ``` -1. To load data related to support, talk and chat: +1. To load data related to support, talk, and chat: ```py - #zendesk support source function + # Zendesk support source function data_support = zendesk_support(load_all=True) - # zendesk chat source function + # Zendesk chat source function data_chat = zendesk_chat() - # zendesk talk source function + # Zendesk talk source function data_talk = zendesk_talk() - # run pipeline with all 3 sources - info = pipeline.run([data_support,data_chat,data_talk]) + # Run pipeline with all 3 sources + info = pipeline.run([data_support, data_chat, data_talk]) print(info) ``` -1. To load data related to support, chat and talk in incremental mode: +1. To load data related to support, chat, and talk in incremental mode: ```py pipeline = dlt.pipeline( pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) - dev_mode = False, + dev_mode=False, dataset_name="sample_zendesk_data" # Use a custom name if desired ) data = zendesk_support(load_all=True, start_date=start_date) @@ -342,7 +342,7 @@ verified source. print(info) ``` - > Supports incremental loading for Support, Chat, and Talk Endpoints. By default, it fetches data + > Supports incremental loading for Support, Chat, and Talk endpoints. By default, it fetches data > from the last load time in the dlt state or from 1st Jan 2000 if no prior load. This approach > ensures data retrieval since the specified date, while still updating the last load time. @@ -350,13 +350,13 @@ verified source. for new tickets. ```py - # Load ranges of dates to load between January 1st 2023 and today + # Load ranges of dates between January 1st, 2023, and today min_start_date = pendulum.DateTime(year=2023, month=1, day=1).in_timezone("UTC") max_end_date = pendulum.today() # Generate tuples of date ranges, each with 1 week in between. ranges = make_date_ranges(min_start_date, max_end_date, timedelta(weeks=1)) - # Run the pipeline in a loop for each 1 week range + # Run the pipeline in a loop for each 1-week range for start, end in ranges: print(f"Loading tickets between {start} and {end}") data = zendesk_support(start_date=start, end_date=end).with_resources("tickets") @@ -374,3 +374,4 @@ verified source. > data. This approach can be used with all incremental Zendesk sources. 
+ From 73ea048e672da74c2b1376d744a5a2cb201a83e8 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 1 Oct 2024 19:50:45 +0200 Subject: [PATCH 11/29] Docs: sort core sources in the sidebar by usage (#1898) --- docs/website/sidebars.js | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 32bb554842..d63684d3fc 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -64,19 +64,6 @@ const sidebars = { id: 'dlt-ecosystem/verified-sources/index', }, items: [ - { - type: 'category', - label: 'Filesystem & cloud storage', - description: 'AWS S3, Google Cloud Storage, Azure, SFTP, local file system', - link: { - type: 'doc', - id: 'dlt-ecosystem/verified-sources/filesystem/index', - }, - items: [ - 'dlt-ecosystem/verified-sources/filesystem/basic', - 'dlt-ecosystem/verified-sources/filesystem/advanced', - ] - }, { type: 'category', label: 'REST APIs', @@ -118,6 +105,19 @@ const sidebars = { 'dlt-ecosystem/verified-sources/sql_database/advanced', ] }, + { + type: 'category', + label: 'Filesystem & cloud storage', + description: 'AWS S3, Google Cloud Storage, Azure, SFTP, local file system', + link: { + type: 'doc', + id: 'dlt-ecosystem/verified-sources/filesystem/index', + }, + items: [ + 'dlt-ecosystem/verified-sources/filesystem/basic', + 'dlt-ecosystem/verified-sources/filesystem/advanced', + ] + }, 'dlt-ecosystem/verified-sources/airtable', 'dlt-ecosystem/verified-sources/amazon_kinesis', 'dlt-ecosystem/verified-sources/arrow-pandas', From 7f75a0f3fe5ac6f792f3e63d90190eccd12a8bba Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Wed, 2 Oct 2024 00:40:07 +0530 Subject: [PATCH 12/29] Added troubleshooting section to filesystem docs (#1900) Co-authored-by: Anton Burnashev --- .../dlt-ecosystem/destinations/filesystem.md | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 2be382c326..3e562dfb84 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -700,6 +700,31 @@ This destination fully supports [dlt state sync](../../general-usage/state#synci You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. -**Note:** When a load generates a new state, for example when using incremental loads, a new state file appears in the `_dlt_pipeline_state` folder at the destination. To prevent data accumulation, state cleanup mechanisms automatically remove old state files, retaining only the latest 100 by default. This cleanup process can be customized or disabled using the filesystem configuration `max_state_files`, which determines the maximum number of pipeline state files to retain (default is 100). Setting this value to 0 or a negative number disables the cleanup of old states. +:::note +When a load generates a new state, for example when using incremental loads, a new state file appears in the `_dlt_pipeline_state` folder at the destination. 
To prevent data accumulation, state cleanup mechanisms automatically remove old state files, retaining only the latest 100 by default. This cleanup process can be customized or disabled using the filesystem configuration `max_state_files`, which determines the maximum number of pipeline state files to retain (default is 100). Setting this value to 0 or a negative number disables the cleanup of old states. +::: + +## Troubleshooting +### File Name Too Long Error +When running your pipeline, you might encounter an error like `[Errno 36] File name too long Error`. This error occurs because the generated file name exceeds the maximum allowed length on your filesystem. + +To prevent the file name length error, set the `max_identifier_length` parameter for your destination. This truncates all identifiers (including filenames) to a specified maximum length. +For example: + +```py +from dlt.destinations import duckdb + +pipeline = dlt.pipeline( + pipeline_name="your_pipeline_name", + destination=duckdb( + max_identifier_length=200, # Adjust the length as needed + ), +) +``` + +:::note +- `max_identifier_length` truncates all identifiers (tables, columns). Ensure the length maintains uniqueness to avoid collisions. +- Adjust `max_identifier_length` based on your data structure and filesystem limits. +::: From 90fc2aa8d02dfabea8a8c816dbae3d99842b6e0c Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 1 Oct 2024 21:35:50 +0200 Subject: [PATCH 13/29] Fix a typo in credentials/advanced.md (#1912) --- docs/website/docs/general-usage/credentials/advanced.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/general-usage/credentials/advanced.md b/docs/website/docs/general-usage/credentials/advanced.md index c25030a154..ad1adaa8f2 100644 --- a/docs/website/docs/general-usage/credentials/advanced.md +++ b/docs/website/docs/general-usage/credentials/advanced.md @@ -142,8 +142,8 @@ data_source = google_sheets( data_source.run(destination="bigquery") ``` -`dlt.config` and `dlt.secrets` behave like dictionaries from which you can request a value with any key name. `dlt` will look in all [config providers](setup) - env variables, TOML files, etc. to create these dictionaries. You can also use `dlt.config.get()` or `dlt.secrets.get()` to -request a value cast to a desired type. For example: +`dlt.config` and `dlt.secrets` behave like dictionaries from which you can request a value with any key name. `dlt` will look in all [config providers](setup) - environment variables, TOML files, etc. to create these dictionaries. You can also use `dlt.config.get()` or `dlt.secrets.get()` to +request a value and cast it to a desired type. 
For example: ```py credentials = dlt.secrets.get("my_section.gcp_credentials", GcpServiceAccountCredentials) From a76a06d9a8032e4bcb64739ce0384b35dbab0977 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 1 Oct 2024 21:36:49 +0200 Subject: [PATCH 14/29] Remove code markup in credentials/advanced.md (#1911) --- docs/website/docs/general-usage/credentials/advanced.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/credentials/advanced.md b/docs/website/docs/general-usage/credentials/advanced.md index ad1adaa8f2..f4b2ad0d11 100644 --- a/docs/website/docs/general-usage/credentials/advanced.md +++ b/docs/website/docs/general-usage/credentials/advanced.md @@ -8,7 +8,7 @@ keywords: [credentials, secrets.toml, secrets, config, configuration, environmen ## Injection mechanism -`dlt` has a special treatment for functions decorated with `@dlt.source`, `@dlt.resource`, and `@dlt.destination`. When such a function is called, `dlt` takes the argument names in the signature and supplies (`injects`) the required values by looking for them in [various config providers](setup). +`dlt` has a special treatment for functions decorated with `@dlt.source`, `@dlt.resource`, and `@dlt.destination`. When such a function is called, `dlt` takes the argument names in the signature and supplies (injects) the required values by looking for them in [various config providers](setup). ### Injection rules From c312fb411fa34bbb0f8ed140141dcc1af8ecb71e Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 2 Oct 2024 12:13:59 +0200 Subject: [PATCH 15/29] docs: grammar fix pages 100 - 120 (#1908) * grammar fix docs pages 100 to 120 * Apply suggestions from code review Co-authored-by: Alena Astrakhantseva * Update docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md Co-authored-by: Alena Astrakhantseva * Update docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md Co-authored-by: Alena Astrakhantseva --------- Co-authored-by: Alena Astrakhantseva Co-authored-by: Anton Burnashev --- .../verified-sources/filesystem/advanced.md | 35 ++-- .../verified-sources/filesystem/basic.md | 63 ++++---- .../verified-sources/filesystem/index.md | 5 +- .../verified-sources/rest_api/advanced.md | 11 +- .../verified-sources/rest_api/basic.md | 60 +++---- .../verified-sources/rest_api/index.md | 5 +- .../verified-sources/sql_database/advanced.md | 56 +++---- .../sql_database/configuration.md | 64 ++++---- .../visualizations/exploring-the-data.md | 59 +++---- docs/website/docs/examples/index.md | 7 +- .../docs/reference/command-line-interface.md | 105 ++++++------ .../reference/frequently-asked-questions.md | 10 +- docs/website/docs/reference/installation.md | 11 +- docs/website/docs/reference/performance.md | 134 +++++++-------- docs/website/docs/reference/telemetry.md | 45 ++---- docs/website/docs/reference/tracing.md | 4 +- .../docs/running-in-production/alerting.md | 12 +- .../docs/running-in-production/monitoring.md | 21 +-- .../docs/running-in-production/running.md | 153 ++++++++---------- .../docs/running-in-production/tracing.md | 15 +- 20 files changed, 418 insertions(+), 457 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md index be08e9ff44..e1eeca0ee9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/advanced.md @@ -32,10 +32,10 @@ The 
filesystem ensures consistent file representation across bucket types and of #### `FileItem` fields -- `file_url` - complete URL of the file (e.g. `s3://bucket-name/path/file`). This field serves as a primary key. +- `file_url` - complete URL of the file (e.g., `s3://bucket-name/path/file`). This field serves as a primary key. - `file_name` - name of the file from the bucket URL. - `relative_path` - set when doing `glob`, is a relative path to a `bucket_url` argument. -- `mime_type` - file's mime type. It is sourced from the bucket provider or inferred from its extension. +- `mime_type` - file's MIME type. It is sourced from the bucket provider or inferred from its extension. - `modification_date` - file's last modification time (format: `pendulum.DateTime`). - `size_in_bytes` - file size. - `file_content` - content, provided upon request. @@ -90,7 +90,7 @@ example_xls = filesystem( bucket_url=BUCKET_URL, file_glob="../directory/example.xlsx" ) | read_excel("example_table") # Pass the data through the transformer to read the "example_table" sheet. -pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb", dataset_name="example_xls_data",) +pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb", dataset_name="example_xls_data") # Execute the pipeline and load the extracted data into the "duckdb" destination. load_info = pipeline.run(example_xls.with_name("example_xls_data")) # Print the loading information. @@ -119,7 +119,7 @@ def read_xml(items: Iterator[FileItemDict]) -> Iterator[TDataItems]: for file_obj in items: # Open the file object. with file_obj.open() as file: - # Parse the file to dict records + # Parse the file to dict records. yield xmltodict.parse(file.read()) # Set up the pipeline to fetch a specific XML file from a filesystem (bucket). @@ -143,14 +143,14 @@ You can get an fsspec client from the filesystem resource after it was extracted from dlt.sources.filesystem import filesystem, read_csv from dlt.sources.filesystem.helpers import fsspec_from_resource -# get filesystem source +# Get filesystem source. gs_resource = filesystem("gs://ci-test-bucket/") -# extract files +# Extract files. pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") pipeline.run(gs_resource | read_csv()) -# get fs client +# Get fs client. fs_client = fsspec_from_resource(gs_resource) -# do any operation +# Do any operation. 
fs_client.ls("ci-test-bucket/standard_source/samples") ``` @@ -166,31 +166,32 @@ from dlt.common.storages.fsspec_filesystem import FileItemDict from dlt.sources.filesystem import filesystem def _copy(item: FileItemDict) -> FileItemDict: - # instantiate fsspec and copy file + # Instantiate fsspec and copy file dest_file = os.path.join(local_folder, item["file_name"]) - # create dest folder + # Create destination folder os.makedirs(os.path.dirname(dest_file), exist_ok=True) - # download file + # Download file item.fsspec.download(item["file_url"], dest_file) - # return file item unchanged + # Return file item unchanged return item BUCKET_URL = "gs://ci-test-bucket/" -# use recursive glob pattern and add file copy step +# Use recursive glob pattern and add file copy step downloader = filesystem(BUCKET_URL, file_glob="**").add_map(_copy) -# NOTE: you do not need to load any data to execute extract, below we obtain +# NOTE: You do not need to load any data to execute extract; below, we obtain # a list of files in a bucket and also copy them locally listing = list(downloader) print(listing) -# download to table "listing" +# Download to table "listing" pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") load_info = pipeline.run( downloader.with_name("listing"), write_disposition="replace" ) -# pretty print the information on data that was loaded +# Pretty print the information on data that was loaded print(load_info) print(listing) print(pipeline.last_trace.last_normalize_info) -``` \ No newline at end of file +``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md index 6eb02b4edf..5ae7de82da 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md @@ -10,7 +10,7 @@ Filesystem source allows loading files from remote locations (AWS S3, Google Clo To load unstructured data (`.pdf`, `.txt`, e-mail), please refer to the [unstructured data source](https://github.com/dlt-hub/verified-sources/tree/master/sources/unstructured_data). -## How Filesystem source works? +## How filesystem source works The Filesystem source doesn't just give you an easy way to load data from both remote and local files — it also comes with a powerful set of tools that let you customize the loading process to fit your specific needs. @@ -54,7 +54,7 @@ To get started with your data pipeline, follow these steps: dlt init filesystem duckdb ``` - [dlt init command](../../../reference/command-line-interface) will initialize + The [dlt init command](../../../reference/command-line-interface) will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/filesystem_pipeline.py) with the filesystem as the source and [duckdb](../../destinations/duckdb.md) as the destination. @@ -66,6 +66,8 @@ To get started with your data pipeline, follow these steps: ## Configuration + + ### Get credentials @@ -145,7 +147,7 @@ You don't need any credentials for the local filesystem. To provide credentials to the filesystem source, you can use [any method available](../../../general-usage/credentials/setup#available-config-providers) in `dlt`. One of the easiest ways is to use configuration files. The `.dlt` folder in your working directory -contains two files: `config.toml` and `secrets.toml`. Sensitive information, like passwords and +contains two files: `config.toml` and `secrets.toml`. 
Sensitive information, like passwords and access tokens, should only be put into `secrets.toml`, while any other configuration, like the path to a bucket, can be specified in `config.toml`. @@ -212,7 +214,7 @@ bucket_url="gs:////" Learn how to set up SFTP credentials for each authentication method in the [SFTP section](../../destinations/filesystem#sftp). -For example, in case of key-based authentication, you can configure the source the following way: +For example, in the case of key-based authentication, you can configure the source the following way: ```toml # secrets.toml @@ -229,7 +231,7 @@ bucket_url = "sftp://[hostname]/[path]" -You can use both native local filesystem paths and `file://` URI. Absolute, relative, and UNC Windows paths are supported. +You can use both native local filesystem paths and the `file://` URI. Absolute, relative, and UNC Windows paths are supported. You could provide an absolute filepath: @@ -239,7 +241,7 @@ You could provide an absolute filepath: bucket_url='file://Users/admin/Documents/csv_files' ``` -Or skip the schema and provide the local path in a format native for your operating system. For example, for Windows: +Or skip the schema and provide the local path in a format native to your operating system. For example, for Windows: ```toml [sources.filesystem] @@ -250,7 +252,7 @@ bucket_url='~\Documents\csv_files\' -You can also specify the credentials using Environment variables. The name of the corresponding environment +You can also specify the credentials using environment variables. The name of the corresponding environment variable should be slightly different from the corresponding name in the `toml` file. Simply replace dots `.` with double underscores `__`: @@ -260,7 +262,7 @@ export SOURCES__FILESYSTEM__AWS_SECRET_ACCESS_KEY = "Please set me up!" ``` :::tip -`dlt` supports more ways of authorizing with the cloud storage, including identity-based +`dlt` supports more ways of authorizing with cloud storage, including identity-based and default credentials. To learn more about adding credentials to your pipeline, please refer to the [Configuration and secrets section](../../../general-usage/credentials/complex_types#gcp-credentials). ::: @@ -310,7 +312,7 @@ or taken from the config: Full list of `filesystem` resource parameters: * `bucket_url` - full URL of the bucket (could be a relative path in the case of the local filesystem). -* `credentials` - cloud storage credentials of `AbstractFilesystem` instance (should be empty for the local filesystem). We recommend not to specify this parameter in the code, but put it in secrets file instead. +* `credentials` - cloud storage credentials of `AbstractFilesystem` instance (should be empty for the local filesystem). We recommend not specifying this parameter in the code, but putting it in a secrets file instead. * `file_glob` - file filter in glob format. Defaults to listing all non-recursive files in the bucket URL. * `files_per_page` - number of files processed at once. The default value is `100`. * `extract_content` - if true, the content of the file will be read and returned in the resource. The default value is `False`. @@ -332,15 +334,15 @@ filesystem_pipe = filesystem( #### Available transformers -- `read_csv()` - process `csv` files using `pandas` -- `read_jsonl()` - process `jsonl` files chuck by chunk -- `read_parquet()` - process `parquet` files using `pyarrow` -- `read_csv_duckdb()` - this transformer process `csv` files using DuckDB, which usually shows better performance, than `pandas`. 
+- `read_csv()` - processes `csv` files using `pandas` +- `read_jsonl()` - processes `jsonl` files chunk by chunk +- `read_parquet()` - processes `parquet` files using `pyarrow` +- `read_csv_duckdb()` - this transformer processes `csv` files using DuckDB, which usually shows better performance than `pandas`. :::tip We advise that you give each resource a [specific name](../../../general-usage/resource#duplicate-and-rename-resources) -before loading with `pipeline.run`. This will make sure that data goes to a table with the name you +before loading with `pipeline.run`. This will ensure that data goes to a table with the name you want and that each pipeline uses a [separate state for incremental loading.](../../../general-usage/state#read-and-write-pipeline-state-in-a-resource) ::: @@ -366,7 +368,7 @@ import dlt from dlt.sources.filesystem import filesystem, read_csv filesystem_pipe = filesystem(bucket_url="file://Users/admin/Documents/csv_files", file_glob="*.csv") | read_csv() -# tell dlt to merge on date +# Tell dlt to merge on date filesystem_pipe.apply_hints(write_disposition="merge", merge_key="date") # We load the data into the table_name table @@ -380,19 +382,19 @@ print(load_info) Here are a few simple ways to load your data incrementally: 1. [Load files based on modification date](#load-files-based-on-modification-date). Only load files that have been updated since the last time `dlt` processed them. `dlt` checks the files' metadata (like the modification date) and skips those that haven't changed. -2. [Load new records based on a specific column](#load-new-records-based-on-a-specific-column). You can load only the new or updated records by looking at a specific column, like `updated_at`. Unlike the first method, this approach would read all files every time and then filter the records which was updated. -3. [Combine loading only updated files and records](#combine-loading-only-updated-files-and-records). Finally, you can combine both methods. It could be useful if new records could be added to existing files, so you not only want to filter the modified files, but modified records as well. +2. [Load new records based on a specific column](#load-new-records-based-on-a-specific-column). You can load only the new or updated records by looking at a specific column, like `updated_at`. Unlike the first method, this approach would read all files every time and then filter the records which were updated. +3. [Combine loading only updated files and records](#combine-loading-only-updated-files-and-records). Finally, you can combine both methods. It could be useful if new records could be added to existing files, so you not only want to filter the modified files, but also the modified records. #### Load files based on modification date -For example, to load only new CSV files with [incremental loading](../../../general-usage/incremental-loading) you can use `apply_hints` method. +For example, to load only new CSV files with [incremental loading](../../../general-usage/incremental-loading), you can use the `apply_hints` method. 
```py import dlt from dlt.sources.filesystem import filesystem, read_csv -# This configuration will only consider new csv files +# This configuration will only consider new CSV files new_files = filesystem(bucket_url="s3://bucket_name", file_glob="directory/*.csv") -# add incremental on modification time +# Add incremental on modification time new_files.apply_hints(incremental=dlt.sources.incremental("modification_date")) pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") @@ -402,13 +404,13 @@ print(load_info) #### Load new records based on a specific column -In this example we load only new records based on the field called `updated_at`. This method may be useful if you are not able to -filter files by modification date because for example, all files are modified each time new record is appeared. +In this example, we load only new records based on the field called `updated_at`. This method may be useful if you are not able to +filter files by modification date because, for example, all files are modified each time a new record appears. ```py import dlt from dlt.sources.filesystem import filesystem, read_csv -# We consider all csv files +# We consider all CSV files all_files = filesystem(bucket_url="s3://bucket_name", file_glob="directory/*.csv") # But filter out only updated records @@ -425,11 +427,11 @@ print(load_info) import dlt from dlt.sources.filesystem import filesystem, read_csv -# This configuration will only consider modified csv files +# This configuration will only consider modified CSV files new_files = filesystem(bucket_url="s3://bucket_name", file_glob="directory/*.csv") new_files.apply_hints(incremental=dlt.sources.incremental("modification_date")) -# And in each modified file we filter out only updated records +# And in each modified file, we filter out only updated records filesystem_pipe = (new_files | read_csv()) filesystem_pipe.apply_hints(incremental=dlt.sources.incremental("updated_at")) pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb") @@ -459,7 +461,7 @@ print(load_info) ``` :::tip -You could also use `file_glob` to filter files by names. It works very well in simple cases, for example, filtering by extention: +You could also use `file_glob` to filter files by names. It works very well in simple cases, for example, filtering by extension: ```py from dlt.sources.filesystem import filesystem @@ -493,8 +495,8 @@ print(load_info) Windows supports paths up to 255 characters. When you access a path longer than 255 characters, you'll see a `FileNotFound` exception. - To go over this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry). - **Note that Python glob does not work with extended UNC paths**, so you will not be able to use them +To go over this limit, you can use [extended paths](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry). +**Note that Python glob does not work with extended UNC paths**, so you will not be able to use them ```toml [sources.filesystem] @@ -514,4 +516,5 @@ function to configure the resource correctly. Use `**` to include recursive file filesystem supports full Python [glob](https://docs.python.org/3/library/glob.html#glob.glob) functionality, while cloud storage supports a restricted `fsspec` [version](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.glob). 
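As an illustration of the recursive glob mentioned above, here is a minimal sketch that combines `**` with one of the built-in transformers. The bucket and folder names are placeholders, not values from this guide:

```py
import dlt
from dlt.sources.filesystem import filesystem, read_parquet

# `**` makes the glob recursive; the bucket and folder names below are hypothetical
recursive_files = filesystem(
    bucket_url="s3://bucket_name/archive",
    file_glob="**/*.parquet",
) | read_parquet()

pipeline = dlt.pipeline(pipeline_name="my_pipeline", destination="duckdb")
load_info = pipeline.run(recursive_files.with_name("archive_files"))
print(load_info)
```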
- \ No newline at end of file + + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md index 1441931340..0aaa07b0c3 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/index.md @@ -12,8 +12,9 @@ The Filesystem source allows seamless loading of files from the following locati * remote filesystem (via SFTP) * local filesystem -The Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured files. +The Filesystem source natively supports `csv`, `parquet`, and `jsonl` files and allows customization for loading any type of structured file. import DocCardList from '@theme/DocCardList'; - \ No newline at end of file + + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md index 27d2cc0b6e..26add81def 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/advanced.md @@ -9,15 +9,15 @@ keywords: [rest api, restful api] - `config`: The REST API configuration dictionary. - `name`: An optional name for the source. - `section`: An optional section name in the configuration file. -- `max_table_nesting`: Sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. -- `root_key` (bool): Enables merging on all resources by propagating root foreign key to nested tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. +- `max_table_nesting`: Sets the maximum depth of nested tables above which the remaining nodes are loaded as structs or JSON. +- `root_key` (bool): Enables merging on all resources by propagating the root foreign key to nested tables. This option is most useful if you plan to change the write disposition of a resource to disable/enable merge. Defaults to False. - `schema_contract`: Schema contract settings that will be applied to this resource. - `spec`: A specification of configuration and secret values required by the source. ### Response actions The `response_actions` field in the endpoint configuration allows you to specify how to handle specific responses or all responses from the API. For example, responses with specific status codes or content substrings can be ignored. -Additionally, all responses or only responses with specific status codes or content substrings can be transformed with a custom callable, such as a function. This callable is passed on to the requests library as a [response hook](https://requests.readthedocs.io/en/latest/user/advanced/#event-hooks). The callable can modify the response object and has to return it for the modifications to take effect. +Additionally, all responses or only responses with specific status codes or content substrings can be transformed with a custom callable, such as a function. This callable is passed on to the requests library as a [response hook](https://requests.readthedocs.io/en/latest/user/advanced/#event-hooks). The callable can modify the response object and must return it for the modifications to take effect. :::caution Experimental Feature This is an experimental feature and may change in future releases. 
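For instance, ignoring responses that signal a missing or empty resource could be sketched as follows; the endpoint path and the matching rules are illustrative, not taken from a specific API:

```py
endpoint_config = {
    "path": "issues",
    "response_actions": [
        # Skip responses by status code or by a substring in the response body
        {"status_code": 404, "action": "ignore"},
        {"content": "Not found", "action": "ignore"},
    ],
}
```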
@@ -55,7 +55,7 @@ from requests.models import Response from dlt.common import json def set_encoding(response, *args, **kwargs): - # sets the encoding in case it's not correctly detected + # Sets the encoding in case it's not correctly detected response.encoding = 'windows-1252' return response @@ -99,7 +99,7 @@ In this example, the resource will set the correct encoding for all responses fi ```py def set_encoding(response, *args, **kwargs): - # sets the encoding in case it's not correctly detected + # Sets the encoding in case it's not correctly detected response.encoding = 'windows-1252' return response @@ -122,3 +122,4 @@ source_config = { ``` In this example, the resource will set the correct encoding for all responses. More callables can be added to the list of response_actions. + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md index 121769a11a..03214950f4 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md @@ -62,7 +62,7 @@ pipeline = dlt.pipeline( load_info = pipeline.run(source) ``` -Running this pipeline will create two tables in the DuckDB: `posts` and `comments` with the data from the respective API endpoints. The `comments` resource will fetch comments for each post by using the `id` field from the `posts` resource. +Running this pipeline will create two tables in DuckDB: `posts` and `comments` with the data from the respective API endpoints. The `comments` resource will fetch comments for each post by using the `id` field from the `posts` resource. ## Setup @@ -132,9 +132,11 @@ github_token = "your_github_token" ## Source configuration + + ### Quick example -Let's take a look at the GitHub example in `rest_api_pipeline.py` file: +Let's take a look at the GitHub example in the `rest_api_pipeline.py` file: ```py from dlt.sources.rest_api import RESTAPIConfig, rest_api_resources @@ -206,14 +208,14 @@ def load_github() -> None: The declarative resource configuration is defined in the `config` dictionary. It contains the following key components: -1. `client`: Defines the base URL and authentication method for the API. In this case it uses token-based authentication. The token is stored in the `secrets.toml` file. +1. `client`: Defines the base URL and authentication method for the API. In this case, it uses token-based authentication. The token is stored in the `secrets.toml` file. 2. `resource_defaults`: Contains default settings for all [resources](#resource-configuration). In this example, we define that all resources: - Have `id` as the [primary key](../../../general-usage/resource#define-schema) - Use the `merge` [write disposition](../../../general-usage/incremental-loading#choosing-a-write-disposition) to merge the data with the existing data in the destination. - - Send a `per_page` query parameter with each request to 100 to get more results per page. + - Send a `per_page=100` query parameter with each request to get more results per page. -3. `resources`: A list of [resources](#resource-configuration) to be loaded. Here, we have two resources: `issues` and `issue_comments`, which correspond to the GitHub API endpoints for [repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and [issue comments](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments). 
Note that we need a in issue number to fetch comments for each issue. This number is taken from the `issues` resource. More on this in the [resource relationships](#define-resource-relationships) section. +3. `resources`: A list of [resources](#resource-configuration) to be loaded. Here, we have two resources: `issues` and `issue_comments`, which correspond to the GitHub API endpoints for [repository issues](https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues) and [issue comments](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#list-issue-comments). Note that we need an issue number to fetch comments for each issue. This number is taken from the `issues` resource. More on this in the [resource relationships](#define-resource-relationships) section. Let's break down the configuration in more detail. @@ -227,7 +229,6 @@ from dlt.sources.rest_api import RESTAPIConfig ``` ::: - The configuration object passed to the REST API Generic Source has three main elements: ```py @@ -297,7 +298,7 @@ Both `resource1` and `resource2` will have the `per_page` parameter set to 100. This is a list of resource configurations that define the API endpoints to be loaded. Each resource configuration can be: - a dictionary with the [resource configuration](#resource-configuration). -- a string. In this case, the string is used as the both as the endpoint path and the resource name, and the resource configuration is taken from the `resource_defaults` configuration if it exists. +- a string. In this case, the string is used as both the endpoint path and the resource name, and the resource configuration is taken from the `resource_defaults` configuration if it exists. ### Resource configuration @@ -337,7 +338,7 @@ The endpoint configuration defines how to query the API endpoint. Quick example: The fields in the endpoint configuration are: - `path`: The path to the API endpoint. -- `method`: The HTTP method to be used. Default is `GET`. +- `method`: The HTTP method to be used. The default is `GET`. - `params`: Query parameters to be sent with each request. For example, `sort` to order the results or `since` to specify [incremental loading](#incremental-loading). This is also used to define [resource relationships](#define-resource-relationships). - `json`: The JSON payload to be sent with the request (for POST and PUT requests). - `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. @@ -398,7 +399,7 @@ from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator ``` :::note -Currently pagination is supported only for GET requests. To handle POST requests with pagination, you need to implement a [custom paginator](../../../general-usage/http/rest-client.md#custom-paginator). +Currently, pagination is supported only for GET requests. To handle POST requests with pagination, you need to implement a [custom paginator](../../../general-usage/http/rest-client.md#custom-paginator). ::: These are the available paginators: @@ -407,9 +408,9 @@ These are the available paginators: | ------------ | -------------- | ----------- | | `json_link` | [JSONLinkPaginator](../../../general-usage/http/rest-client.md#jsonresponsepaginator) | The link to the next page is in the body (JSON) of the response.
*Parameters:*
  • `next_url_path` (str) - the JSONPath to the next page URL
| | `header_link` | [HeaderLinkPaginator](../../../general-usage/http/rest-client.md#headerlinkpaginator) | The links to the next page are in the response headers.
*Parameters:*
  • `links_next_key` (str) - the name of the header containing the links. Default is "next".
| -| `offset` | [OffsetPaginator](../../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter. With total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| -| `page_number` | [PageNumberPaginator](../../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter. With total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| -| `cursor` | [JSONResponseCursorPaginator](../../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | The pagination is based on a cursor parameter. The value of the cursor is in the response body (JSON).
*Parameters:*
  • `cursor_path` (str) - the JSONPath to the cursor value. Defaults to "cursors.next"
  • `cursor_param` (str) - the query parameter name for the cursor. Defaults to "after"
| +| `offset` | [OffsetPaginator](../../../general-usage/http/rest-client.md#offsetpaginator) | The pagination is based on an offset parameter, with the total items count either in the response body or explicitly provided.
*Parameters:*
  • `limit` (int) - the maximum number of items to retrieve in each request
  • `offset` (int) - the initial offset for the first request. Defaults to `0`
  • `offset_param` (str) - the name of the query parameter used to specify the offset. Defaults to "offset"
  • `limit_param` (str) - the name of the query parameter used to specify the limit. Defaults to "limit"
  • `total_path` (str) - a JSONPath expression for the total number of items. If not provided, pagination is controlled by `maximum_offset` and `stop_after_empty_page`
  • `maximum_offset` (int) - optional maximum offset value. Limits pagination even without total count
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| +| `page_number` | [PageNumberPaginator](../../../general-usage/http/rest-client.md#pagenumberpaginator) | The pagination is based on a page number parameter, with the total pages count either in the response body or explicitly provided.
*Parameters:*
  • `base_page` (int) - the starting page number. Defaults to `0`
  • `page_param` (str) - the query parameter name for the page number. Defaults to "page"
  • `total_path` (str) - a JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page` and `stop_after_empty_page`
  • `maximum_page` (int) - optional maximum page number. Stops pagination once this page is reached
  • `stop_after_empty_page` (bool) - Whether pagination should stop when a page contains no result items. Defaults to `True`
| +| `cursor` | [JSONResponseCursorPaginator](../../../general-usage/http/rest-client.md#jsonresponsecursorpaginator) | The pagination is based on a cursor parameter, with the value of the cursor in the response body (JSON).
*Parameters:*
  • `cursor_path` (str) - the JSONPath to the cursor value. Defaults to "cursors.next"
  • `cursor_param` (str) - the query parameter name for the cursor. Defaults to "after"
| | `single_page` | SinglePagePaginator | The response will be interpreted as a single-page response, ignoring possible pagination metadata. | | `auto` | `None` | Explicitly specify that the source should automatically detect the pagination method. | @@ -431,7 +432,7 @@ rest_api.config_setup.register_paginator("custom_paginator", CustomPaginator) ### Data selection -The `data_selector` field in the endpoint configuration allows you to specify a JSONPath to select the data from the response. By default, the source will try to detect locations of the data automatically. +The `data_selector` field in the endpoint configuration allows you to specify a JSONPath to select the data from the response. By default, the source will try to detect the locations of the data automatically. Use this field when you need to specify the location of the data in the response explicitly. @@ -481,7 +482,6 @@ You can use the following endpoint configuration: Read more about [JSONPath syntax](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) to learn how to write selectors. - ### Authentication For APIs that require authentication to access their endpoints, the REST API source supports various authentication methods, including token-based authentication, query parameters, basic authentication, and custom authentication. The authentication configuration is specified in the `auth` field of the [client](#client) either as a dictionary or as an instance of the [authentication class](../../../general-usage/http/rest-client.md#authentication). @@ -510,7 +510,7 @@ Available authentication types: | Authentication class | String Alias (`type`) | Description | | ------------------- | ----------- | ----------- | -| [BearTokenAuth](../../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | +| [BearerTokenAuth](../../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | | [HTTPBasicAuth](../../../general-usage/http/rest-client.md#http-basic-authentication) | `http_basic` | Basic HTTP authentication. | | [APIKeyAuth](../../../general-usage/http/rest-client.md#api-key-authentication) | `api_key` | API key authentication with key defined in the query parameters or in the headers. | | [OAuth2ClientCredentials](../../../general-usage/http/rest-client.md#oauth20-authorization) | N/A | OAuth 2.0 authorization with a temporary access token obtained from the authorization server. | @@ -537,7 +537,7 @@ from dlt.sources.helpers.rest_client.auth import BearerTokenAuth config = { "client": { - "auth": BearTokenAuth(dlt.secrets["your_api_token"]), + "auth": BearerTokenAuth(dlt.secrets["your_api_token"]), }, # ... } @@ -551,7 +551,7 @@ Available authentication types: | `type` | Authentication class | Description | | ----------- | ------------------- | ----------- | -| `bearer` | [BearTokenAuth](../../../general-usage/http/rest-client.md#bearer-token-authentication) | Bearer token authentication.
Parameters:
  • `token` (str)
| +| `bearer` | [BearerTokenAuth](../../../general-usage/http/rest-client.md#bearer-token-authentication) | Bearer token authentication.
Parameters:
  • `token` (str)
| | `http_basic` | [HTTPBasicAuth](../../../general-usage/http/rest-client.md#http-basic-authentication) | Basic HTTP authentication.
Parameters:
  • `username` (str)
  • `password` (str)
| | `api_key` | [APIKeyAuth](../../../general-usage/http/rest-client.md#api-key-authentication) | API key authentication with key defined in the query parameters or in the headers.
Parameters:
  • `name` (str) - the name of the query parameter or header
  • `api_key` (str) - the API key value
  • `location` (str, optional) - the location of the API key in the request. Can be `query` or `header`. Default is `header`
| @@ -572,10 +572,9 @@ rest_api.config_setup.register_auth("custom_auth", CustomAuth) } ``` - ### Define resource relationships -When you have a resource that depends on another resource, you can define the relationship using the `resolve` configuration. With it you link a path parameter in the child resource to a field in the parent resource's data. +When you have a resource that depends on another resource, you can define the relationship using the `resolve` configuration. With it, you link a path parameter in the child resource to a field in the parent resource's data. In the GitHub example, the `issue_comments` resource depends on the `issues` resource. The `issue_number` parameter in the `issue_comments` endpoint configuration is resolved from the `number` field of the `issues` resource: @@ -653,7 +652,7 @@ You can include data from the parent resource in the child resource by using the } ``` -This will include the `id`, `title`, and `created_at` fields from the `issues` resource in the `issue_comments` resource data. The name of the included fields will be prefixed with the parent resource name and an underscore (`_`) like so: `_issues_id`, `_issues_title`, `_issues_created_at`. +This will include the `id`, `title`, and `created_at` fields from the `issues` resource in the `issue_comments` resource data. The names of the included fields will be prefixed with the parent resource name and an underscore (`_`) like so: `_issues_id`, `_issues_title`, `_issues_created_at`. ### Define a resource which is not a REST endpoint @@ -661,7 +660,7 @@ Sometimes, we want to request endpoints with specific values that are not return Thus, you can also include arbitrary dlt resources in your `RESTAPIConfig` instead of defining a resource for every path! In the following example, we want to load the issues belonging to three repositories. -Instead of defining now three different issues resources, one for each of the paths `dlt-hub/dlt/issues/`, `dlt-hub/verified-sources/issues/`, `dlt-hub/dlthub-education/issues/`, we have a resource `repositories` which yields a list of repository names which will be fetched by the dependent resource `issues`. +Instead of defining three different issues resources, one for each of the paths `dlt-hub/dlt/issues/`, `dlt-hub/verified-sources/issues/`, `dlt-hub/dlthub-education/issues/`, we have a resource `repositories` which yields a list of repository names that will be fetched by the dependent resource `issues`. ```py from dlt.sources.rest_api import RESTAPIConfig @@ -830,7 +829,7 @@ For example, if we query the endpoint with `https://api.example.com/posts?create } ``` -To enable the incremental loading for this endpoint, you can use the following endpoint configuration: +To enable incremental loading for this endpoint, you can use the following endpoint configuration: ```py { @@ -851,7 +850,7 @@ So in our case, the next request will be made to `https://api.example.com/posts? Let's break down the configuration. -1. We explicitly set `data_selector` to `"results"` to select the list of posts from the response. This is optional, if not set, dlt will try to auto-detect the data location. +1. We explicitly set `data_selector` to `"results"` to select the list of posts from the response. This is optional; if not set, dlt will try to auto-detect the data location. 2. We define the `created_since` parameter as an incremental parameter with the following fields: ```py @@ -865,7 +864,7 @@ Let's break down the configuration. ``` - `type`: The type of the parameter definition. 
In this case, it must be set to `incremental`. -- `cursor_path`: The JSONPath to the field within each item in the list. The value of this field will be used in the next request. In the example above our items look like `{"id": 1, "title": "Post 1", "created_at": "2024-01-26"}` so to track the created time we set `cursor_path` to `"created_at"`. Note that the JSONPath starts from the root of the item (dict) and not from the root of the response. +- `cursor_path`: The JSONPath to the field within each item in the list. The value of this field will be used in the next request. In the example above, our items look like `{"id": 1, "title": "Post 1", "created_at": "2024-01-26"}` so to track the created time, we set `cursor_path` to `"created_at"`. Note that the JSONPath starts from the root of the item (dict) and not from the root of the response. - `initial_value`: The initial value for the cursor. This is the value that will initialize the state of incremental loading. In this case, it's `2024-01-25`. The value type should match the type of the field in the data item. ### Incremental loading using the `incremental` field @@ -906,7 +905,7 @@ The full available configuration for the `incremental` field is: The fields are: - `start_param` (str): The name of the query parameter to be used as the start condition. If we use the example above, it would be `"created_since"`. -- `end_param` (str): The name of the query parameter to be used as the end condition. This is optional and can be omitted if you only need to track the start condition. This is useful when you need to fetch data within a specific range and the API supports end conditions (like `created_before` query parameter). +- `end_param` (str): The name of the query parameter to be used as the end condition. This is optional and can be omitted if you only need to track the start condition. This is useful when you need to fetch data within a specific range and the API supports end conditions (like the `created_before` query parameter). - `cursor_path` (str): The JSONPath to the field within each item in the list. This is the field that will be used to track the incremental loading. In the example above, it's `"created_at"`. - `initial_value` (str): The initial value for the cursor. This is the value that will initialize the state of incremental loading. - `end_value` (str): The end value for the cursor to stop the incremental loading. This is optional and can be omitted if you only need to track the start condition. If you set this field, `initial_value` needs to be set as well. @@ -920,7 +919,7 @@ If you encounter issues with incremental loading, see the [troubleshooting secti If you need to transform the values in the cursor field before passing them to the API endpoint, you can specify a callable under the key `convert`. For example, the API might return UNIX epoch timestamps but expects to be queried with an ISO 8601 date. To achieve that, we can specify a function that converts from the date format returned by the API to the date format required for API requests. -In the following examples, `1704067200` is returned from the API in the field `updated_at` but the API will be called with `?created_since=2024-01-01`. +In the following examples, `1704067200` is returned from the API in the field `updated_at`, but the API will be called with `?created_since=2024-01-01`. Incremental loading using the `params` field: ```py @@ -963,7 +962,7 @@ This also provides details on the HTTP requests. 
#### Getting validation errors -When you running the pipeline and getting a `DictValidationException`, it means that the [source configuration](#source-configuration) is incorrect. The error message provides details on the issue including the path to the field and the expected type. +When you are running the pipeline and getting a `DictValidationException`, it means that the [source configuration](#source-configuration) is incorrect. The error message provides details on the issue, including the path to the field and the expected type. For example, if you have a source configuration like this: @@ -1015,7 +1014,7 @@ If incorrect data is received from an endpoint, check the `data_selector` field #### Getting insufficient data or incorrect pagination -Check the `paginator` field in the configuration. When not explicitly specified, the source tries to auto-detect the pagination method. If auto-detection fails, or the system is unsure, a warning is logged. For production environments, we recommend to specify an explicit paginator in the configuration. See the [pagination](#pagination) section for more details. Some APIs may have non-standard pagination methods, and you may need to implement a [custom paginator](../../../general-usage/http/rest-client.md#implementing-a-custom-paginator). +Check the `paginator` field in the configuration. When not explicitly specified, the source tries to auto-detect the pagination method. If auto-detection fails, or the system is unsure, a warning is logged. For production environments, we recommend specifying an explicit paginator in the configuration. See the [pagination](#pagination) section for more details. Some APIs may have non-standard pagination methods, and you may need to implement a [custom paginator](../../../general-usage/http/rest-client.md#implementing-a-custom-paginator). #### Incremental loading not working @@ -1023,11 +1022,11 @@ See the [troubleshooting guide](../../../general-usage/incremental-loading.md#tr #### Getting HTTP 404 errors -Some API may return 404 errors for resources that do not exist or have no data. Manage these responses by configuring the `ignore` action in [response actions](./advanced#response-actions). +Some APIs may return 404 errors for resources that do not exist or have no data. Manage these responses by configuring the `ignore` action in [response actions](./advanced#response-actions). ### Authentication issues -If experiencing 401 (Unauthorized) errors, this could indicate: +If you are experiencing 401 (Unauthorized) errors, this could indicate: - Incorrect authorization credentials. Verify credentials in the `secrets.toml`. Refer to [Secret and configs](../../../general-usage/credentials/setup#understanding-the-exceptions) for more information. - An incorrect authentication type. Consult the API documentation for the proper method. See the [authentication](#authentication) section for details. For some APIs, a [custom authentication method](../../../general-usage/http/rest-client.md#custom-authentication) may be required. @@ -1037,3 +1036,4 @@ If experiencing 401 (Unauthorized) errors, this could indicate: The `rest_api` source uses the [RESTClient](../../../general-usage/http/rest-client.md) class for HTTP requests. Refer to the RESTClient [troubleshooting guide](../../../general-usage/http/rest-client.md#troubleshooting) for debugging tips. For further assistance, join our [Slack community](https://dlthub.com/community). We're here to help! 
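Tying the pagination advice above to the configuration format, a minimal sketch of pinning the paginator explicitly (the base URL and the JSONPath are illustrative) might look like this:

```py
config = {
    "client": {
        "base_url": "https://api.example.com",
        # Explicit paginator instead of auto-detection; parameters follow the paginator table above
        "paginator": {
            "type": "page_number",
            "base_page": 1,
            "total_path": "info.pages",
        },
    },
    "resources": ["posts"],
}
```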
+ diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/index.md index dd9a77e297..f92d38f87e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/index.md @@ -11,8 +11,9 @@ You can use the REST API source to extract data from any REST API. Using a [decl * how to handle [pagination](./basic.md#pagination), * [authentication](./basic.md#authentication). -dlt will take care of the rest: unnesting the data, inferring the schema etc, and writing to the destination. +dlt will take care of the rest: unnesting the data, inferring the schema, etc., and writing to the destination. import DocCardList from '@theme/DocCardList'; - \ No newline at end of file + + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index 708b195456..74012b4311 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -6,30 +6,28 @@ keywords: [sql connector, sql database pipeline, sql database] import Header from '../_source-info-header.md'; -# Advanced Usage +# Advanced usage
-## Incremental Loading +## Incremental loading Efficient data management often requires loading only new or updated data from your SQL databases, rather than reprocessing the entire dataset. This is where incremental loading comes into play. Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. - #### How to configure -1. **Choose a Cursor Column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs. -1. **Set an Initial Value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data. +1. **Choose a cursor column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs. +1. **Set an initial value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data. 1. **Deduplication**: When using incremental loading, the system automatically handles the deduplication of rows based on the primary key (if available) or row hash for tables without a primary key. -1. **Set end_value for backfill**: Set `end_value` if you want to backfill data from -certain range. -1. **Order returned rows**. Set `row_order` to `asc` or `desc` to order returned rows. +1. **Set end_value for backfill**: Set `end_value` if you want to backfill data from a certain range. +1. **Order returned rows**: Set `row_order` to `asc` or `desc` to order returned rows. #### Examples 1. **Incremental loading with the resource `sql_table`**. - Consider a table "family" with a timestamp column `last_modified` that indicates when a row was last modified. To ensure that only rows modified after midnight (00:00:00) on January 1, 2024, are loaded, you would set `last_modified` timestamp as the cursor as follows: + Consider a table "family" with a timestamp column `last_modified` that indicates when a row was last modified. To ensure that only rows modified after midnight (00:00:00) on January 1, 2024, are loaded, you would set the `last_modified` timestamp as the cursor as follows: ```py import dlt @@ -62,10 +60,10 @@ certain range. from dlt.sources.sql_database import sql_database source = sql_database().with_resources("family") - #using the "last_modified" field as an incremental field using initial value of midnight January 1, 2024 + # Using the "last_modified" field as an incremental field using initial value of midnight January 1, 2024 source.family.apply_hints(incremental=dlt.sources.incremental("updated", initial_value=pendulum.DateTime(2022, 1, 1, 0, 0, 0))) - #running the pipeline + # Running the pipeline pipeline = dlt.pipeline(destination="duckdb") info = pipeline.run(source, write_disposition="merge") print(info) @@ -87,31 +85,31 @@ table = sql_table().parallelize() ``` ## Column reflection -Column reflection is the automatic detection and retrieval of column metadata like column names, constraints, data types etc. Columns and their data types are reflected with SQLAlchemy. The SQL types are then mapped to `dlt` types. 
+Column reflection is the automatic detection and retrieval of column metadata like column names, constraints, data types, etc. Columns and their data types are reflected with SQLAlchemy. The SQL types are then mapped to `dlt` types. Depending on the selected backend, some of the types might require additional processing. The `reflection_level` argument controls how much information is reflected: - `reflection_level = "minimal"`: Only column names and nullability are detected. Data types are inferred from the data. -- `reflection_level = "full"`: Column names, nullability, and data types are detected. For decimal types we always add precision and scale. **This is the default.** +- `reflection_level = "full"`: Column names, nullability, and data types are detected. For decimal types, we always add precision and scale. **This is the default.** - `reflection_level = "full_with_precision"`: Column names, nullability, data types, and precision/scale are detected, also for types like text and binary. Integer sizes are set to bigint and to int for all other types. -If the SQL type is unknown or not supported by `dlt`, then, in the pyarrow backend, the column will be skipped, whereas in the other backends the type will be inferred directly from the data irrespective of the `reflection_level` specified. In the latter case, this often means that some types are coerced to strings and `dataclass` based values from sqlalchemy are inferred as `json` (JSON in most destinations). +If the SQL type is unknown or not supported by `dlt`, then, in the pyarrow backend, the column will be skipped, whereas in the other backends the type will be inferred directly from the data irrespective of the `reflection_level` specified. In the latter case, this often means that some types are coerced to strings and `dataclass` based values from sqlalchemy are inferred as `json` (JSON in most destinations). :::tip -If you use reflection level **full** / **full_with_precision** you may encounter a situation where the data returned by sqlalchemy or pyarrow backend does not match the reflected data types. Most common symptoms are: -1. The destination complains that it cannot cast one type to another for a certain column. For example `connector-x` returns TIME in nanoseconds +If you use reflection level **full** / **full_with_precision**, you may encounter a situation where the data returned by sqlalchemy or pyarrow backend does not match the reflected data types. The most common symptoms are: +1. The destination complains that it cannot cast one type to another for a certain column. For example, `connector-x` returns TIME in nanoseconds and BigQuery sees it as bigint and fails to load. -2. You get `SchemaCorruptedException` or other coercion error during the `normalize` step. -In that case you may try **minimal** reflection level where all data types are inferred from the returned data. From our experience this prevents +2. You get `SchemaCorruptedException` or another coercion error during the `normalize` step. +In that case, you may try **minimal** reflection level where all data types are inferred from the returned data. From our experience, this prevents most of the coercion problems. ::: -You can also override the sql type by passing a `type_adapter_callback` function. This function takes a `SQLAlchemy` data type as input and returns a new type (or `None` to force the column to be inferred from the data) as output. +You can also override the SQL type by passing a `type_adapter_callback` function. 
This function takes a `SQLAlchemy` data type as input and returns a new type (or `None` to force the column to be inferred from the data) as output. This is useful, for example, when: -- You're loading a data type which is not supported by the destination (e.g. you need JSON type columns to be coerced to string) -- You're using a sqlalchemy dialect which uses custom types that don't inherit from standard sqlalchemy types. -- For certain types you prefer `dlt` to infer data type from the data and you return `None` +- You're loading a data type that is not supported by the destination (e.g., you need JSON type columns to be coerced to string). +- You're using a sqlalchemy dialect that uses custom types that don't inherit from standard sqlalchemy types. +- For certain types, you prefer `dlt` to infer the data type from the data and you return `None`. In the following example, when loading timestamps from Snowflake, you ensure that they get translated into standard sqlalchemy `timestamp` columns in the resultant schema: @@ -136,10 +134,11 @@ source = sql_database( dlt.pipeline("demo").run(source) ``` -## Configuring with toml/environment variables +## Configuring with TOML/environment variables + You can set most of the arguments of `sql_database()` and `sql_table()` directly in the `.toml` files and/or as environment variables. `dlt` automatically injects these values into the pipeline script. -This is particularly useful with `sql_table()` because you can maintain a separate configuration for each table (below we show **secrets.toml** and **config.toml**, you are free to combine them into one): +This is particularly useful with `sql_table()` because you can maintain a separate configuration for each table (below we show **secrets.toml** and **config.toml**; you are free to combine them into one): The examples below show how you can set arguments in any of the `.toml` files (`secrets.toml` or `config.toml`): 1. Specifying connection string: @@ -147,7 +146,7 @@ The examples below show how you can set arguments in any of the `.toml` files (` [sources.sql_database] credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server" ``` -2. Setting parameters like backend, chunk_size, and incremental column for the table `chat_message`: +2. Setting parameters like backend, `chunk_size`, and incremental column for the table `chat_message`: ```toml [sources.sql_database.chat_message] backend="pandas" @@ -156,7 +155,7 @@ The examples below show how you can set arguments in any of the `.toml` files (` [sources.sql_database.chat_message.incremental] cursor_path="updated_at" ``` - This is especially useful with `sql_table()` in a situation where you may want to run this resource for multiple tables. Setting parameters like this would then give you a clean way of maintaing separate configurations for each table. + This is especially useful with `sql_table()` in a situation where you may want to run this resource for multiple tables. Setting parameters like this would then give you a clean way of maintaining separate configurations for each table. 3. Handling separate configurations for database and individual tables When using the `sql_database()` source, you can separately configure the parameters for the database and for the individual tables. 
@@ -171,13 +170,13 @@ The examples below show how you can set arguments in any of the `.toml` files (` cursor_path="updated_at" ``` - The resulting source created below will extract data using **pandas** backend with **chunk_size** 1000. The table **chat_message** will load data incrementally using **updated_at** column. All the other tables will not use incremental loading, and will instead load the full data. + The resulting source created below will extract data using the **pandas** backend with **chunk_size** 1000. The table **chat_message** will load data incrementally using the **updated_at** column. All the other tables will not use incremental loading and will instead load the full data. ```py database = sql_database() ``` -You'll be able to configure all the arguments this way (except adapter callback function). [Standard dlt rules apply]((/general-usage/credentials/setup). +You'll be able to configure all the arguments this way (except the adapter callback function). [Standard dlt rules apply](../../../general-usage/credentials/setup). It is also possible to set these arguments as environment variables [using the proper naming convention](../../../general-usage/credentials/setup#naming-convention): ```sh @@ -186,3 +185,4 @@ SOURCES__SQL_DATABASE__BACKEND=pandas SOURCES__SQL_DATABASE__CHUNK_SIZE=1000 SOURCES__SQL_DATABASE__CHAT_MESSAGE__INCREMENTAL__CURSOR_PATH=updated_at ``` + diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md index 6de2a02b31..4236d656eb 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/configuration.md @@ -10,11 +10,11 @@ import Header from '../_source-info-header.md';
-## Configuring the SQL Database source +## Configuring the SQL database source -`dlt` sources are python scripts made up of source and resource functions that can be easily customized. The SQL Database verified source has the following built-in source and resource: -1. `sql_database`: a `dlt` source which can be used to load multiple tables and views from a SQL database -2. `sql_table`: a `dlt` resource that loads a single table from the SQL database +`dlt` sources are Python scripts made up of source and resource functions that can be easily customized. The SQL Database verified source has the following built-in source and resource: +1. `sql_database`: a `dlt` source that can be used to load multiple tables and views from a SQL database. +2. `sql_table`: a `dlt` resource that loads a single table from the SQL database. Read more about sources and resources here: [General usage: source](../../../general-usage/source.md) and [General usage: resource](../../../general-usage/resource.md). @@ -106,13 +106,13 @@ We intend our sources to be fully hackable. Feel free to change the source code ### Connection string format `sql_database` uses SQLAlchemy to create database connections and reflect table schemas. You can pass credentials using -[database urls](https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls), which has the general format: +[database URLs](https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls), which have the general format: ```py "dialect+database_type://username:password@server:port/database_name" ``` -For example, to connect to a MySQL database using the `pymysql` dialect you can use the following connection string: +For example, to connect to a MySQL database using the `pymysql` dialect, you can use the following connection string: ```py "mysql+pymysql://rfamro:PWD@mysql-rfam-public.ebi.ac.uk:4497/Rfam" ``` @@ -123,17 +123,16 @@ Database-specific drivers can be passed into the connection string using query p "mssql+pyodbc://username:password@server/database?driver=ODBC+Driver+17+for+SQL+Server" ``` - ### Passing connection credentials to the `dlt` pipeline There are several options for adding your connection credentials into your `dlt` pipeline: -#### 1. Setting them in `secrets.toml` or as environment variables (Recommended) - -You can set up credentials using [any method](../../../general-usage/credentials/setup#available-config-providers) supported by `dlt`. We recommend using `.dlt/secrets.toml` or the environment variables. See Step 2 of the [setup](./setup) for how to set credentials inside `secrets.toml`. For more information on passing credentials read [here](../../../general-usage/credentials/setup). +#### 1. Setting them in `secrets.toml` or as environment variables (recommended) +You can set up credentials using [any method](../../../general-usage/credentials/setup#available-config-providers) supported by `dlt`. We recommend using `.dlt/secrets.toml` or the environment variables. See Step 2 of the [setup](./setup) for how to set credentials inside `secrets.toml`. For more information on passing credentials, read [here](../../../general-usage/credentials/setup). #### 2. Passing them directly in the script + It is also possible to explicitly pass credentials inside the source. 
Example:
```py
@@ -152,8 +151,11 @@ It is recommended to configure credentials in `.dlt/secrets.toml` and to not inc
:::

### Other connection options
+
#### Using SqlAlchemy Engine as credentials
+
You are able to pass an instance of SqlAlchemy Engine instead of credentials:
+
```py
from dlt.sources.sql_database import sql_table
from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam")
table = sql_table(engine, table="chat_message", schema="data")
```
-This engine is used by `dlt` to open database connections and can work across multiple threads so is compatible with `parallelize` setting of dlt sources and resources.
+This engine is used by `dlt` to open database connections and can work across multiple threads, so it is compatible with the `parallelize` setting of dlt sources and resources.

## Configuring the backend

-Table backends convert streams of rows from database tables into batches in various formats. The default backend `SQLAlchemy` follows standard `dlt` behavior of
-extracting and normalizing Python dictionaries. We recommend this for smaller tables, initial development work, and when minimal dependencies or a pure Python environment is required. This backend is also the slowest. Other backends make use of the structured data format of the tables and provide significant improvement in speeds. For example, the `PyArrow` backend converts rows into `Arrow` tables, which results in
-good performance and preserves exact data types. We recommend using this backend for larger tables.
+Table backends convert streams of rows from database tables into batches in various formats. The default backend, `SQLAlchemy`, follows standard `dlt` behavior of extracting and normalizing Python dictionaries. We recommend this for smaller tables, initial development work, and when minimal dependencies or a pure Python environment is required. This backend is also the slowest. Other backends make use of the structured data format of the tables and provide significant improvement in speeds. For example, the `PyArrow` backend converts rows into `Arrow` tables, which results in good performance and preserves exact data types. We recommend using this backend for larger tables.

### SQLAlchemy

-The `SQLAlchemy` backend (the default) yields table data as a list of Python dictionaries. This data goes through the regular extract
-and normalize steps and does not require additional dependencies to be installed. It is the most robust (works with any destination, correctly represents data types) but also the slowest. You can set `reflection_level="full_with_precision"` to pass exact data types to `dlt` schema.
+The `SQLAlchemy` backend (the default) yields table data as a list of Python dictionaries. This data goes through the regular extract and normalize steps and does not require additional dependencies to be installed. It is the most robust (works with any destination, correctly represents data types) but also the slowest. You can set `reflection_level="full_with_precision"` to pass exact data types to the `dlt` schema.

### PyArrow

-The `PyArrow` backend yields data as `Arrow` tables. It uses `SQLAlchemy` to read rows in batches but then immediately converts them into `ndarray`, transposes it, and sets it as columns in an `Arrow` table. This backend always fully
-reflects the database table and preserves original types (i.e.
**decimal** / **numeric** data will be extracted without loss of precision). If the destination loads parquet files, this backend will skip `dlt` normalizer and you can gain two orders of magnitude (20x - 30x) speed increase. +The `PyArrow` backend yields data as `Arrow` tables. It uses `SQLAlchemy` to read rows in batches but then immediately converts them into `ndarray`, transposes it, and sets it as columns in an `Arrow` table. This backend always fully reflects the database table and preserves original types (i.e., **decimal** / **numeric** data will be extracted without loss of precision). If the destination loads parquet files, this backend will skip the `dlt` normalizer, and you can gain two orders of magnitude (20x - 30x) speed increase. Note that if `pandas` is installed, we'll use it to convert `SQLAlchemy` tuples into `ndarray` as it seems to be 20-30% faster than using `numpy` directly. @@ -207,21 +205,20 @@ info = pipeline.run(sql_alchemy_source) print(info) ``` -### pandas +### Pandas The `pandas` backend yields data as DataFrames using the `pandas.io.sql` module. `dlt` uses `PyArrow` dtypes by default as they generate more stable typing. With the default settings, several data types will be coerced to dtypes in the yielded data frame: -* **decimal** is mapped to double so it is possible to lose precision +* **decimal** is mapped to double, so it is possible to lose precision * **date** and **time** are mapped to strings * all types are nullable :::note -`dlt` will still use the data types reflected from the source database when creating destination tables. How the type differences resulting from the `pandas` backend are reconciled / parsed is up to the destination. Most of the destinations will be able to parse date/time strings and convert doubles into decimals (Please note that you'll still lose precision on decimals with default settings.). **However we strongly suggest -not to use the** `pandas` **backend if your source tables contain date, time, or decimal columns** +`dlt` will still use the data types reflected from the source database when creating destination tables. How the type differences resulting from the `pandas` backend are reconciled/parsed is up to the destination. Most of the destinations will be able to parse date/time strings and convert doubles into decimals (Please note that you'll still lose precision on decimals with default settings.). **However, we strongly suggest not to use the** `pandas` **backend if your source tables contain date, time, or decimal columns.** ::: -Internally dlt uses `pandas.io.sql._wrap_result` to generate `pandas` frames. To adjust [pandas-specific settings,](https://pandas.pydata.org/docs/reference/api/pandas.read_sql_table.html) pass it in the `backend_kwargs` parameter. For example, below we set `coerce_float` to `False`: +Internally, `dlt` uses `pandas.io.sql._wrap_result` to generate `pandas` frames. To adjust [pandas-specific settings,](https://pandas.pydata.org/docs/reference/api/pandas.read_sql_table.html) pass it in the `backend_kwargs` parameter. For example, below we set `coerce_float` to `False`: ```py import dlt @@ -252,22 +249,22 @@ print(info) ``` ### ConnectorX -The [`ConnectorX`](https://sfu-db.github.io/connector-x/intro.html) backend completely skips `SQLALchemy` when reading table rows, in favor of doing that in rust. This is claimed to be significantly faster than any other method (validated only on postgres). 
With the default settings it will emit `PyArrow` tables, but you can configure this by specifying the `return_type` in `backend_kwargs`. (See the [`ConnectorX` docs](https://sfu-db.github.io/connector-x/api.html) for a full list of configurable parameters.) +The [`ConnectorX`](https://sfu-db.github.io/connector-x/intro.html) backend completely skips `SQLALchemy` when reading table rows, in favor of doing that in Rust. This is claimed to be significantly faster than any other method (validated only on PostgreSQL). With the default settings, it will emit `PyArrow` tables, but you can configure this by specifying the `return_type` in `backend_kwargs`. (See the [`ConnectorX` docs](https://sfu-db.github.io/connector-x/api.html) for a full list of configurable parameters.) There are certain limitations when using this backend: -* it will ignore `chunk_size`. `ConnectorX` cannot yield data in batches. -* in many cases it requires a connection string that differs from the `SQLAlchemy` connection string. Use the `conn` argument in `backend_kwargs` to set this. -* it will convert **decimals** to **doubles**, so you will lose precision. -* nullability of the columns is ignored (always true) -* it uses different mappings for each data type. (Check [here](https://sfu-db.github.io/connector-x/databases.html) for more details.) -* JSON fields (at least those coming from postgres) are double wrapped in strings. To unwrap this, you can pass the in-built transformation function `unwrap_json_connector_x` (for example, with `add_map`): +* It will ignore `chunk_size`. `ConnectorX` cannot yield data in batches. +* In many cases, it requires a connection string that differs from the `SQLAlchemy` connection string. Use the `conn` argument in `backend_kwargs` to set this. +* It will convert **decimals** to **doubles**, so you will lose precision. +* Nullability of the columns is ignored (always true). +* It uses different mappings for each data type. (Check [here](https://sfu-db.github.io/connector-x/databases.html) for more details.) +* JSON fields (at least those coming from PostgreSQL) are double-wrapped in strings. To unwrap this, you can pass the in-built transformation function `unwrap_json_connector_x` (for example, with `add_map`): ```py from dlt.sources.sql_database.helpers import unwrap_json_connector_x ``` :::note -`dlt` will still use the data types refected from the source database when creating destination tables. It is up to the destination to reconcile / parse type differences. Please note that you'll still lose precision on decimals with default settings. +`dlt` will still use the data types reflected from the source database when creating destination tables. It is up to the destination to reconcile/parse type differences. Please note that you'll still lose precision on decimals with default settings. ::: ```py @@ -286,7 +283,7 @@ unsw_table = sql_table( backend="connectorx", # keep source data types reflection_level="full_with_precision", - # just to demonstrate how to setup a separate connection string for connectorx + # just to demonstrate how to set up a separate connection string for connectorx backend_kwargs={"conn": "postgresql://loader:loader@localhost:5432/dlt_data"} ) @@ -305,4 +302,5 @@ info = pipeline.run( ) print(info) ``` -With the dataset above and a local postgres instance, the `ConnectorX` backend is 2x faster than the `PyArrow` backend. +With the dataset above and a local PostgreSQL instance, the `ConnectorX` backend is 2x faster than the `PyArrow` backend. 
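As a sketch of the `unwrap_json_connector_x` helper mentioned above: assuming the helper takes the name of the JSON column and returns an item transform (the column name `properties` is hypothetical), it could be attached with `add_map` like this:

```py
from dlt.sources.sql_database import sql_table
from dlt.sources.sql_database.helpers import unwrap_json_connector_x

# Hypothetical table with a JSON column called "properties";
# credentials are read from secrets.toml as usual
chat_message = sql_table(
    table="chat_message",
    backend="connectorx",
).add_map(unwrap_json_connector_x("properties"))
```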
+ diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md index d9aae62f94..65c937ef77 100644 --- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md +++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md @@ -31,7 +31,7 @@ pipeline and hide many intricacies of correctly setting up the connection to you ### Querying the data using the `dlt` SQL client Execute any SQL query and get results following the Python -[dbapi](https://peps.python.org/pep-0249/) spec. Below we fetch data from the customers table: +[dbapi](https://peps.python.org/pep-0249/) spec. Below, we fetch data from the customers table: ```py pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") @@ -40,17 +40,17 @@ with pipeline.sql_client() as client: "SELECT id, name, email FROM customers WHERE id = %s", 10 ) as cursor: - # get all data from the cursor as list of rows + # get all data from the cursor as a list of rows print(cursor.fetchall()) ``` -In the above, we used `dbapi` parameters placeholders and fetched the data using `fetchall` method +In the above, we used `dbapi` parameter placeholders and fetched the data using the `fetchall` method that reads all the rows from the cursor. ### Querying data into a data frame -You can fetch results of any SQL query as a data frame. If the destination is supporting that -natively (i.e. BigQuery and DuckDB), `dlt` uses the native method. Thanks to that, reading data +You can fetch the results of any SQL query as a data frame. If the destination supports that +natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to that, reading data frames may be really fast! The example below reads GitHub reactions data from the `issues` table and counts reaction types. @@ -65,18 +65,18 @@ with pipeline.sql_client() as client: with client.execute_query( 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' ) as table: - # calling `df` on a cursor, returns the data as a DataFrame + # calling `df` on a cursor returns the data as a DataFrame reactions = table.df() counts = reactions.sum(0).sort_values(0, ascending=False) ``` -The `df` method above returns all the data in the cursor as data frame. You can also fetch data in -chunks by passing `chunk_size` argument to the `df` method. +The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in +chunks by passing the `chunk_size` argument to the `df` method. ### Access destination native connection The native connection to your destination like BigQuery `Client` or DuckDB `DuckDBPyConnection` is -available in case you want to do anything special. Below we take the native connection to `duckdb` +available in case you want to do anything special. Below, we take the native connection to `duckdb` to get `DuckDBPyRelation` from a query: ```py @@ -90,7 +90,7 @@ with pipeline.sql_client() as client: rel.limit(3).show() ``` -## Data Quality Dashboards +## Data quality dashboards After deploying a `dlt` pipeline, you might ask yourself: How can we know if the data is and remains high quality? @@ -108,38 +108,21 @@ any gaps or loading issues. ### Data usage as monitoring -Setting up monitoring is a good idea. However, in practice, often by the time you notice something -is wrong through reviewing charts, someone in the business has likely already noticed something is -wrong. 
That is, if there is usage of the data, then that usage will act as sort of monitoring. +Setting up monitoring is a good idea. However, in practice, often by the time you notice something is wrong through reviewing charts, someone in the business has likely already noticed something is wrong. That is, if there is usage of the data, then that usage will act as a sort of monitoring. -### Plotting main metrics on the line charts +### Plotting main metrics on line charts -In cases where data is not being used that much (e.g. only one marketing analyst is using some data -alone), then it is a good idea to have them plot their main metrics on "last 7 days" line charts, so -it's visible to them that something may be off when they check their metrics. +In cases where data is not being used much (e.g., only one marketing analyst is using some data alone), it is a good idea to have them plot their main metrics on "last 7 days" line charts, so it's visible to them that something may be off when they check their metrics. -It's important to think about granularity here. A daily line chart, for example, would not catch -hourly issues well. Typically, you will want to match the granularity of the time dimension -(day/hour/etc.) of the line chart with the things that could go wrong, either in the loading process -or in the tracked process. +It's important to think about granularity here. A daily line chart, for example, would not catch hourly issues well. Typically, you will want to match the granularity of the time dimension (day/hour/etc.) of the line chart with the things that could go wrong, either in the loading process or in the tracked process. -If a dashboard is the main product of an analyst, they will generally watch it closely. Therefore, -it's probably not necessary for a data engineer to include monitoring in their daily activities in -these situations. +If a dashboard is the main product of an analyst, they will generally watch it closely. Therefore, it's probably not necessary for a data engineer to include monitoring in their daily activities in these situations. ## Tools to create dashboards -[Metabase](https://www.metabase.com/), [Looker Studio](https://lookerstudio.google.com/u/0/), and -[Streamlit](https://streamlit.io/) are some common tools that you might use to set up dashboards to -explore data. It's worth noting that while many tools are suitable for exploration, different tools -enable your organization to achieve different things. Some organizations use multiple tools for -different scopes: - -- Tools like [Metabase](https://www.metabase.com/) are intended for data democratization, where the - business user can change the dimension or granularity to answer follow-up questions. -- Tools like [Looker Studio](https://lookerstudio.google.com/u/0/) and - [Tableau](https://www.tableau.com/) are intended for minimal interaction curated dashboards that - business users can filter and read as-is with limited training. -- Tools like [Streamlit](https://streamlit.io/) enable powerful customizations and the building of - complex apps by Python-first developers, but they generally do not support self-service out of the - box. +[Metabase](https://www.metabase.com/), [Looker Studio](https://lookerstudio.google.com/u/0/), and [Streamlit](https://streamlit.io/) are some common tools that you might use to set up dashboards to explore data. It's worth noting that while many tools are suitable for exploration, different tools enable your organization to achieve different things.
Some organizations use multiple tools for different scopes: + +- Tools like [Metabase](https://www.metabase.com/) are intended for data democratization, where the business user can change the dimension or granularity to answer follow-up questions. +- Tools like [Looker Studio](https://lookerstudio.google.com/u/0/) and [Tableau](https://www.tableau.com/) are intended for minimal-interaction curated dashboards that business users can filter and read as-is with limited training. +- Tools like [Streamlit](https://streamlit.io/) enable powerful customizations and the building of complex apps by Python-first developers, but they generally do not support self-service out of the box. + diff --git a/docs/website/docs/examples/index.md b/docs/website/docs/examples/index.md index 5be3fd1632..b0b16e274d 100644 --- a/docs/website/docs/examples/index.md +++ b/docs/website/docs/examples/index.md @@ -1,14 +1,15 @@ --- title: Code Examples -description: A list of comprehensive code examples that teach you how to solve real world problem. +description: A list of comprehensive code examples that teach you how to solve real-world problems. keywords: ['examples'] --- import DocCardList from '@theme/DocCardList'; -A list of comprehensive code examples that teach you how to solve a real world problem. +A list of comprehensive code examples that teach you how to solve real-world problems. :::info If you want to share your example, follow this [contributing](https://github.com/dlt-hub/dlt/tree/devel/docs/examples/CONTRIBUTING.md) tutorial. ::: - \ No newline at end of file + + diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md index 14fadba74d..e29b43bcba 100644 --- a/docs/website/docs/reference/command-line-interface.md +++ b/docs/website/docs/reference/command-line-interface.md @@ -9,37 +9,37 @@ keywords: [command line interface, cli, dlt init] ```sh dlt init ``` -This command creates new dlt pipeline script that loads data from `source` to `destination` to it. When you run the command: -1. It creates basic project structure if the current folder is empty. Adds `.dlt/config.toml` and `.dlt/secrets.toml` and `.gitignore` files. -2. It checks if `source` argument is matching one of our [verified sources](../dlt-ecosystem/verified-sources/) and if it is so, [it adds it to the project](../walkthroughs/add-a-verified-source.md). -3. If the `source` is unknown it will use a [generic template](https://github.com/dlt-hub/python-dlt-init-template) to [get you started](../walkthroughs/create-a-pipeline.md). +This command creates a new dlt pipeline script that loads data from `source` to `destination`. When you run the command: +1. It creates a basic project structure if the current folder is empty, adding `.dlt/config.toml`, `.dlt/secrets.toml`, and `.gitignore` files. +2. It checks if the `source` argument matches one of our [verified sources](../dlt-ecosystem/verified-sources/) and, if so, [adds it to the project](../walkthroughs/add-a-verified-source.md). +3. If the `source` is unknown, it will use a [generic template](https://github.com/dlt-hub/python-dlt-init-template) to [get you started](../walkthroughs/create-a-pipeline.md). 4. It will rewrite the pipeline scripts to use your `destination`. 5. It will create sample config and credentials in `secrets.toml` and `config.toml` for the specified source and destination. -6. It will create `requirements.txt` with dependencies required by source and destination.
If one exists, it will print instructions what to add to it. +6. It will create `requirements.txt` with dependencies required by the source and destination. If one exists, it will print instructions on what to add to it. -This command can be used several times in the same folders to add more sources, destinations and pipelines. It will also update the verified source code to the newest -version if run again with existing `source` name. You are warned if files will be overwritten or if `dlt` version needs upgrade to run particular pipeline. +This command can be used several times in the same folder to add more sources, destinations, and pipelines. It will also update the verified source code to the newest +version if run again with an existing `source` name. You are warned if files will be overwritten or if the `dlt` version needs an upgrade to run a particular pipeline. -### Specify your own "verified sources" repository. -You can use `--location ` option to specify your own repository with sources. Typically you would [fork ours](https://github.com/dlt-hub/verified-sources) and start customizing and adding sources ie. to use them for your team or organization. You can also specify a branch with `--branch ` ie. to test a version being developed. +### Specify your own "verified sources" repository +You can use the `--location ` option to specify your own repository with sources. Typically, you would [fork ours](https://github.com/dlt-hub/verified-sources) and start customizing and adding sources, e.g., to use them for your team or organization. You can also specify a branch with `--branch `, e.g., to test a version being developed. ### List all sources ```sh dlt init --list-sources ``` -Shows all available verified sources and their short descriptions. For each source, checks if your local `dlt` version requires update +Shows all available verified sources and their short descriptions. For each source, it checks if your local `dlt` version requires an update and prints the relevant warning. ## `dlt deploy` -This command prepares your pipeline for deployment and gives you step by step instruction how to accomplish it. To enabled this functionality please first execute +This command prepares your pipeline for deployment and gives you step-by-step instructions on how to accomplish it. To enable this functionality, please first execute ```sh pip install "dlt[cli]" ``` -that will add additional packages to current environment. +that will add additional packages to the current environment. > 💡 We ask you to install those dependencies separately to keep our core library small and make it work everywhere. -### github-action +### `github-action` ```sh dlt deploy