From bf92adaaa56e99d4a9a591789bf3763d1f321545 Mon Sep 17 00:00:00 2001 From: mucio Date: Thu, 23 May 2024 10:27:18 +0200 Subject: [PATCH 01/15] Added values to the data pattern of the rest_api helper (#1399) --- dlt/sources/helpers/rest_client/detector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlt/sources/helpers/rest_client/detector.py b/dlt/sources/helpers/rest_client/detector.py index 857f6bbb4e..d004ca173c 100644 --- a/dlt/sources/helpers/rest_client/detector.py +++ b/dlt/sources/helpers/rest_client/detector.py @@ -25,6 +25,7 @@ "payload", "content", "objects", + "values", ] ) From bab9e90306446e6ae4238b8753bab59b19c87955 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Thu, 23 May 2024 14:47:08 +0530 Subject: [PATCH 02/15] Added info about how to reorder the columns to adjust a schema (#1364) * Added info about how to reorder the columns * Updated rest_api.md with configuration examples * Update docs/website/docs/walkthroughs/adjust-a-schema.md * Updated ../website/docs/dlt-ecosystem/verified-sources/rest_api.md * fix naming convention for bigquery custom destination --------- Co-authored-by: Anton Burnashev Co-authored-by: AstrakhantsevaAA --- .../custom_destination_bigquery.py | 20 +++++++------- .../docs/walkthroughs/adjust-a-schema.md | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index ea60b9b00d..e890469263 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -5,13 +5,13 @@ keywords: [destination, credentials, example, bigquery, custom destination] --- -In this example, you'll find a Python script that demonstrates how to load to bigquey with the custom destination. +In this example, you'll find a Python script that demonstrates how to load to BigQuery with the custom destination. We'll learn how to: -- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials) -- use the [custom destination](../dlt-ecosystem/destinations/destination.md) -- Use pyarrow tables to create complex column types on bigquery -- Use bigquery `autodetect=True` for schema inference from parquet files +- Use [built-in credentials.](../general-usage/credentials/config_specs#gcp-credentials) +- Use the [custom destination.](../dlt-ecosystem/destinations/destination.md) +- Use pyarrow tables to create complex column types on BigQuery. +- Use BigQuery `autodetect=True` for schema inference from parquet files. 
""" @@ -38,7 +38,7 @@ def resource(url: str): # load pyarrow table with pandas table = pa.Table.from_pandas(pd.read_csv(url)) - # we add a list type column to demontrate bigquery lists + # we add a list type column to demonstrate bigquery lists table = table.append_column( "tags", pa.array( @@ -57,12 +57,12 @@ def resource(url: str): yield table -# dlt biquery custom destination +# dlt bigquery custom destination # we can use the dlt provided credentials class # to retrieve the gcp credentials from the secrets -@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0) +@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case") def bigquery_insert( - items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value + items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value ) -> None: client = bigquery.Client( credentials.project_id, credentials.to_native_credentials(), location="US" @@ -74,7 +74,7 @@ def bigquery_insert( ) # since we have set the batch_size to 0, we get a filepath and can load the file directly with open(items, "rb") as f: - load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config) + load_job = client.load_table_from_file(f, table, job_config=job_config) load_job.result() # Waits for the job to complete. diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md index cfe2d056b0..b0a9a9ce05 100644 --- a/docs/website/docs/walkthroughs/adjust-a-schema.md +++ b/docs/website/docs/walkthroughs/adjust-a-schema.md @@ -121,6 +121,32 @@ Do not rename the tables or columns in the yaml file. `dlt` infers those from th You can [adjust the schema](../general-usage/resource.md#adjust-schema) in Python before resource is loaded. ::: +### Reorder columns +To reorder the columns in your dataset, follow these steps: + +1. Initial Run: Execute the pipeline to obtain the import and export schemas. +1. Modify Export Schema: Adjust the column order as desired in the export schema. +1. Sync Import Schema: Ensure that these changes are mirrored in the import schema to maintain consistency. +1. Delete Dataset: Remove the existing dataset to prepare for the reload. +1. Reload Data: Reload the data. The dataset should now reflect the new column order as specified in the import YAML. + +These steps ensure that the column order in your dataset matches your specifications. + +**Another approach** to reorder columns is to use the `add_map` function. For instance, to rearrange ‘column1’, ‘column2’, and ‘column3’, you can proceed as follows: + +```py +# Define the data source and reorder columns using add_map +data_source = resource().add_map(lambda row: { + 'column3': row['column3'], + 'column1': row['column1'], + 'column2': row['column2'] +}) + +# Run the pipeline +load_info = pipeline.run(data_source) +``` + +In this example, the `add_map` function reorders columns by defining a new mapping. The lambda function specifies the desired order by rearranging the key-value pairs. When the pipeline runs, the data will load with the columns in the new order. 
### Load data as json instead of generating child table or columns from flattened dicts From e44984801a60beed73d32d9739b2cd5ae00cb403 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 23 May 2024 11:49:49 +0200 Subject: [PATCH 03/15] rest_api: add response_actions documentation (#1362) --- .../verified-sources/rest_api.md | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 0022850987..d5d29344de 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -282,7 +282,7 @@ The fields in the endpoint configuration are: - `json`: The JSON payload to be sent with the request (for POST and PUT requests). - `paginator`: Pagination configuration for the endpoint. See the [pagination](#pagination) section for more details. - `data_selector`: A JSONPath to select the data from the response. See the [data selection](#data-selection) section for more details. -- `response_actions`: A list of actions that define how to process the response data. +- `response_actions`: A list of actions that define how to process the response data. See the [response actions](#response-actions) section for more details. - `incremental`: Configuration for [incremental loading](#incremental-loading). ### Pagination @@ -586,3 +586,33 @@ See the [incremental loading](../../general-usage/incremental-loading.md#increme - `root_key` (bool): Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge. Defaults to False. - `schema_contract`: Schema contract settings that will be applied to this resource. - `spec`: A specification of configuration and secret values required by the source. + +### Response actions + +The `response_actions` field in the endpoint configuration allows you to specify how to handle specific responses from the API based on status codes or content substrings. This is useful for handling edge cases like ignoring responses on specific conditions. + +:::caution Experimental Feature +This is an experimental feature and may change in future releases. +::: + +#### Example + +```py +{ + "path": "issues", + "response_actions": [ + {"status_code": 404, "action": "ignore"}, + {"content": "Not found", "action": "ignore"}, + {"status_code": 200, "content": "some text", "action": "ignore"}, + ], +} +``` + +In this example, the source will ignore responses with a status code of 404, responses with the content "Not found", and responses with a status code of 200 _and_ content "some text". + +**Fields:** + +- `status_code` (int, optional): The HTTP status code to match. +- `content` (str, optional): A substring to search for in the response content. +- `action` (str): The action to take when the condition is met. Currently supported actions: + - `ignore`: Ignore the response. 
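To see where this fits, here is a sketch of an endpoint configuration with a response action inside a full source definition. It assumes the `rest_api_source` factory described earlier on this page; the base URL and resource name are illustrative:

```py
from rest_api import rest_api_source

source = rest_api_source({
    "client": {"base_url": "https://api.example.com/"},
    "resources": [
        {
            "name": "issues",
            "endpoint": {
                "path": "issues",
                "response_actions": [
                    # skip records deleted between listing and fetching
                    {"status_code": 404, "action": "ignore"},
                ],
            },
        },
    ],
})
```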
From 19e1462e65995291be39081dfcbc0cd0dd588b6f Mon Sep 17 00:00:00 2001 From: rudolfix Date: Thu, 23 May 2024 17:04:25 +0200 Subject: [PATCH 04/15] detects a path param in the right-most path segment (#1394) --- dlt/sources/helpers/rest_client/detector.py | 8 ++++++-- .../helpers/rest_client/test_detector.py | 17 ++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/dlt/sources/helpers/rest_client/detector.py b/dlt/sources/helpers/rest_client/detector.py index d004ca173c..19a1e83a82 100644 --- a/dlt/sources/helpers/rest_client/detector.py +++ b/dlt/sources/helpers/rest_client/detector.py @@ -1,5 +1,6 @@ import re -from typing import List, Dict, Any, Tuple, Union, Optional, Callable, Iterable +from pathlib import PurePosixPath +from typing import List, Dict, Any, Tuple, Union, Callable, Iterable from urllib.parse import urlparse from requests import Response @@ -47,7 +48,10 @@ def single_entity_path(path: str) -> bool: """Checks if path ends with path param indicating that single object is returned""" - return re.search(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}/?$", path) is not None + # get last path segment + name = PurePosixPath(path).name + # alphabet for a name taken from https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.3.md#fixed-fields-6 + return re.search(r"\{([a-zA-Z0-9\.\-_]+)\}", name) is not None def matches_any_pattern(key: str, patterns: Iterable[str]) -> bool: diff --git a/tests/sources/helpers/rest_client/test_detector.py b/tests/sources/helpers/rest_client/test_detector.py index f01f9409a1..6511b472fb 100644 --- a/tests/sources/helpers/rest_client/test_detector.py +++ b/tests/sources/helpers/rest_client/test_detector.py @@ -406,16 +406,20 @@ def test_find_paginator(test_case) -> None: [ "/users/{user_id}", "/api/v1/products/{product_id}/", - # those are not valid paths - # "/api/v1/products/{product_id}//", - # "/api/v1/products/{product_id}?param1=value1", - # "/api/v1/products/{product_id}#section", - # "/api/v1/products/{product_id}/#section", + "/api/v1/products/{product_id}//", + "/api/v1/products/{product_id}?param1=value1", + "/api/v1/products/{product_id}#section", + "/api/v1/products/{product_id}.json", + "/api/v1/products/{product_id}.json/", + "/api/v1/products/{product_id}_data", + "/api/v1/products/{product_id}_data?param=true", "/users/{user_id}/posts/{post_id}", "/users/{user_id}/posts/{post_id}/comments/{comment_id}", "{entity}", "/{entity}", "/{user_123}", + "/users/{user-id}", + "/users/{123}", ], ) def test_single_entity_path_valid(path): @@ -430,8 +434,7 @@ def test_single_entity_path_valid(path): "/users/{user_id}/details", "/", "/{}", - "/users/{123}", - "/users/{user-id}", + "/api/v1/products/{product_id}/#section", "/users/{user id}", "/users/{user_id}/{", # Invalid ending ], From d0fdfb4ca000ca57c56134bd0fb9b9f11e90b286 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 23 May 2024 18:50:26 +0200 Subject: [PATCH 05/15] Update the tutorial to use `rest_client.paginate` for pagination (#1287) --- .../docs/tutorial/grouping-resources.md | 141 ++++++------------ .../docs/tutorial/load-data-from-an-api.md | 51 ++++++- 2 files changed, 93 insertions(+), 99 deletions(-) diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index 3a05f7940c..3ba95b7971 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -14,6 +14,9 @@ This tutorial continues the [previous](load-data-from-an-api) part. 
We'll use th In the previous tutorial, we loaded issues from the GitHub API. Now we'll prepare to load comments from the API as well. Here's a sample [dlt resource](../general-usage/resource) that does that: ```py +import dlt +from dlt.sources.helpers.rest_client import paginate + @dlt.resource( table_name="comments", write_disposition="merge", @@ -22,17 +25,11 @@ In the previous tutorial, we loaded issues from the GitHub API. Now we'll prepar def get_comments( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = "https://api.github.com/repos/dlt-hub/dlt/comments?per_page=100" - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/comments", + params={"per_page": 100} + ): + yield page ``` We can load this resource separately from the issues resource, however loading both issues and comments in one go is more efficient. To do that, we'll use the `@dlt.source` decorator on a function that returns a list of resources: @@ -47,7 +44,7 @@ def github_source(): ```py import dlt -from dlt.sources.helpers import requests +from dlt.sources.helpers.rest_client import paginate @dlt.resource( table_name="issues", @@ -57,21 +54,17 @@ from dlt.sources.helpers import requests def get_issues( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = ( - "https://api.github.com/repos/dlt-hub/dlt/issues" - f"?since={updated_at.last_value}&per_page=100" - "&sort=updated&directions=desc&state=open" - ) - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/issues", + params={ + "since": updated_at.last_value, + "per_page": 100, + "sort": "updated", + "directions": "desc", + "state": "open", + } + ): + yield page @dlt.resource( @@ -82,20 +75,14 @@ def get_issues( def get_comments( updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): - url = ( - "https://api.github.com/repos/dlt-hub/dlt/comments" - "?per_page=100" - ) - - while True: - response = requests.get(url) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/comments", + params={ + "since": updated_at.last_value, + "per_page": 100, + } + ): + yield page @dlt.source @@ -124,18 +111,8 @@ from dlt.sources.helpers import requests BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" def fetch_github_data(endpoint, params={}): - """Fetch data from GitHub API based on endpoint and params.""" url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate(url, params=params) @dlt.source def github_source(): @@ -164,21 +141,16 @@ For the next step we'd want to get the [number of repository clones](https://doc Let's handle this by changing our `fetch_github_data()` first: ```py -def fetch_github_data(endpoint, params={}, 
access_token=None): - """Fetch data from GitHub API based on endpoint and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} +from dlt.sources.helpers.rest_client.auth import BearerTokenAuth +def fetch_github_data(endpoint, params={}, access_token=None): url = f"{BASE_GITHUB_URL}/{endpoint}" + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] @dlt.source def github_source(access_token): @@ -229,28 +201,7 @@ access_token = "ghp_A...3aRY" Now we can run the script and it will load the data from the `traffic/clones` endpoint: ```py -import dlt -from dlt.sources.helpers import requests - -BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" - - -def fetch_github_data(endpoint, params={}, access_token=None): - """Fetch data from GitHub API based on endpoint and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - - url = f"{BASE_GITHUB_URL}/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] - +... @dlt.source def github_source( @@ -287,19 +238,12 @@ BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" def fetch_github_data(repo_name, endpoint, params={}, access_token=None): """Fetch data from GitHub API based on repo_name, endpoint, and params.""" - headers = {"Authorization": f"Bearer {access_token}"} if access_token else {} - url = BASE_GITHUB_URL.format(repo_name=repo_name) + f"/{endpoint}" - - while True: - response = requests.get(url, params=params, headers=headers) - response.raise_for_status() - yield response.json() - - # Get next page - if "next" not in response.links: - break - url = response.links["next"]["url"] + return paginate( + url, + params=params, + auth=BearerTokenAuth(token=access_token) if access_token else None, + ) @dlt.source @@ -347,5 +291,6 @@ Interested in learning more? Here are some suggestions: - [Pass config and credentials into your sources and resources](../general-usage/credentials). - [Run in production: inspecting, tracing, retry policies and cleaning up](../running-in-production/running). - [Run resources in parallel, optimize buffers and local storage](../reference/performance.md) + - [Use REST API client helpers](../general-usage/http/rest-client.md) to simplify working with REST APIs. 3. Check out our [how-to guides](../walkthroughs) to get answers to some common questions. 4. Explore the [Examples](../examples) section to see how dlt can be used in real-world scenarios diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 31a2c1592d..ec6136b6d3 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -44,7 +44,7 @@ dlt pipeline github_issues show ## Append or replace your data -Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. 
It is very useful, for example, when you have a new folder created daily with `json` file logs, and you want to ingest them. +Try running the pipeline again with `python github_issues.py`. You will notice that the **issues** table contains two copies of the same data. This happens because the default load mode is `append`. It is very useful, for example, when you have daily data updates and you want to ingest them. To get the latest data, we'd need to run the script again. But how to do that without duplicating the data? One option is to tell `dlt` to replace the data in existing tables in the destination by using `replace` write disposition. Change the `github_issues.py` script to the following: @@ -148,6 +148,55 @@ and `updated_at.last_value` to tell GitHub to return issues updated only **after [Learn more about merge write disposition](../general-usage/incremental-loading#merge-incremental_loading). +## Using pagination helper + +In the previous examples, we used the `requests` library to make HTTP requests to the GitHub API and handled pagination manually. `dlt` has the built-in [REST client](../general-usage/http/rest-client.md) that simplifies API requests. We'll pick the `paginate()` helper from it for the next example. The `paginate` function takes a URL and optional parameters (quite similar to `requests`) and returns a generator that yields pages of data. + +Here's how the updated script looks: + +```py +import dlt +from dlt.sources.helpers.rest_client import paginate + +@dlt.resource( + table_name="issues", + write_disposition="merge", + primary_key="id", +) +def get_issues( + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") +): + for page in paginate( + "https://api.github.com/repos/dlt-hub/dlt/issues", + params={ + "since": updated_at.last_value, + "per_page": 100, + "sort": "updated", + "direction": "desc", + "state": "open", + }, + ): + yield page + +pipeline = dlt.pipeline( + pipeline_name="github_issues_merge", + destination="duckdb", + dataset_name="github_data_merge", +) +load_info = pipeline.run(get_issues) +row_counts = pipeline.last_trace.last_normalize_info + +print(row_counts) +print("------") +print(load_info) +``` + +Let's zoom in on the changes: + +1. The `while` loop that handled pagination is replaced with reading pages from the `paginate()` generator. +2. `paginate()` takes the URL of the API endpoint and optional parameters. In this case, we pass the `since` parameter to get only issues updated after the last pipeline run. +3. We're not explicitly setting up pagination, `paginate()` handles it for us. Magic! Under the hood, `paginate()` analyzes the response and detects the pagination method used by the API. Read more about pagination in the [REST client documentation](../general-usage/http/rest-client.md#paginating-api-responses). + ## Next steps Continue your journey with the [Resource Grouping and Secrets](grouping-resources) tutorial. 
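One more note on the pagination helper above: auto-detection covers most APIs, but `paginate()` also accepts an explicit paginator when you want to pin the strategy. A minimal sketch, assuming the `HeaderLinkPaginator` class from `dlt.sources.helpers.rest_client.paginators` (GitHub paginates via `Link` response headers):

```py
from dlt.sources.helpers.rest_client import paginate
from dlt.sources.helpers.rest_client.paginators import HeaderLinkPaginator

for page in paginate(
    "https://api.github.com/repos/dlt-hub/dlt/issues",
    params={"per_page": 100},
    # pin Link-header pagination instead of relying on auto-detection
    paginator=HeaderLinkPaginator(),
):
    print(len(page))
```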
From a8a6ff7288f8639bea9e92c20573126e538215e1 Mon Sep 17 00:00:00 2001 From: Harato Daisuke <129731743+Benjamin0313@users.noreply.github.com> Date: Fri, 24 May 2024 17:51:48 +0900 Subject: [PATCH 06/15] fix command to install dlt (#1404) --- docs/website/docs/dlt-ecosystem/destinations/snowflake.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index f144da02e6..deaaff3562 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -9,7 +9,7 @@ keywords: [Snowflake, destination, data warehouse] ## Install `dlt` with Snowflake **To install the `dlt` library with Snowflake dependencies, run:** ```sh -pip install dlt[snowflake] +pip install "dlt[snowflake]" ``` ## Setup Guide From f6f583c65ef32dfa6bc4741a0db4e5e0a2b096b3 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Fri, 24 May 2024 11:28:06 +0200 Subject: [PATCH 07/15] Update rest_api.md fix auth methods table --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index d5d29344de..54edac5062 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -414,8 +414,8 @@ Available authentication types: | Authentication class | String Alias (`type`) | Description | | ------------------- | ----------- | ----------- | | [BearTokenAuth](../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | -| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `api_key` | Basic HTTP authentication. | -| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `http_basic` | API key authentication with key defined in the query parameters or in the headers. | +| [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `http_basic` | Basic HTTP authentication. | +| [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `api_key` | API key authentication with key defined in the query parameters or in the headers. | To specify the authentication configuration, use the `auth` field in the [client](#client) configuration: From 7c07c674b67c6fac86e0c1f4f1d2b00ebbe7e655 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Fri, 24 May 2024 12:53:36 +0200 Subject: [PATCH 08/15] add typing classifier (#1391) update maintainers --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4bd62ce03b..1a946f5e10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "dlt" version = "0.4.11" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] -maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] +maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] readme = "README.md" license = "Apache-2.0" homepage = "https://github.com/dlt-hub" @@ -13,6 +13,7 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Topic :: Software Development :: Libraries", + "Typing :: Typed", "Operating System :: MacOS :: MacOS X", "Operating System :: POSIX :: Linux", "Operating System :: Microsoft :: Windows",] From ceb229d6bbfcb6cd52545e3af800e8b720fd8c1f Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 24 May 2024 14:19:01 +0200 Subject: [PATCH 09/15] RESTClient: implement AuthConfigBase.__bool__ + update docs (#1398) * Fix AuthConfigBase so its instances always evaluate to True in bool context; change docs to suggest direct inheritance from AuthBase * Add tests --- dlt/sources/helpers/rest_client/auth.py | 6 ++- .../docs/general-usage/http/rest-client.md | 8 ++-- .../helpers/rest_client/test_client.py | 43 ++++++++++++++++++- 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 37c0de3db1..020c63a195 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -38,7 +38,11 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - pass + def __bool__(self) -> bool: + # This is needed to avoid AuthConfigBase-derived classes + # which do not implement CredentialsConfiguration interface + # to be evaluated as False in requests.sessions.Session.prepare_request() + return True @configspec diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 19cc95bf78..1093428b0f 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) -For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class. +For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. 
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: ```py -from dlt.sources.helpers.rest_client.auth import AuthConfigBase +from requests.auth import AuthBase -class CustomAuth(AuthConfigBase): +class CustomAuth(AuthBase): def __init__(self, token): self.token = token diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 50defa8edb..e03879b417 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,6 +1,7 @@ import os import pytest from typing import Any, cast +from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue from dlt.sources.helpers.requests import Response, Request from dlt.sources.helpers.rest_client import RESTClient @@ -57,7 +58,6 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), - auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -183,3 +183,44 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) + + def test_custom_auth_success(self, rest_client: RESTClient): + class CustomAuthConfigBase(AuthConfigBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: Request) -> Request: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + class CustomAuthAuthBase(AuthBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: Request) -> Request: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + auth_list = [ + CustomAuthConfigBase("test-token"), + CustomAuthAuthBase("test-token"), + ] + + for auth in auth_list: + response = rest_client.get( + "/protected/posts/bearer-token", + auth=auth, + ) + + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + pages_iter = rest_client.paginate( + "/protected/posts/bearer-token", + auth=auth, + ) + + pages_list = list(pages_iter) + assert_pagination(pages_list) + + assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token" From 841c7b4634006276a8a339e9579c747e86d502b2 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 24 May 2024 18:10:27 +0200 Subject: [PATCH 10/15] Mention JSONPath in resolve docs (#1409) --- docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 54edac5062..98725627b9 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -501,11 +501,13 @@ The syntax for the `resolve` field in parameter configuration is: "": { "type": "resolve", "resource": "", - "field": "", + "field": "", } } ``` +The `field` value can be specified as a [JSONPath](https://github.com/h2non/jsonpath-ng?tab=readme-ov-file#jsonpath-syntax) to select a nested field in the 
parent resource data. For example: `"field": "items[0].id"`. + Under the hood, dlt handles this by using a [transformer resource](../../general-usage/resource.md#process-resources-with-dlttransformer). #### Include fields from the parent resource From 993ac37a0b6d862543e3a1c4b3a0c0d8793cdd1e Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sat, 25 May 2024 11:19:53 +0200 Subject: [PATCH 11/15] Revert "RESTClient: implement AuthConfigBase.__bool__ + update docs (#1398)" (#1412) This reverts commit ceb229d6bbfcb6cd52545e3af800e8b720fd8c1f. --- dlt/sources/helpers/rest_client/auth.py | 6 +-- .../docs/general-usage/http/rest-client.md | 8 ++-- .../helpers/rest_client/test_client.py | 43 +------------------ 3 files changed, 6 insertions(+), 51 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 020c63a195..37c0de3db1 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -38,11 +38,7 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - def __bool__(self) -> bool: - # This is needed to avoid AuthConfigBase-derived classes - # which do not implement CredentialsConfiguration interface - # to be evaluated as False in requests.sessions.Session.prepare_request() - return True + pass @configspec diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 1093428b0f..19cc95bf78 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) -For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. +For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class. 
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: ```py -from requests.auth import AuthBase +from dlt.sources.helpers.rest_client.auth import AuthConfigBase -class CustomAuth(AuthBase): +class CustomAuth(AuthConfigBase): def __init__(self, token): self.token = token diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index e03879b417..50defa8edb 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,7 +1,6 @@ import os import pytest from typing import Any, cast -from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue from dlt.sources.helpers.requests import Response, Request from dlt.sources.helpers.rest_client import RESTClient @@ -58,6 +57,7 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), + auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -183,44 +183,3 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) - - def test_custom_auth_success(self, rest_client: RESTClient): - class CustomAuthConfigBase(AuthConfigBase): - def __init__(self, token: str): - self.token = token - - def __call__(self, request: Request) -> Request: - request.headers["Authorization"] = f"Bearer {self.token}" - return request - - class CustomAuthAuthBase(AuthBase): - def __init__(self, token: str): - self.token = token - - def __call__(self, request: Request) -> Request: - request.headers["Authorization"] = f"Bearer {self.token}" - return request - - auth_list = [ - CustomAuthConfigBase("test-token"), - CustomAuthAuthBase("test-token"), - ] - - for auth in auth_list: - response = rest_client.get( - "/protected/posts/bearer-token", - auth=auth, - ) - - assert response.status_code == 200 - assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} - - pages_iter = rest_client.paginate( - "/protected/posts/bearer-token", - auth=auth, - ) - - pages_list = list(pages_iter) - assert_pagination(pages_list) - - assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token" From 5d2d1ecb35f01dd63d1fe4d26c6efe47945ef4cd Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Sat, 25 May 2024 18:04:36 +0530 Subject: [PATCH 12/15] updated installation command in destination docs and a few others (#1410) --- docs/website/docs/dlt-ecosystem/destinations/athena.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/bigquery.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/clickhouse.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/databricks.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/dremio.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/duckdb.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/filesystem.md | 6 +++--- docs/website/docs/dlt-ecosystem/destinations/motherduck.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/mssql.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/postgres.md | 2 +- 
docs/website/docs/dlt-ecosystem/destinations/qdrant.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/redshift.md | 4 ++-- docs/website/docs/dlt-ecosystem/destinations/synapse.md | 2 +- docs/website/docs/dlt-ecosystem/destinations/weaviate.md | 2 +- docs/website/docs/dlt-ecosystem/file-formats/parquet.md | 2 +- .../data-enrichments/currency_conversion_data_enrichment.md | 2 +- .../data-enrichments/url-parser-data-enrichment.md | 2 +- .../data-enrichments/user_agent_device_data_enrichment.md | 2 +- 18 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 76491578fe..7c907664d3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -11,7 +11,7 @@ The Athena destination stores data as Parquet files in S3 buckets and creates [e ## Install dlt with Athena **To install the dlt library with Athena dependencies:** ```sh -pip install dlt[athena] +pip install "dlt[athena]" ``` ## Setup Guide @@ -30,7 +30,7 @@ First, install dependencies by running: ```sh pip install -r requirements.txt ``` -or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. +or with `pip install "dlt[athena]"`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages. :::caution diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 54d5abae6d..4f99901e37 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -11,7 +11,7 @@ keywords: [bigquery, destination, data warehouse] **To install the dlt library with BigQuery dependencies:** ```sh -pip install dlt[bigquery] +pip install "dlt[bigquery]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index ea187e54eb..58551751c5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -11,7 +11,7 @@ keywords: [ clickhouse, destination, data warehouse ] **To install the DLT library with ClickHouse dependencies:** ```sh -pip install dlt[clickhouse] +pip install "dlt[clickhouse]" ``` ## Setup Guide @@ -33,7 +33,7 @@ requirements file by executing it as follows: pip install -r requirements.txt ``` -or with `pip install dlt[clickhouse]`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination. +or with `pip install "dlt[clickhouse]"`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination. ### 2. 
Setup ClickHouse database diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index b601809935..6cd5767dcb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -12,7 +12,7 @@ keywords: [Databricks, destination, data warehouse] ## Install dlt with Databricks **To install the dlt library with Databricks dependencies:** ```sh -pip install dlt[databricks] +pip install "dlt[databricks]" ``` ## Set up your Databricks workspace diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md index 0be01e8e32..546f470938 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md +++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md @@ -9,7 +9,7 @@ keywords: [dremio, iceberg, aws, glue catalog] ## Install dlt with Dremio **To install the dlt library with Dremio and s3 dependencies:** ```sh -pip install dlt[dremio,s3] +pip install "dlt[dremio,s3]" ``` ## Setup Guide @@ -28,7 +28,7 @@ First install dependencies by running: ```sh pip install -r requirements.txt ``` -or with `pip install dlt[dremio,s3]` which will install `s3fs`, `pyarrow`, and `botocore` packages. +or with `pip install "dlt[dremio,s3]"` which will install `s3fs`, `pyarrow`, and `botocore` packages. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You will need to provide a `bucket_url` which holds the uploaded parquet files. diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 22c5fd1df9..c2f6786f8d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -9,7 +9,7 @@ keywords: [duckdb, destination, data warehouse] ## Install dlt with DuckDB **To install the dlt library with DuckDB dependencies, run:** ```sh -pip install dlt[duckdb] +pip install "dlt[duckdb]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 0d719b4cfa..4c62e172d8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -6,7 +6,7 @@ The Filesystem destination stores data in remote file systems and bucket storage ## Install dlt with filesystem **To install the dlt library with filesystem dependencies:** ```sh -pip install dlt[filesystem] +pip install "dlt[filesystem]" ``` This installs `s3fs` and `botocore` packages. @@ -125,7 +125,7 @@ client_kwargs = '{"verify": "public.crt"}' ``` #### Google Storage -Run `pip install dlt[gs]` which will install the `gcfs` package. +Run `pip install "dlt[gs]"` which will install the `gcfs` package. To edit the `dlt` credentials file with your secret info, open `.dlt/secrets.toml`. You'll see AWS credentials by default. @@ -148,7 +148,7 @@ if you have default google cloud credentials in your environment (i.e. on cloud Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Object Admin** role to your service account. #### Azure Blob Storage -Run `pip install dlt[az]` which will install the `adlfs` package to interface with Azure Blob Storage. +Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. 
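The quoting added throughout this patch is not cosmetic: in shells such as zsh, an unquoted `dlt[az]` is treated as a glob pattern, so the command fails before `pip` even runs. A quick illustration, assuming zsh:

```sh
pip install dlt[az]    # zsh: no matches found: dlt[az]
pip install "dlt[az]"  # quoting disables globbing; works in every common shell
```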
Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials: diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index b053d29ac1..9d8c8d260b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -10,7 +10,7 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] ## Install dlt with MotherDuck **To install the dlt library with MotherDuck dependencies:** ```sh -pip install dlt[motherduck] +pip install "dlt[motherduck]" ``` :::tip diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 3e5b209aaa..1cba484f10 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -9,7 +9,7 @@ keywords: [mssql, sqlserver, destination, data warehouse] ## Install dlt with MS SQL **To install the dlt library with MS SQL dependencies, use:** ```sh -pip install dlt[mssql] +pip install "dlt[mssql]" ``` ## Setup guide @@ -38,7 +38,7 @@ pip install -r requirements.txt ``` or run: ```sh -pip install dlt[mssql] +pip install "dlt[mssql]" ``` This will install `dlt` with the `mssql` extra, which contains all the dependencies required by the SQL server client. diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index 5126272e37..ae504728c3 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -9,7 +9,7 @@ keywords: [postgres, destination, data warehouse] ## Install dlt with PostgreSQL **To install the dlt library with PostgreSQL dependencies, run:** ```sh -pip install dlt[postgres] +pip install "dlt[postgres]" ``` ## Setup Guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 1b560ad6fe..9f19007227 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -14,7 +14,7 @@ This destination helps you load data into Qdrant from [dlt resources](../../gene 1. To use Qdrant as a destination, make sure `dlt` is installed with the `qdrant` extra: ```sh -pip install dlt[qdrant] +pip install "dlt[qdrant]" ``` 2. Next, configure the destination in the dlt secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file: diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index 349698d201..7e0679ec6b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -9,7 +9,7 @@ keywords: [redshift, destination, data warehouse] ## Install dlt with Redshift **To install the dlt library with Redshift dependencies:** ```sh -pip install dlt[redshift] +pip install "dlt[redshift]" ``` ## Setup Guide @@ -26,7 +26,7 @@ The above command generates several files and directories, including `.dlt/secre ```sh pip install -r requirements.txt ``` -or with `pip install dlt[redshift]`, which installs the `dlt` library and the necessary dependencies for working with Amazon Redshift as a destination. 
+or with `pip install "dlt[redshift]"`, which installs the `dlt` library and the necessary dependencies for working with Amazon Redshift as a destination. ### 2. Setup Redshift cluster To load data into Redshift, you need to create a Redshift cluster and enable access to your IP address through the VPC inbound rules associated with the cluster. While we recommend asking our GPT-4 assistant for details, we have provided a general outline of the process below: diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index f1c43b4d54..2e936f193e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -9,7 +9,7 @@ keywords: [synapse, destination, data warehouse] ## Install dlt with Synapse **To install the dlt library with Synapse dependencies:** ```sh -pip install dlt[synapse] +pip install "dlt[synapse]" ``` ## Setup guide diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 1272b16c86..11d1276ceb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -14,7 +14,7 @@ This destination helps you load data into Weaviate from [dlt resources](../../ge 1. To use Weaviate as a destination, make sure dlt is installed with the 'weaviate' extra: ```sh -pip install dlt[weaviate] +pip install "dlt[weaviate]" ``` 2. Next, configure the destination in the dlt secrets file. The file is located at `~/.dlt/secrets.toml` by default. Add the following section to the secrets file: diff --git a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md index 8944b7d5fa..414eaf2cb8 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/parquet.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/parquet.md @@ -11,7 +11,7 @@ keywords: [parquet, file formats] To use this format, you need a `pyarrow` package. You can get this package as a `dlt` extra as well: ```sh -pip install dlt[parquet] +pip install "dlt[parquet]" ``` ## Supported Destinations diff --git a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md index f8bd179422..82297420ed 100644 --- a/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/currency_conversion_data_enrichment.md @@ -247,7 +247,7 @@ API token. [destination](../../dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md index ab71d3d1d0..f2cd4a1065 100644 --- a/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/url-parser-data-enrichment.md @@ -231,7 +231,7 @@ need to register to use this service neither get an API key. [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. 
Run the pipeline with the following command: diff --git a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md index 3aadb2f982..2448d31a06 100644 --- a/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md +++ b/docs/website/docs/general-usage/data-enrichments/user_agent_device_data_enrichment.md @@ -284,7 +284,7 @@ The first step is to register on [SerpAPI](https://serpapi.com/) and obtain the [destination](https://dlthub.com/docs/dlt-ecosystem/destinations/), For example, duckdb: ```sh - pip install dlt[duckdb] + pip install "dlt[duckdb]" ``` 1. Run the pipeline with the following command: From 67ccacd81708d2775d453e97c4f80a40b179eaba Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sat, 25 May 2024 15:47:51 +0200 Subject: [PATCH 13/15] Feat/1406 bumps duckdb 0.10 (#1407) * bumps duckdb to 0.10.x * updates dbt support to 1.8 --- dlt/helpers/dbt/__init__.py | 2 +- dlt/helpers/dbt/dbt_utils.py | 12 -- .../custom_destination_bigquery.py | 4 +- poetry.lock | 150 ++++++------------ pyproject.toml | 10 +- .../dbt_tests/test_runner_dbt_versions.py | 11 +- 6 files changed, 65 insertions(+), 124 deletions(-) diff --git a/dlt/helpers/dbt/__init__.py b/dlt/helpers/dbt/__init__.py index b555bcd3a9..4801dcd6b9 100644 --- a/dlt/helpers/dbt/__init__.py +++ b/dlt/helpers/dbt/__init__.py @@ -11,7 +11,7 @@ from dlt.helpers.dbt.runner import create_runner, DBTPackageRunner -DEFAULT_DBT_VERSION = ">=1.1,<1.6" +DEFAULT_DBT_VERSION = ">=1.5,<1.9" # a map of destination names to dbt package names in case they don't match the pure destination name DBT_DESTINATION_MAP = { diff --git a/dlt/helpers/dbt/dbt_utils.py b/dlt/helpers/dbt/dbt_utils.py index bf14504eaa..80774d9858 100644 --- a/dlt/helpers/dbt/dbt_utils.py +++ b/dlt/helpers/dbt/dbt_utils.py @@ -24,7 +24,6 @@ # https://stackoverflow.com/questions/48619517/call-a-click-command-from-code import dbt.logger - from dbt.events import functions from dbt.contracts import results as dbt_results except ModuleNotFoundError: raise MissingDependencyException("DBT Core", ["dbt-core"]) @@ -56,17 +55,6 @@ def set_path_wrapper(self: dbt.logger.LogManager, path: str) -> None: self._file_handler.set_path(path) _DBT_LOGGER_INITIALIZED = True - # def setup_event_logger_wrapper(log_path: str, level_override:str = None) -> None: - # global _DBT_LOGGER_INITIALIZED - - # if not _DBT_LOGGER_INITIALIZED: - # functions.setup_event_logger(log_path, level.lower()) - # # force log level as file is debug only - # # functions.this.FILE_LOG.setLevel(level) - # # functions.this.FILE_LOG.handlers[0].setLevel(level) - # _DBT_LOGGER_INITIALIZED = True - - # dbt.main.setup_event_logger = setup_event_logger_wrapper dbt.logger.LogManager.set_path = set_path_wrapper # type: ignore globs = [] diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index e890469263..380912a9a7 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -60,7 +60,9 @@ def resource(url: str): # dlt bigquery custom destination # we can use the dlt provided credentials class # to retrieve the gcp credentials from the secrets -@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case") +@dlt.destination( + 
name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case" +) def bigquery_insert( items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value ) -> None: diff --git a/poetry.lock b/poetry.lock index dcab5e1730..6159f751c4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2548,106 +2548,58 @@ dates = ["pytz (>=2019.1)"] [[package]] name = "duckdb" -version = "0.9.2" -description = "DuckDB embedded database" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "duckdb-0.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aadcea5160c586704c03a8a796c06a8afffbefefb1986601104a60cb0bfdb5ab"}, - {file = "duckdb-0.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:08215f17147ed83cbec972175d9882387366de2ed36c21cbe4add04b39a5bcb4"}, - {file = "duckdb-0.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ee6c2a8aba6850abef5e1be9dbc04b8e72a5b2c2b67f77892317a21fae868fe7"}, - {file = "duckdb-0.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ff49f3da9399900fd58b5acd0bb8bfad22c5147584ad2427a78d937e11ec9d0"}, - {file = "duckdb-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd5ac5baf8597efd2bfa75f984654afcabcd698342d59b0e265a0bc6f267b3f0"}, - {file = "duckdb-0.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81c6df905589a1023a27e9712edb5b724566587ef280a0c66a7ec07c8083623b"}, - {file = "duckdb-0.9.2-cp310-cp310-win32.whl", hash = "sha256:a298cd1d821c81d0dec8a60878c4b38c1adea04a9675fb6306c8f9083bbf314d"}, - {file = "duckdb-0.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:492a69cd60b6cb4f671b51893884cdc5efc4c3b2eb76057a007d2a2295427173"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:061a9ea809811d6e3025c5de31bc40e0302cfb08c08feefa574a6491e882e7e8"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a43f93be768af39f604b7b9b48891f9177c9282a408051209101ff80f7450d8f"}, - {file = "duckdb-0.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ac29c8c8f56fff5a681f7bf61711ccb9325c5329e64f23cb7ff31781d7b50773"}, - {file = "duckdb-0.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b14d98d26bab139114f62ade81350a5342f60a168d94b27ed2c706838f949eda"}, - {file = "duckdb-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:796a995299878913e765b28cc2b14c8e44fae2f54ab41a9ee668c18449f5f833"}, - {file = "duckdb-0.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6cb64ccfb72c11ec9c41b3cb6181b6fd33deccceda530e94e1c362af5f810ba1"}, - {file = "duckdb-0.9.2-cp311-cp311-win32.whl", hash = "sha256:930740cb7b2cd9e79946e1d3a8f66e15dc5849d4eaeff75c8788d0983b9256a5"}, - {file = "duckdb-0.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:c28f13c45006fd525001b2011cdf91fa216530e9751779651e66edc0e446be50"}, - {file = "duckdb-0.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fbce7bbcb4ba7d99fcec84cec08db40bc0dd9342c6c11930ce708817741faeeb"}, - {file = "duckdb-0.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15a82109a9e69b1891f0999749f9e3265f550032470f51432f944a37cfdc908b"}, - {file = "duckdb-0.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9490fb9a35eb74af40db5569d90df8a04a6f09ed9a8c9caa024998c40e2506aa"}, - {file = "duckdb-0.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:696d5c6dee86c1a491ea15b74aafe34ad2b62dcd46ad7e03b1d00111ca1a8c68"}, - {file = 
"duckdb-0.9.2-cp37-cp37m-win32.whl", hash = "sha256:4f0935300bdf8b7631ddfc838f36a858c1323696d8c8a2cecbd416bddf6b0631"}, - {file = "duckdb-0.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:0aab900f7510e4d2613263865570203ddfa2631858c7eb8cbed091af6ceb597f"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7d8130ed6a0c9421b135d0743705ea95b9a745852977717504e45722c112bf7a"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:974e5de0294f88a1a837378f1f83330395801e9246f4e88ed3bfc8ada65dcbee"}, - {file = "duckdb-0.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4fbc297b602ef17e579bb3190c94d19c5002422b55814421a0fc11299c0c1100"}, - {file = "duckdb-0.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1dd58a0d84a424924a35b3772419f8cd78a01c626be3147e4934d7a035a8ad68"}, - {file = "duckdb-0.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11a1194a582c80dfb57565daa06141727e415ff5d17e022dc5f31888a5423d33"}, - {file = "duckdb-0.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:be45d08541002a9338e568dca67ab4f20c0277f8f58a73dfc1435c5b4297c996"}, - {file = "duckdb-0.9.2-cp38-cp38-win32.whl", hash = "sha256:dd6f88aeb7fc0bfecaca633629ff5c986ac966fe3b7dcec0b2c48632fd550ba2"}, - {file = "duckdb-0.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:28100c4a6a04e69aa0f4a6670a6d3d67a65f0337246a0c1a429f3f28f3c40b9a"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ae5bf0b6ad4278e46e933e51473b86b4b932dbc54ff097610e5b482dd125552"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e5d0bb845a80aa48ed1fd1d2d285dd352e96dc97f8efced2a7429437ccd1fe1f"}, - {file = "duckdb-0.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ce262d74a52500d10888110dfd6715989926ec936918c232dcbaddb78fc55b4"}, - {file = "duckdb-0.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6935240da090a7f7d2666f6d0a5e45ff85715244171ca4e6576060a7f4a1200e"}, - {file = "duckdb-0.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5cfb93e73911696a98b9479299d19cfbc21dd05bb7ab11a923a903f86b4d06e"}, - {file = "duckdb-0.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64e3bc01751f31e7572d2716c3e8da8fe785f1cdc5be329100818d223002213f"}, - {file = "duckdb-0.9.2-cp39-cp39-win32.whl", hash = "sha256:6e5b80f46487636368e31b61461940e3999986359a78660a50dfdd17dd72017c"}, - {file = "duckdb-0.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:e6142a220180dbeea4f341708bd5f9501c5c962ce7ef47c1cadf5e8810b4cb13"}, - {file = "duckdb-0.9.2.tar.gz", hash = "sha256:3843afeab7c3fc4a4c0b53686a4cc1d9cdbdadcbb468d60fef910355ecafd447"}, -] - -[[package]] -name = "duckdb" -version = "0.10.0" +version = "0.10.3" description = "DuckDB in-process database" optional = false python-versions = ">=3.7.0" files = [ - {file = "duckdb-0.10.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bd0ffb3fddef0f72a150e4d76e10942a84a1a0447d10907df1621b90d6668060"}, - {file = "duckdb-0.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f3d709d5c7c1a12b5e10d0b05fa916c670cd2b50178e3696faa0cc16048a1745"}, - {file = "duckdb-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9114aa22ec5d591a20ce5184be90f49d8e5b5348ceaab21e102c54560d07a5f8"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a37877efadf39caf7cadde0f430fedf762751b9c54750c821e2f1316705a21"}, - {file = 
"duckdb-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87cbc9e1d9c3fc9f14307bea757f99f15f46843c0ab13a6061354410824ed41f"}, - {file = "duckdb-0.10.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0bfec79fed387201550517d325dff4fad2705020bc139d936cab08b9e845662"}, - {file = "duckdb-0.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c5622134d2d9796b15e09de810e450859d4beb46d9b861357ec9ae40a61b775c"}, - {file = "duckdb-0.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:089ee8e831ccaef1b73fc89c43b661567175eed0115454880bafed5e35cda702"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a05af63747f1d7021995f0811c333dee7316cec3b06c0d3e4741b9bdb678dd21"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:072d6eba5d8a59e0069a8b5b4252fed8a21f9fe3f85a9129d186a39b3d0aea03"}, - {file = "duckdb-0.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a77b85668f59b919042832e4659538337f1c7f197123076c5311f1c9cf077df7"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96a666f1d2da65d03199a977aec246920920a5ea1da76b70ae02bd4fb1ffc48c"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ec76a4262b783628d26612d184834852d9c92fb203e91af789100c17e3d7173"}, - {file = "duckdb-0.10.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:009dd9d2cdbd3b061a9efbdfc79f2d1a8377bcf49f1e5f430138621f8c083a6c"}, - {file = "duckdb-0.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:878f06766088090dad4a2e5ee0081555242b2e8dcb29415ecc97e388cf0cf8d8"}, - {file = "duckdb-0.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:713ff0a1fb63a6d60f454acf67f31656549fb5d63f21ac68314e4f522daa1a89"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9c0ee450dfedfb52dd4957244e31820feef17228da31af6d052979450a80fd19"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ff79b2ea9994398b545c0d10601cd73565fbd09f8951b3d8003c7c5c0cebc7cb"}, - {file = "duckdb-0.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6bdf1aa71b924ef651062e6b8ff9981ad85bec89598294af8a072062c5717340"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0265bbc8216be3ced7b377ba8847128a3fc0ef99798a3c4557c1b88e3a01c23"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d418a315a07707a693bd985274c0f8c4dd77015d9ef5d8d3da4cc1942fd82e0"}, - {file = "duckdb-0.10.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2828475a292e68c71855190b818aded6bce7328f79e38c04a0c75f8f1c0ceef0"}, - {file = "duckdb-0.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c3aaeaae2eba97035c65f31ffdb18202c951337bf2b3d53d77ce1da8ae2ecf51"}, - {file = "duckdb-0.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:c51790aaaea97d8e4a58a114c371ed8d2c4e1ca7cbf29e3bdab6d8ccfc5afc1e"}, - {file = "duckdb-0.10.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8af1ae7cc77a12206b6c47ade191882cc8f49f750bb3e72bb86ac1d4fa89926a"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa4f7e8e8dc0e376aeb280b83f2584d0e25ec38985c27d19f3107b2edc4f4a97"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:28ae942a79fad913defa912b56483cd7827a4e7721f4ce4bc9025b746ecb3c89"}, - {file = "duckdb-0.10.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01b57802898091455ca2a32c1335aac1e398da77c99e8a96a1e5de09f6a0add9"}, - {file = "duckdb-0.10.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:52e1ad4a55fa153d320c367046b9500578192e01c6d04308ba8b540441736f2c"}, - {file = "duckdb-0.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:904c47d04095af745e989c853f0bfc0776913dfc40dfbd2da7afdbbb5f67fed0"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:184ae7ea5874f3b8fa51ab0f1519bdd088a0b78c32080ee272b1d137e2c8fd9c"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bd33982ecc9bac727a032d6cedced9f19033cbad56647147408891eb51a6cb37"}, - {file = "duckdb-0.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f59bf0949899105dd5f8864cb48139bfb78454a8c017b8258ba2b5e90acf7afc"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:395f3b18948001e35dceb48a4423d574e38656606d033eef375408b539e7b076"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b8eb2b803be7ee1df70435c33b03a4598cdaf676cd67ad782b288dcff65d781"}, - {file = "duckdb-0.10.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:31b2ddd331801064326c8e3587a4db8a31d02aef11332c168f45b3bd92effb41"}, - {file = "duckdb-0.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c8b89e76a041424b8c2026c5dc1f74b53fbbc6c6f650d563259885ab2e7d093d"}, - {file = "duckdb-0.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:79084a82f16c0a54f6bfb7ded5600400c2daa90eb0d83337d81a56924eaee5d4"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:79799b3a270dcd9070f677ba510f1e66b112df3068425691bac97c5e278929c7"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8fc394bfe3434920cdbcfbdd0ac3ba40902faa1dbda088db0ba44003a45318a"}, - {file = "duckdb-0.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c116605551b4abf5786243a59bcef02bd69cc51837d0c57cafaa68cdc428aa0c"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3191170c3b0a43b0c12644800326f5afdea00d5a4621d59dbbd0c1059139e140"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fee69a50eb93c72dc77e7ab1fabe0c38d21a52c5da44a86aa217081e38f9f1bd"}, - {file = "duckdb-0.10.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c5f449e87dacb16b0d145dbe65fa6fdb5a55b2b6911a46d74876e445dd395bac"}, - {file = "duckdb-0.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4487d0df221b17ea4177ad08131bc606b35f25cfadf890987833055b9d10cdf6"}, - {file = "duckdb-0.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:c099ae2ff8fe939fda62da81704f91e2f92ac45e48dc0e37c679c9d243d01e65"}, - {file = "duckdb-0.10.0.tar.gz", hash = "sha256:c02bcc128002aa79e3c9d89b9de25e062d1096a8793bc0d7932317b7977f6845"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, + {file = "duckdb-0.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b48f5f1542f1e4b184e6b4fc188f497be8b9c48127867e7d9a5f4a3e334f88b0"}, + {file = 
"duckdb-0.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e327f7a3951ea154bb56e3fef7da889e790bd9a67ca3c36afc1beb17d3feb6d6"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8b20ed67da004b4481973f4254fd79a0e5af957d2382eac8624b5c527ec48c"}, + {file = "duckdb-0.10.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d37680b8d7be04e4709db3a66c8b3eb7ceba2a5276574903528632f2b2cc2e60"}, + {file = "duckdb-0.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d34b86d6a2a6dfe8bb757f90bfe7101a3bd9e3022bf19dbddfa4b32680d26a9"}, + {file = "duckdb-0.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:73b1cb283ca0f6576dc18183fd315b4e487a545667ffebbf50b08eb4e8cdc143"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d917dde19fcec8cadcbef1f23946e85dee626ddc133e1e3f6551f15a61a03c61"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46757e0cf5f44b4cb820c48a34f339a9ccf83b43d525d44947273a585a4ed822"}, + {file = "duckdb-0.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:338c14d8ac53ac4aa9ec03b6f1325ecfe609ceeb72565124d489cb07f8a1e4eb"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:651fcb429602b79a3cf76b662a39e93e9c3e6650f7018258f4af344c816dab72"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3ae3c73b98b6215dab93cc9bc936b94aed55b53c34ba01dec863c5cab9f8e25"}, + {file = "duckdb-0.10.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56429b2cfe70e367fb818c2be19f59ce2f6b080c8382c4d10b4f90ba81f774e9"}, + {file = "duckdb-0.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b46c02c2e39e3676b1bb0dc7720b8aa953734de4fd1b762e6d7375fbeb1b63af"}, + {file = "duckdb-0.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:bcd460feef56575af2c2443d7394d405a164c409e9794a4d94cb5fdaa24a0ba4"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:e229a7c6361afbb0d0ab29b1b398c10921263c52957aefe3ace99b0426fdb91e"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:732b1d3b6b17bf2f32ea696b9afc9e033493c5a3b783c292ca4b0ee7cc7b0e66"}, + {file = "duckdb-0.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5380d4db11fec5021389fb85d614680dc12757ef7c5881262742250e0b58c75"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:468a4e0c0b13c55f84972b1110060d1b0f854ffeb5900a178a775259ec1562db"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa1e7ff8d18d71defa84e79f5c86aa25d3be80d7cb7bc259a322de6d7cc72da"}, + {file = "duckdb-0.10.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed1063ed97c02e9cf2e7fd1d280de2d1e243d72268330f45344c69c7ce438a01"}, + {file = "duckdb-0.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:22f2aad5bb49c007f3bfcd3e81fdedbc16a2ae41f2915fc278724ca494128b0c"}, + {file = "duckdb-0.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:8f9e2bb00a048eb70b73a494bdc868ce7549b342f7ffec88192a78e5a4e164bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6c2fc49875b4b54e882d68703083ca6f84b27536d57d623fc872e2f502b1078"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a66c125d0c30af210f7ee599e7821c3d1a7e09208196dafbf997d4e0cfcb81ab"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99dd7a1d901149c7a276440d6e737b2777e17d2046f5efb0c06ad3b8cb066a6"}, + {file = "duckdb-0.10.3-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5ec3bbdb209e6095d202202893763e26c17c88293b88ef986b619e6c8b6715bd"}, + {file = "duckdb-0.10.3-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:2b3dec4ef8ed355d7b7230b40950b30d0def2c387a2e8cd7efc80b9d14134ecf"}, + {file = "duckdb-0.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:04129f94fb49bba5eea22f941f0fb30337f069a04993048b59e2811f52d564bc"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d75d67024fc22c8edfd47747c8550fb3c34fb1cbcbfd567e94939ffd9c9e3ca7"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3796e9507c02d0ddbba2e84c994fae131da567ce3d9cbb4cbcd32fadc5fbb26"}, + {file = "duckdb-0.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:78e539d85ebd84e3e87ec44d28ad912ca4ca444fe705794e0de9be3dd5550c11"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a99b67ac674b4de32073e9bc604b9c2273d399325181ff50b436c6da17bf00a"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1209a354a763758c4017a1f6a9f9b154a83bed4458287af9f71d84664ddb86b6"}, + {file = "duckdb-0.10.3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3b735cea64aab39b67c136ab3a571dbf834067f8472ba2f8bf0341bc91bea820"}, + {file = "duckdb-0.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:816ffb9f758ed98eb02199d9321d592d7a32a6cb6aa31930f4337eb22cfc64e2"}, + {file = "duckdb-0.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:1631184b94c3dc38b13bce4045bf3ae7e1b0ecbfbb8771eb8d751d8ffe1b59b3"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb98c35fc8dd65043bc08a2414dd9f59c680d7e8656295b8969f3f2061f26c52"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7e75c9f5b6a92b2a6816605c001d30790f6d67ce627a2b848d4d6040686efdf9"}, + {file = "duckdb-0.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae786eddf1c2fd003466e13393b9348a44b6061af6fe7bcb380a64cac24e7df7"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9387da7b7973707b0dea2588749660dd5dd724273222680e985a2dd36787668"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:538f943bf9fa8a3a7c4fafa05f21a69539d2c8a68e557233cbe9d989ae232899"}, + {file = "duckdb-0.10.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6930608f35025a73eb94252964f9f19dd68cf2aaa471da3982cf6694866cfa63"}, + {file = "duckdb-0.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:03bc54a9cde5490918aad82d7d2a34290e3dfb78d5b889c6626625c0f141272a"}, + {file = "duckdb-0.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:372b6e3901d85108cafe5df03c872dfb6f0dbff66165a0cf46c47246c1957aa0"}, + {file = "duckdb-0.10.3.tar.gz", hash = "sha256:c5bd84a92bc708d3a6adffe1f554b94c6e76c795826daaaf482afc3d9c636971"}, ] [[package]] @@ -9317,11 +9269,11 @@ clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyar databricks = ["databricks-sql-connector"] dbt = ["dbt-athena-community", "dbt-bigquery", "dbt-core", "dbt-databricks", "dbt-duckdb", "dbt-redshift", "dbt-snowflake"] dremio = ["pyarrow"] 
-duckdb = ["duckdb", "duckdb"] +duckdb = ["duckdb"] filesystem = ["botocore", "s3fs"] gcp = ["gcsfs", "google-cloud-bigquery", "grpcio"] gs = ["gcsfs"] -motherduck = ["duckdb", "duckdb", "pyarrow"] +motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] @@ -9335,4 +9287,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "c206bfd3eab8f0c9349398c3c0ed251490bab96254327cb800d45807f05d2997" +content-hash = "605b9b04ed3ae8b71c41eaf532d7bc8ce4f8135ef00593b5f01a82debc3e14c8" diff --git a/pyproject.toml b/pyproject.toml index 1a946f5e10..8beefe409f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,10 +58,12 @@ psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_i grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} pyarrow = {version = ">=12.0.0", optional = true} -duckdb = [ - {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, - {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} -] +duckdb = {version = ">=0.6.1,<0.11", optional = true} +# keep per-python version dependency as a reference +# duckdb = [ +# {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, +# {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} +# ] dbt-core = {version = ">=1.2.0", optional = true} dbt-redshift = {version = ">=1.2.0", optional = true} dbt-bigquery = {version = ">=1.2.0", optional = true} diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index 5b9b07fcc5..a82345d732 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -43,16 +43,13 @@ def client() -> Iterator[PostgresClient]: PACKAGE_PARAMS = [ - # ("postgres", "1.1.3"), - # ("postgres", "1.2.4"), - # ("postgres", "1.3.2"), - # ("postgres", "1.4.0"), ("postgres", "1.5.2"), ("postgres", "1.6.13"), + ("postgres", "1.8.1"), ("postgres", None), - # ("snowflake", "1.4.0"), ("snowflake", "1.5.2"), ("snowflake", "1.6.13"), + ("snowflake", "1.8.1"), ("snowflake", None), ] PACKAGE_IDS = [ @@ -82,10 +79,10 @@ def test_infer_venv_deps() -> None: # provide version ranges requirements = _create_dbt_deps(["duckdb"], dbt_version=">3") # special duckdb dependency - assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.9.2"] + assert requirements[:-1] == ["dbt-core>3", "dbt-duckdb", "duckdb==0.10.3"] # we do not validate version ranges, pip will do it and fail when creating venv requirements = _create_dbt_deps(["motherduck"], dbt_version="y") - assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.9.2"] + assert requirements[:-1] == ["dbt-corey", "dbt-duckdb", "duckdb==0.10.3"] def test_default_profile_name() -> None: From 3dc187475546e209be7d3025f4378d43e253a5bd Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Sat, 25 May 2024 11:28:12 -0400 Subject: [PATCH 14/15] Azure service principal credentials (#1377) * Support azure service principal credentials * Add sp credentials to docs * Test resolved fsspec instance * Update filesystem config test * Test connect with service credentials * configures destination credentials * fixes toml typo --------- Co-authored-by: Marcin Rudolf --- dlt/common/configuration/specs/__init__.py | 11 +- .../configuration/specs/azure_credentials.py | 50 +++++++- 
dlt/common/storages/configuration.py | 11 +- .../dlt-ecosystem/destinations/filesystem.md | 24 +++- .../load/filesystem/test_azure_credentials.py | 110 +++++++++++++++++- .../load/filesystem/test_filesystem_common.py | 10 +- 6 files changed, 191 insertions(+), 25 deletions(-) diff --git a/dlt/common/configuration/specs/__init__.py b/dlt/common/configuration/specs/__init__.py index 9acf14bde3..f1d7d819ff 100644 --- a/dlt/common/configuration/specs/__init__.py +++ b/dlt/common/configuration/specs/__init__.py @@ -20,7 +20,13 @@ from .connection_string_credentials import ConnectionStringCredentials from .api_credentials import OAuth2Credentials from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults -from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults +from .azure_credentials import ( + AzureCredentials, + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AnyAzureCredentials, +) # backward compatibility for service account credentials @@ -51,6 +57,9 @@ "AwsCredentialsWithoutDefaults", "AzureCredentials", "AzureCredentialsWithoutDefaults", + "AzureServicePrincipalCredentials", + "AzureServicePrincipalCredentialsWithoutDefaults", + "AnyAzureCredentials", "GcpClientCredentials", "GcpClientCredentialsWithDefault", ] diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 52d33ec0d3..8b8fc259f2 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Union from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue @@ -7,10 +7,6 @@ CredentialsWithDefault, configspec, ) -from dlt.common.configuration.specs.exceptions import InvalidBoto3Session -from dlt import version - -import fsspec @configspec @@ -50,6 +46,22 @@ def on_partial(self) -> None: self.resolve() +@configspec +class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration): + azure_storage_account_name: str = None + azure_tenant_id: str = None + azure_client_id: str = None + azure_client_secret: TSecretStrValue = None + + def to_adlfs_credentials(self) -> Dict[str, Any]: + return dict( + account_name=self.azure_storage_account_name, + tenant_id=self.azure_tenant_id, + client_id=self.azure_client_id, + client_secret=self.azure_client_secret, + ) + + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): def on_partial(self) -> None: @@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: if self.has_default_credentials(): base_kwargs["anon"] = False return base_kwargs + + +@configspec +class AzureServicePrincipalCredentials( + AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault +): + def on_partial(self) -> None: + from azure.identity import DefaultAzureCredential + + self._set_default_credentials(DefaultAzureCredential()) + if self.azure_storage_account_name: + self.resolve() + + def to_adlfs_credentials(self) -> Dict[str, Any]: + base_kwargs = super().to_adlfs_credentials() + if self.has_default_credentials(): + base_kwargs["anon"] = False + return base_kwargs + + +AnyAzureCredentials = Union[ + # Credentials without defaults come first because union types are attempted in order + # and explicit config should supersede system defaults + AzureCredentialsWithoutDefaults, + 
AzureServicePrincipalCredentialsWithoutDefaults,
+    AzureCredentials,
+    AzureServicePrincipalCredentials,
+]
diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py
index a1838fab6e..6e100536af 100644
--- a/dlt/common/storages/configuration.py
+++ b/dlt/common/storages/configuration.py
@@ -10,8 +10,7 @@
     GcpServiceAccountCredentials,
     AwsCredentials,
     GcpOAuthCredentials,
-    AzureCredentials,
-    AzureCredentialsWithoutDefaults,
+    AnyAzureCredentials,
     BaseConfiguration,
 )
 from dlt.common.typing import DictStrAny
@@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration):


 FileSystemCredentials = Union[
-    AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials
+    AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials
 ]


@@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration):
         "gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
         "gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
         "s3": AwsCredentials,
-        "az": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
-        "abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
-        "adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
+        "az": AnyAzureCredentials,
+        "abfs": AnyAzureCredentials,
+        "adl": AnyAzureCredentials,
     }

     bucket_url: str = None
diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
index 4c62e172d8..3e2e08013c 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -150,7 +150,13 @@ Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Ob
 #### Azure Blob Storage
 Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage.

-Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials:
+Edit the credentials in `.dlt/secrets.toml`; you'll see AWS credentials there by default. Replace them with your Azure credentials.
+
+Two forms of Azure credentials are supported:
+
+##### SAS token credentials
+
+Supply the storage account name and either a SAS token or a storage account key:

 ```toml
 [destination.filesystem]
@@ -168,6 +174,20 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure
 you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default.
 Note that `azure_storage_account_name` is still required as it can't be inferred from the environment.

+##### Service principal credentials
+
+Supply a client ID, a client secret, and a tenant ID for a service principal authorized to access your container:
+
+```toml
+[destination.filesystem]
+bucket_url = "az://[your_container name]" # replace with your container name
+
+[destination.filesystem.credentials]
+azure_client_id = "client_id" # please set me up!
+azure_client_secret = "client_secret"
+azure_tenant_id = "tenant_id" # please set me up!
+```
+
 #### Local file system
 If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required)

@@ -458,4 +478,4 @@ managed in the regular way by the final destination you have configured.

 You will also notice `init` files being present in the root folder and the special `dlt` folders.
In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. - \ No newline at end of file + diff --git a/tests/load/filesystem/test_azure_credentials.py b/tests/load/filesystem/test_azure_credentials.py index 467ba55a4f..4ee2ec46db 100644 --- a/tests/load/filesystem/test_azure_credentials.py +++ b/tests/load/filesystem/test_azure_credentials.py @@ -1,15 +1,24 @@ -from typing import Dict +from typing import Dict, Optional from urllib.parse import parse_qs +from uuid import uuid4 import pytest +import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException -from dlt.common.configuration.specs import AzureCredentials -from tests.load.utils import ALL_FILESYSTEM_DRIVERS +from dlt.common.configuration.specs import ( + AzureCredentials, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentialsWithoutDefaults, +) +from dlt.common.storages.configuration import FilesystemConfiguration +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET from tests.common.configuration.utils import environment from tests.utils import preserve_environ, autouse_test_storage +from dlt.common.storages.fsspec_filesystem import fsspec_from_config # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -18,6 +27,27 @@ pytest.skip("az filesystem driver not configured", allow_module_level=True) +@pytest.fixture +def az_service_principal_config() -> Optional[FilesystemConfiguration]: + """FS config with alternate azure credentials format if available in environment + + Working credentials of this type may be created as an app in Entra, which has + R/W/E access to the bucket (via ACL of particular container) + + """ + credentials = AzureServicePrincipalCredentialsWithoutDefaults( + azure_tenant_id=dlt.config.get("tests.az_sp_tenant_id", str), + azure_client_id=dlt.config.get("tests.az_sp_client_id", str), + azure_client_secret=dlt.config.get("tests.az_sp_client_secret", str), # type: ignore[arg-type] + azure_storage_account_name=dlt.config.get("tests.az_sp_storage_account_name", str), + ) + # + credentials = resolve_configuration(credentials, sections=("destination", "fsazureprincipal")) + cfg = FilesystemConfiguration(bucket_url=AZ_BUCKET, credentials=credentials) + + return resolve_configuration(cfg) + + def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None: environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" @@ -95,3 +125,77 @@ def test_azure_credentials_from_default(environment: Dict[str, str]) -> None: "sas_token": None, "anon": False, } + + +def test_azure_service_principal_credentials(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "fake_client_secret" + environment["CREDENTIALS__AZURE_TENANT_ID"] = "fake_tenant_id" + + config = resolve_configuration(AzureServicePrincipalCredentials()) + + assert config.azure_client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert config.azure_client_secret == 
environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + assert config.azure_tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + + assert config.to_adlfs_credentials() == { + "account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"], + "client_id": environment["CREDENTIALS__AZURE_CLIENT_ID"], + "client_secret": environment["CREDENTIALS__AZURE_CLIENT_SECRET"], + "tenant_id": environment["CREDENTIALS__AZURE_TENANT_ID"], + } + + +def test_azure_filesystem_configuration_service_principal(environment: Dict[str, str]) -> None: + """Filesystem config resolves correct credentials type""" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "asdsadas" + environment["CREDENTIALS__AZURE_TENANT_ID"] = str(uuid4()) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureServicePrincipalCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + assert fs.client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert fs.client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + + +def test_azure_filesystem_configuration_sas_token(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = ( + "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890" + ) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.sas_token == "?" 
+ environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] + assert fs.account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] + + +def test_azure_service_principal_fs_operations( + az_service_principal_config: Optional[FilesystemConfiguration], +) -> None: + """Test connecting to azure filesystem with service principal credentials""" + config = az_service_principal_config + fs, bucket = fsspec_from_config(config) + + fn = uuid4().hex + # Try some file ops to see if the credentials work + fs.touch(f"{bucket}/{fn}/{fn}") + files = fs.ls(f"{bucket}/{fn}") + assert f"{bucket}/{fn}/{fn}" in files + fs.delete(f"{bucket}/{fn}/{fn}") + fs.rmdir(f"{bucket}/{fn}") diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 3677765c9f..c069f88a15 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -12,10 +12,7 @@ from dlt.common import json, pendulum from dlt.common.configuration import resolve from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import ( - AzureCredentials, - AzureCredentialsWithoutDefaults, -) +from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id @@ -43,10 +40,7 @@ def test_filesystem_configuration() -> None: config = FilesystemConfiguration(bucket_url="az://root") assert config.protocol == "az" # print(config.resolve_credentials_type()) - assert ( - config.resolve_credentials_type() - == Union[AzureCredentialsWithoutDefaults, AzureCredentials] - ) + assert config.resolve_credentials_type() == AnyAzureCredentials assert dict(config) == { "read_only": False, "bucket_url": "az://root", From 4fcfa28da3891b4862f8ec466d33159cdde70f2e Mon Sep 17 00:00:00 2001 From: rudolfix Date: Mon, 27 May 2024 15:03:50 +0200 Subject: [PATCH 15/15] RESTClient: implement AuthConfigBase.__bool__ + update docs (#1413) * Fix AuthConfigBase so its instances always evaluate to True in bool context; change docs to suggest direct inheritance from AuthBase * Add tests * Fix formatting * uses AuthBase as auth type --------- Co-authored-by: Anton Burnashev --- dlt/sources/helpers/rest_client/auth.py | 6 ++- dlt/sources/helpers/rest_client/client.py | 14 +++--- .../docs/general-usage/http/rest-client.md | 8 ++-- .../helpers/rest_client/test_client.py | 46 ++++++++++++++++++- 4 files changed, 61 insertions(+), 13 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 37c0de3db1..020c63a195 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -38,7 +38,11 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration): configurable via env variables or toml files """ - pass + def __bool__(self) -> bool: + # This is needed to avoid AuthConfigBase-derived classes + # which do not implement CredentialsConfiguration interface + # to be evaluated as False in requests.sessions.Session.prepare_request() + return True @configspec diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index b4b62fa849..7d1145a890 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -6,12 +6,14 @@ Any, TypeVar, Iterable, + Union, cast, ) import copy from 
urllib.parse import urlparse

 from requests import Session as BaseSession  # noqa: I251
 from requests import Response, Request
+from requests.auth import AuthBase

 from dlt.common import jsonpath, logger

@@ -41,7 +43,7 @@ def __init__(
         request: Request,
         response: Response,
         paginator: BasePaginator,
-        auth: AuthConfigBase,
+        auth: AuthBase,
     ):
         super().__init__(__iterable)
         self.request = request
@@ -57,7 +59,7 @@ class RESTClient:
     Args:
         base_url (str): The base URL of the API to make requests to.
         headers (Optional[Dict[str, str]]): Default headers to include in all requests.
-        auth (Optional[AuthConfigBase]): Authentication configuration for all requests.
+        auth (Optional[AuthBase]): Authentication configuration for all requests.
         paginator (Optional[BasePaginator]): Default paginator for handling paginated responses.
         data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for extracting data from responses.
         session (BaseSession): HTTP session for making requests.
@@ -69,7 +71,7 @@ def __init__(
         self,
         base_url: str,
         headers: Optional[Dict[str, str]] = None,
-        auth: Optional[AuthConfigBase] = None,
+        auth: Optional[AuthBase] = None,
         paginator: Optional[BasePaginator] = None,
         data_selector: Optional[jsonpath.TJsonPath] = None,
         session: BaseSession = None,
@@ -105,7 +107,7 @@ def _create_request(
         method: HTTPMethod,
         params: Dict[str, Any],
         json: Optional[Dict[str, Any]] = None,
-        auth: Optional[AuthConfigBase] = None,
+        auth: Optional[AuthBase] = None,
         hooks: Optional[Hooks] = None,
     ) -> Request:
         parsed_url = urlparse(path)
@@ -154,7 +156,7 @@ def paginate(
         method: HTTPMethodBasic = "GET",
         params: Optional[Dict[str, Any]] = None,
         json: Optional[Dict[str, Any]] = None,
-        auth: Optional[AuthConfigBase] = None,
+        auth: Optional[AuthBase] = None,
         paginator: Optional[BasePaginator] = None,
         data_selector: Optional[jsonpath.TJsonPath] = None,
         hooks: Optional[Hooks] = None,
@@ -166,7 +168,7 @@ def paginate(
             method (HTTPMethodBasic): HTTP method for the request, defaults to 'get'.
             params (Optional[Dict[str, Any]]): URL parameters for the request.
             json (Optional[Dict[str, Any]]): JSON payload for the request.
-            auth (Optional[AuthConfigBase]): Authentication configuration for the request.
+            auth (Optional[AuthBase]): Authentication configuration for the request.
             paginator (Optional[BasePaginator]): Paginator instance for handling pagination logic.
             data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for
diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md
index 19cc95bf78..1093428b0f 100644
--- a/docs/website/docs/general-usage/http/rest-client.md
+++ b/docs/website/docs/general-usage/http/rest-client.md
@@ -407,7 +407,7 @@ The available authentication methods are defined in the `dlt.sources.helpers.res
 - [APIKeyAuth](#api-key-authentication)
 - [HttpBasicAuth](#http-basic-authentication)

-For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthConfigBase` class.
+For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library.
### Bearer token authentication @@ -479,12 +479,12 @@ response = client.get("/protected/resource") ### Implementing custom authentication -You can implement custom authentication by subclassing the `AuthConfigBase` class and implementing the `__call__` method: +You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: ```py -from dlt.sources.helpers.rest_client.auth import AuthConfigBase +from requests.auth import AuthBase -class CustomAuth(AuthConfigBase): +class CustomAuth(AuthBase): def __init__(self, token): self.token = token diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 50defa8edb..7f03c6d167 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,8 +1,10 @@ import os import pytest from typing import Any, cast +from requests import PreparedRequest, Request +from requests.auth import AuthBase from dlt.common.typing import TSecretStrValue -from dlt.sources.helpers.requests import Response, Request +from dlt.sources.helpers.requests import Response from dlt.sources.helpers.rest_client import RESTClient from dlt.sources.helpers.rest_client.client import Hooks from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator @@ -57,7 +59,6 @@ def test_page_context(self, rest_client: RESTClient) -> None: for page in rest_client.paginate( "/posts", paginator=JSONResponsePaginator(next_url_path="next_page"), - auth=AuthConfigBase(), ): # response that produced data assert isinstance(page.response, Response) @@ -183,3 +184,44 @@ def test_oauth_jwt_auth_success(self, rest_client: RESTClient): ) assert_pagination(list(pages_iter)) + + def test_custom_auth_success(self, rest_client: RESTClient): + class CustomAuthConfigBase(AuthConfigBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + class CustomAuthAuthBase(AuthBase): + def __init__(self, token: str): + self.token = token + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + request.headers["Authorization"] = f"Bearer {self.token}" + return request + + auth_list = [ + CustomAuthConfigBase("test-token"), + CustomAuthAuthBase("test-token"), + ] + + for auth in auth_list: + response = rest_client.get( + "/protected/posts/bearer-token", + auth=auth, + ) + + assert response.status_code == 200 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + pages_iter = rest_client.paginate( + "/protected/posts/bearer-token", + auth=auth, + ) + + pages_list = list(pages_iter) + assert_pagination(pages_list) + + assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token"
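Editor's note on the final patch: the `__bool__` override on `AuthConfigBase` matters because `requests` tests the auth object for truthiness before deciding whether to fall back to other credentials. In `requests.sessions.Session.prepare_request()` the guard `if self.trust_env and not auth and not self.auth:` reassigns `auth` from a netrc lookup, so a falsy auth instance is silently discarded and the request is prepared without its headers. The sketch below reproduces that failure mode; `FalsyAuth`, the token string, and the URL are illustrative stand-ins, not part of the patch.

```py
import requests
from requests.auth import AuthBase


class FalsyAuth(AuthBase):
    # Illustrative only: mimics a credentials object whose __bool__ reports
    # "nothing configured", as configspec-derived classes can before resolution.
    def __bool__(self) -> bool:
        return False

    def __call__(self, request: requests.PreparedRequest) -> requests.PreparedRequest:
        request.headers["Authorization"] = "Bearer illustrative-token"
        return request


session = requests.Session()
prepared = session.prepare_request(
    requests.Request("GET", "https://example.com/api", auth=FalsyAuth())
)

# Because `not auth` evaluated True, prepare_request() replaced the instance
# with the result of a netrc lookup (usually None), so __call__ never ran.
print("Authorization" in prepared.headers)  # False (assuming no netrc entry)
```

With the patched `AuthConfigBase.__bool__` returning `True`, the same flow keeps the instance and its `__call__` sets the header as expected.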