Merge branch 'devel' into fix/incorrect-session-check

rudolfix authored May 27, 2024
2 parents c004a1f + 4fcfa28 commit 34f5d83
Showing 40 changed files with 528 additions and 310 deletions.
11 changes: 10 additions & 1 deletion dlt/common/configuration/specs/__init__.py
@@ -20,7 +20,13 @@
from .connection_string_credentials import ConnectionStringCredentials
from .api_credentials import OAuth2Credentials
from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults
from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults
from .azure_credentials import (
AzureCredentials,
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentials,
AzureServicePrincipalCredentialsWithoutDefaults,
AnyAzureCredentials,
)


# backward compatibility for service account credentials
@@ -51,6 +57,9 @@
"AwsCredentialsWithoutDefaults",
"AzureCredentials",
"AzureCredentialsWithoutDefaults",
"AzureServicePrincipalCredentials",
"AzureServicePrincipalCredentialsWithoutDefaults",
"AnyAzureCredentials",
"GcpClientCredentials",
"GcpClientCredentialsWithDefault",
]
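With the expanded `__all__`, the new specs are importable directly from the specs package. A minimal sketch of the imports this diff enables:

```python
# These imports mirror the __all__ additions above.
from dlt.common.configuration.specs import (
    AzureServicePrincipalCredentials,
    AzureServicePrincipalCredentialsWithoutDefaults,
    AnyAzureCredentials,
)
```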
50 changes: 45 additions & 5 deletions dlt/common/configuration/specs/azure_credentials.py
@@ -1,4 +1,4 @@
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Union

from dlt.common.pendulum import pendulum
from dlt.common.typing import TSecretStrValue
@@ -7,10 +7,6 @@
CredentialsWithDefault,
configspec,
)
from dlt.common.configuration.specs.exceptions import InvalidBoto3Session
from dlt import version

import fsspec


@configspec
@@ -50,6 +46,22 @@ def on_partial(self) -> None:
self.resolve()


@configspec
class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration):
azure_storage_account_name: str = None
azure_tenant_id: str = None
azure_client_id: str = None
azure_client_secret: TSecretStrValue = None

def to_adlfs_credentials(self) -> Dict[str, Any]:
return dict(
account_name=self.azure_storage_account_name,
tenant_id=self.azure_tenant_id,
client_id=self.azure_client_id,
client_secret=self.azure_client_secret,
)


@configspec
class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault):
def on_partial(self) -> None:
@@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]:
if self.has_default_credentials():
base_kwargs["anon"] = False
return base_kwargs


@configspec
class AzureServicePrincipalCredentials(
AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault
):
def on_partial(self) -> None:
from azure.identity import DefaultAzureCredential

self._set_default_credentials(DefaultAzureCredential())
if self.azure_storage_account_name:
self.resolve()

def to_adlfs_credentials(self) -> Dict[str, Any]:
base_kwargs = super().to_adlfs_credentials()
if self.has_default_credentials():
base_kwargs["anon"] = False
return base_kwargs


AnyAzureCredentials = Union[
# Credentials without defaults come first because union types are attempted in order
# and explicit config should supersede system defaults
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentialsWithoutDefaults,
AzureCredentials,
AzureServicePrincipalCredentials,
]
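A hedged usage sketch for the new spec — all values are placeholders, and the field names come from the class definition above. `to_adlfs_credentials()` maps the spec onto the keyword arguments `adlfs` expects:

```python
from dlt.common.configuration.specs import (
    AzureServicePrincipalCredentialsWithoutDefaults,
)

# Placeholder values only; a real setup would resolve these from secrets.
creds = AzureServicePrincipalCredentialsWithoutDefaults()
creds.azure_storage_account_name = "mystorageaccount"
creds.azure_tenant_id = "00000000-0000-0000-0000-000000000000"
creds.azure_client_id = "11111111-1111-1111-1111-111111111111"
creds.azure_client_secret = "placeholder-secret"

print(creds.to_adlfs_credentials())
# {'account_name': 'mystorageaccount', 'tenant_id': '...', 'client_id': '...', 'client_secret': '...'}
```

Note the ordering comment in `AnyAzureCredentials`: the `WithoutDefaults` variants come first so that explicit configuration is tried before `DefaultAzureCredential` is consulted.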
11 changes: 5 additions & 6 deletions dlt/common/storages/configuration.py
@@ -10,8 +10,7 @@
GcpServiceAccountCredentials,
AwsCredentials,
GcpOAuthCredentials,
AzureCredentials,
AzureCredentialsWithoutDefaults,
AnyAzureCredentials,
BaseConfiguration,
)
from dlt.common.typing import DictStrAny
@@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration):


FileSystemCredentials = Union[
AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials
AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials
]


@@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration):
"gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"s3": AwsCredentials,
"az": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"az": AnyAzureCredentials,
"abfs": AnyAzureCredentials,
"adl": AnyAzureCredentials,
}

bucket_url: str = None
2 changes: 1 addition & 1 deletion dlt/helpers/dbt/__init__.py
@@ -11,7 +11,7 @@

from dlt.helpers.dbt.runner import create_runner, DBTPackageRunner

DEFAULT_DBT_VERSION = ">=1.1,<1.6"
DEFAULT_DBT_VERSION = ">=1.5,<1.9"

# a map of destination names to dbt package names in case they don't match the pure destination name
DBT_DESTINATION_MAP = {
12 changes: 0 additions & 12 deletions dlt/helpers/dbt/dbt_utils.py
@@ -24,7 +24,6 @@
# https://stackoverflow.com/questions/48619517/call-a-click-command-from-code

import dbt.logger
from dbt.events import functions
from dbt.contracts import results as dbt_results
except ModuleNotFoundError:
raise MissingDependencyException("DBT Core", ["dbt-core"])
@@ -56,17 +55,6 @@ def set_path_wrapper(self: dbt.logger.LogManager, path: str) -> None:
self._file_handler.set_path(path)
_DBT_LOGGER_INITIALIZED = True

# def setup_event_logger_wrapper(log_path: str, level_override:str = None) -> None:
# global _DBT_LOGGER_INITIALIZED

# if not _DBT_LOGGER_INITIALIZED:
# functions.setup_event_logger(log_path, level.lower())
# # force log level as file is debug only
# # functions.this.FILE_LOG.setLevel(level)
# # functions.this.FILE_LOG.handlers[0].setLevel(level)
# _DBT_LOGGER_INITIALIZED = True

# dbt.main.setup_event_logger = setup_event_logger_wrapper
dbt.logger.LogManager.set_path = set_path_wrapper # type: ignore

globs = []
6 changes: 5 additions & 1 deletion dlt/sources/helpers/rest_client/auth.py
@@ -40,7 +40,11 @@ class AuthConfigBase(AuthBase, CredentialsConfiguration):
configurable via env variables or toml files
"""

pass
    def __bool__(self) -> bool:
        # Prevent AuthConfigBase-derived classes that do not implement the
        # CredentialsConfiguration interface from evaluating as False in
        # requests.sessions.Session.prepare_request()
        return True


@configspec
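To see what the `__bool__` override guards against, here is a hedged, self-contained illustration (the class and header value are made up; the behavior follows from `requests` skipping falsy auth objects during request preparation):

```python
from requests import Request, Session
from requests.auth import AuthBase


class FalsyAuth(AuthBase):
    """Stand-in for a config-derived auth whose truthiness reflects resolved state."""

    def __bool__(self) -> bool:
        return False  # e.g. "no config fields resolved yet"

    def __call__(self, request):
        request.headers["Authorization"] = "Bearer placeholder-token"
        return request


session = Session()
prepared = session.prepare_request(
    Request("GET", "https://api.example.com/items", auth=FalsyAuth())
)
# Because the auth object evaluates as False, requests never invokes it
# (falling back to netrc discovery instead), so the header is dropped:
print(prepared.headers.get("Authorization"))  # None
```

Returning `True` unconditionally keeps such classes from being silently treated as `auth=None`.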
14 changes: 8 additions & 6 deletions dlt/sources/helpers/rest_client/client.py
@@ -6,12 +6,14 @@
Any,
TypeVar,
Iterable,
Union,
cast,
)
import copy
from urllib.parse import urlparse
from requests import Session as BaseSession # noqa: I251
from requests import Response, Request
from requests.auth import AuthBase

from dlt.common import jsonpath, logger

@@ -41,7 +43,7 @@ def __init__(
request: Request,
response: Response,
paginator: BasePaginator,
auth: AuthConfigBase,
auth: AuthBase,
):
super().__init__(__iterable)
self.request = request
@@ -57,7 +59,7 @@ class RESTClient:
Args:
base_url (str): The base URL of the API to make requests to.
headers (Optional[Dict[str, str]]): Default headers to include in all requests.
auth (Optional[AuthConfigBase]): Authentication configuration for all requests.
auth (Optional[AuthBase]): Authentication configuration for all requests.
paginator (Optional[BasePaginator]): Default paginator for handling paginated responses.
data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for extracting data from responses.
session (BaseSession): HTTP session for making requests.
@@ -69,7 +71,7 @@ def __init__(
self,
base_url: str,
headers: Optional[Dict[str, str]] = None,
auth: Optional[AuthConfigBase] = None,
auth: Optional[AuthBase] = None,
paginator: Optional[BasePaginator] = None,
data_selector: Optional[jsonpath.TJsonPath] = None,
session: BaseSession = None,
@@ -97,7 +99,7 @@ def _create_request(
method: HTTPMethod,
params: Dict[str, Any],
json: Optional[Dict[str, Any]] = None,
auth: Optional[AuthConfigBase] = None,
auth: Optional[AuthBase] = None,
hooks: Optional[Hooks] = None,
) -> Request:
parsed_url = urlparse(path)
@@ -146,7 +148,7 @@ def paginate(
method: HTTPMethodBasic = "GET",
params: Optional[Dict[str, Any]] = None,
json: Optional[Dict[str, Any]] = None,
auth: Optional[AuthConfigBase] = None,
auth: Optional[AuthBase] = None,
paginator: Optional[BasePaginator] = None,
data_selector: Optional[jsonpath.TJsonPath] = None,
hooks: Optional[Hooks] = None,
@@ -158,7 +160,7 @@
method (HTTPMethodBasic): HTTP method for the request, defaults to 'get'.
params (Optional[Dict[str, Any]]): URL parameters for the request.
json (Optional[Dict[str, Any]]): JSON payload for the request.
auth (Optional[AuthConfigBase]): Authentication configuration for the request.
auth (Optional[AuthBase]): Authentication configuration for the request.
paginator (Optional[BasePaginator]): Paginator instance for handling
pagination logic.
data_selector (Optional[jsonpath.TJsonPath]): JSONPath selector for
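The practical effect of widening the annotations from `AuthConfigBase` to `AuthBase`: any `requests`-compatible auth object is now accepted, not only dlt's configspec-based ones. A hedged sketch (base URL and endpoint are placeholders):

```python
from requests.auth import HTTPBasicAuth

from dlt.sources.helpers.rest_client import RESTClient

client = RESTClient(
    base_url="https://api.example.com",
    auth=HTTPBasicAuth("user", "password"),  # plain requests auth, no configspec
)

for page in client.paginate("/items"):
    print(page)
```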
9 changes: 7 additions & 2 deletions dlt/sources/helpers/rest_client/detector.py
@@ -1,5 +1,6 @@
import re
from typing import List, Dict, Any, Tuple, Union, Optional, Callable, Iterable
from pathlib import PurePosixPath
from typing import List, Dict, Any, Tuple, Union, Callable, Iterable
from urllib.parse import urlparse

from requests import Response
@@ -25,6 +26,7 @@
"payload",
"content",
"objects",
"values",
]
)

@@ -46,7 +48,10 @@

def single_entity_path(path: str) -> bool:
"""Checks if path ends with path param indicating that single object is returned"""
return re.search(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}/?$", path) is not None
# get last path segment
name = PurePosixPath(path).name
# alphabet for a name taken from https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.3.md#fixed-fields-6
return re.search(r"\{([a-zA-Z0-9\.\-_]+)\}", name) is not None


def matches_any_pattern(key: str, patterns: Iterable[str]) -> bool:
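A few hedged checks against the revised matcher — the paths are invented, and the expected results follow from taking `PurePosixPath(path).name` and searching it with the OpenAPI-style name regex above:

```python
from dlt.sources.helpers.rest_client.detector import single_entity_path

assert single_entity_path("/users/{user_id}")             # param in last segment
assert single_entity_path("/users/{user.id}")             # dots and dashes now allowed
assert single_entity_path("/files/report_{id}.json")      # param embedded in last segment
assert not single_entity_path("/users/{user_id}/posts")   # param not in last segment
```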
@@ -5,13 +5,13 @@
keywords: [destination, credentials, example, bigquery, custom destination]
---
In this example, you'll find a Python script that demonstrates how to load to bigquey with the custom destination.
In this example, you'll find a Python script that demonstrates how to load to BigQuery with the custom destination.
We'll learn how to:
- use [built-in credentials](../general-usage/credentials/config_specs#gcp-credentials)
- use the [custom destination](../dlt-ecosystem/destinations/destination.md)
- Use pyarrow tables to create complex column types on bigquery
- Use bigquery `autodetect=True` for schema inference from parquet files
- Use [built-in credentials.](../general-usage/credentials/config_specs#gcp-credentials)
- Use the [custom destination.](../dlt-ecosystem/destinations/destination.md)
- Use pyarrow tables to create complex column types on BigQuery.
- Use BigQuery `autodetect=True` for schema inference from parquet files.
"""

@@ -38,7 +38,7 @@
def resource(url: str):
# load pyarrow table with pandas
table = pa.Table.from_pandas(pd.read_csv(url))
# we add a list type column to demontrate bigquery lists
# we add a list type column to demonstrate bigquery lists
table = table.append_column(
"tags",
pa.array(
@@ -57,12 +57,14 @@ def resource(url: str):
yield table


# dlt biquery custom destination
# dlt bigquery custom destination
# we can use the dlt provided credentials class
# to retrieve the gcp credentials from the secrets
@dlt.destination(name="bigquery", loader_file_format="parquet", batch_size=0)
@dlt.destination(
name="bigquery", loader_file_format="parquet", batch_size=0, naming_convention="snake_case"
)
def bigquery_insert(
items, table, credentials: GcpServiceAccountCredentials = dlt.secrets.value
items, table=BIGQUERY_TABLE_ID, credentials: GcpServiceAccountCredentials = dlt.secrets.value
) -> None:
client = bigquery.Client(
credentials.project_id, credentials.to_native_credentials(), location="US"
@@ -74,7 +76,7 @@ def bigquery_insert(
)
# since we have set the batch_size to 0, we get a filepath and can load the file directly
with open(items, "rb") as f:
load_job = client.load_table_from_file(f, BIGQUERY_TABLE_ID, job_config=job_config)
load_job = client.load_table_from_file(f, table, job_config=job_config)
load_job.result() # Waits for the job to complete.


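For context, a hedged sketch of how the updated example pieces fit together — the pipeline name and CSV URL are placeholders, while `resource` and `bigquery_insert` are the functions from the diff above:

```python
import dlt

# destination is the decorated bigquery_insert function defined above
pipeline = dlt.pipeline("custom_bigquery_demo", destination=bigquery_insert)
load_info = pipeline.run(resource(url="https://example.com/data.csv"))
print(load_info)
```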
4 changes: 2 additions & 2 deletions docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -11,7 +11,7 @@ The Athena destination stores data as Parquet files in S3 buckets and creates [e
## Install dlt with Athena
**To install the dlt library with Athena dependencies:**
```sh
pip install dlt[athena]
pip install "dlt[athena]"
```

## Setup Guide
@@ -30,7 +30,7 @@ First, install dependencies by running:
```sh
pip install -r requirements.txt
```
or with `pip install dlt[athena]`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages.
or with `pip install "dlt[athena]"`, which will install `s3fs`, `pyarrow`, `pyathena`, and `botocore` packages.

:::caution

2 changes: 1 addition & 1 deletion docs/website/docs/dlt-ecosystem/destinations/bigquery.md
@@ -11,7 +11,7 @@ keywords: [bigquery, destination, data warehouse]
**To install the dlt library with BigQuery dependencies:**

```sh
pip install dlt[bigquery]
pip install "dlt[bigquery]"
```

## Setup Guide
4 changes: 2 additions & 2 deletions docs/website/docs/dlt-ecosystem/destinations/clickhouse.md
@@ -11,7 +11,7 @@ keywords: [ clickhouse, destination, data warehouse ]
**To install the DLT library with ClickHouse dependencies:**

```sh
pip install dlt[clickhouse]
pip install "dlt[clickhouse]"
```

## Setup Guide
@@ -33,7 +33,7 @@ requirements file by executing it as follows:
pip install -r requirements.txt
```

or with `pip install dlt[clickhouse]`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination.
or with `pip install "dlt[clickhouse]"`, which installs the `dlt` library and the necessary dependencies for working with ClickHouse as a destination.

### 2. Setup ClickHouse database

2 changes: 1 addition & 1 deletion docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -12,7 +12,7 @@ keywords: [Databricks, destination, data warehouse]
## Install dlt with Databricks
**To install the dlt library with Databricks dependencies:**
```sh
pip install dlt[databricks]
pip install "dlt[databricks]"
```

## Set up your Databricks workspace