Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Azure service principal credentials #1377

Merged
merged 7 commits into from
May 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion dlt/common/configuration/specs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@
from .connection_string_credentials import ConnectionStringCredentials
from .api_credentials import OAuth2Credentials
from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults
from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults
from .azure_credentials import (
AzureCredentials,
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentials,
AzureServicePrincipalCredentialsWithoutDefaults,
AnyAzureCredentials,
)


# backward compatibility for service account credentials
Expand Down Expand Up @@ -51,6 +57,9 @@
"AwsCredentialsWithoutDefaults",
"AzureCredentials",
"AzureCredentialsWithoutDefaults",
"AzureServicePrincipalCredentials",
"AzureServicePrincipalCredentialsWithoutDefaults",
"AnyAzureCredentials",
"GcpClientCredentials",
"GcpClientCredentialsWithDefault",
]
50 changes: 45 additions & 5 deletions dlt/common/configuration/specs/azure_credentials.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Union

from dlt.common.pendulum import pendulum
from dlt.common.typing import TSecretStrValue
Expand All @@ -7,10 +7,6 @@
CredentialsWithDefault,
configspec,
)
from dlt.common.configuration.specs.exceptions import InvalidBoto3Session
from dlt import version

import fsspec


@configspec
Expand Down Expand Up @@ -50,6 +46,22 @@ def on_partial(self) -> None:
self.resolve()


@configspec
class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration):
    """Azure credentials that identify a service principal.

    Holds the storage account name together with the tenant id, client id
    and client secret of the principal.
    """

    azure_storage_account_name: str = None
    azure_tenant_id: str = None
    azure_client_id: str = None
    azure_client_secret: TSecretStrValue = None

    def to_adlfs_credentials(self) -> Dict[str, Any]:
        """Return the configured fields keyed by the argument names used for adlfs."""
        return {
            "account_name": self.azure_storage_account_name,
            "tenant_id": self.azure_tenant_id,
            "client_id": self.azure_client_id,
            "client_secret": self.azure_client_secret,
        }


@configspec
class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault):
def on_partial(self) -> None:
Expand All @@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]:
if self.has_default_credentials():
base_kwargs["anon"] = False
return base_kwargs


@configspec
class AzureServicePrincipalCredentials(
    AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault
):
    """Service principal credentials that can fall back to `DefaultAzureCredential`."""

    def on_partial(self) -> None:
        """Register a `DefaultAzureCredential` and resolve if the account name is set.

        NOTE(review): presumably called by the configuration resolver when some
        fields could not be resolved - confirm against the resolver.
        """
        # Imported inside the hook, so azure.identity is only loaded when it runs.
        from azure.identity import DefaultAzureCredential

        default_credential = DefaultAzureCredential()
        self._set_default_credentials(default_credential)
        # Without a storage account name the spec stays unresolved.
        if not self.azure_storage_account_name:
            return
        self.resolve()

    def to_adlfs_credentials(self) -> Dict[str, Any]:
        """Return the parent's adlfs arguments, adding `anon=False` when defaults are in use."""
        adlfs_kwargs = super().to_adlfs_credentials()
        if self.has_default_credentials():
            adlfs_kwargs["anon"] = False
        return adlfs_kwargs


# Type alias: union of every Azure credentials spec defined in this module
# (SAS token / account key and service principal, each with and without
# default-credential fallback). The member order is significant, see below.
AnyAzureCredentials = Union[
# Credentials without defaults come first because union types are attempted in order
# and explicit config should supersede system defaults
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentialsWithoutDefaults,
AzureCredentials,
AzureServicePrincipalCredentials,
]
11 changes: 5 additions & 6 deletions dlt/common/storages/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
GcpServiceAccountCredentials,
AwsCredentials,
GcpOAuthCredentials,
AzureCredentials,
AzureCredentialsWithoutDefaults,
AnyAzureCredentials,
BaseConfiguration,
)
from dlt.common.typing import DictStrAny
Expand Down Expand Up @@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration):


FileSystemCredentials = Union[
AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials
AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials
]


Expand All @@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration):
"gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"s3": AwsCredentials,
"az": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"az": AnyAzureCredentials,
"abfs": AnyAzureCredentials,
"adl": AnyAzureCredentials,
}

bucket_url: str = None
Expand Down
24 changes: 22 additions & 2 deletions docs/website/docs/dlt-ecosystem/destinations/filesystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,13 @@ Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Ob
#### Azure Blob Storage
Run `pip install dlt[az]` which will install the `adlfs` package to interface with Azure Blob Storage.

Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials:
Edit the credentials in `.dlt/secrets.toml`. You'll see AWS credentials by default; replace them with your Azure credentials.

Two forms of Azure credentials are supported:

##### SAS token credentials

Supply the storage account name and either a SAS token or a storage account key:

```toml
[destination.filesystem]
Expand All @@ -168,6 +174,20 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure
you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default.
Note that `azure_storage_account_name` is still required as it can't be inferred from the environment.

##### Service principal credentials

Supply the storage account name together with a client ID, client secret, and tenant ID for a service principal authorized to access your container:

```toml
[destination.filesystem]
bucket_url = "az://[your_container name]" # replace with your container name

[destination.filesystem.credentials]
azure_storage_account_name = "account_name" # please set me up!
azure_client_id = "client_id" # please set me up!
azure_client_secret = "client_secret" # please set me up!
azure_tenant_id = "tenant_id" # please set me up!
```

#### Local file system
If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required)

Expand Down Expand Up @@ -458,4 +478,4 @@ managed in the regular way by the final destination you have configured.
You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables
in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations.

<!--@@@DLT_TUBA filesystem-->
<!--@@@DLT_TUBA filesystem-->
110 changes: 107 additions & 3 deletions tests/load/filesystem/test_azure_credentials.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
from typing import Dict
from typing import Dict, Optional
from urllib.parse import parse_qs
from uuid import uuid4

import pytest

import dlt
from dlt.common import pendulum
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException
from dlt.common.configuration.specs import AzureCredentials
from tests.load.utils import ALL_FILESYSTEM_DRIVERS
from dlt.common.configuration.specs import (
AzureCredentials,
AzureServicePrincipalCredentials,
AzureServicePrincipalCredentialsWithoutDefaults,
AzureCredentialsWithoutDefaults,
)
from dlt.common.storages.configuration import FilesystemConfiguration
from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET
from tests.common.configuration.utils import environment
from tests.utils import preserve_environ, autouse_test_storage
from dlt.common.storages.fsspec_filesystem import fsspec_from_config

# mark all tests as essential, do not remove
pytestmark = pytest.mark.essential
Expand All @@ -18,6 +27,27 @@
pytest.skip("az filesystem driver not configured", allow_module_level=True)


@pytest.fixture
def az_service_principal_config() -> Optional[FilesystemConfiguration]:
    """Resolved filesystem config for `AZ_BUCKET` using service principal credentials.

    The credential values are read from the `tests.az_sp_*` config keys and
    resolved under the ("destination", "fsazureprincipal") sections.

    Working credentials of this type may be created as an app in Entra, which has
    R/W/E access to the bucket (via ACL of particular container)
    """
    sp_credentials = resolve_configuration(
        AzureServicePrincipalCredentialsWithoutDefaults(
            azure_tenant_id=dlt.config.get("tests.az_sp_tenant_id", str),
            azure_client_id=dlt.config.get("tests.az_sp_client_id", str),
            azure_client_secret=dlt.config.get("tests.az_sp_client_secret", str),  # type: ignore[arg-type]
            azure_storage_account_name=dlt.config.get("tests.az_sp_storage_account_name", str),
        ),
        sections=("destination", "fsazureprincipal"),
    )
    return resolve_configuration(
        FilesystemConfiguration(bucket_url=AZ_BUCKET, credentials=sp_credentials)
    )


def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None:
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name"
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890"
Expand Down Expand Up @@ -95,3 +125,77 @@ def test_azure_credentials_from_default(environment: Dict[str, str]) -> None:
"sas_token": None,
"anon": False,
}


def test_azure_service_principal_credentials(environment: Dict[str, str]) -> None:
    """Service principal fields resolve from the environment and map to adlfs arguments."""
    env_values = {
        "CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME": "fake_account_name",
        "CREDENTIALS__AZURE_CLIENT_ID": "fake_client_id",
        "CREDENTIALS__AZURE_CLIENT_SECRET": "fake_client_secret",
        "CREDENTIALS__AZURE_TENANT_ID": "fake_tenant_id",
    }
    for env_key, env_value in env_values.items():
        environment[env_key] = env_value

    credentials = resolve_configuration(AzureServicePrincipalCredentials())

    assert credentials.azure_client_id == env_values["CREDENTIALS__AZURE_CLIENT_ID"]
    assert credentials.azure_client_secret == env_values["CREDENTIALS__AZURE_CLIENT_SECRET"]
    assert credentials.azure_tenant_id == env_values["CREDENTIALS__AZURE_TENANT_ID"]

    # Fully explicit credentials: no "anon" key is expected in the result.
    expected_adlfs_kwargs = {
        "account_name": env_values["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"],
        "client_id": env_values["CREDENTIALS__AZURE_CLIENT_ID"],
        "client_secret": env_values["CREDENTIALS__AZURE_CLIENT_SECRET"],
        "tenant_id": env_values["CREDENTIALS__AZURE_TENANT_ID"],
    }
    assert credentials.to_adlfs_credentials() == expected_adlfs_kwargs


def test_azure_filesystem_configuration_service_principal(environment: Dict[str, str]) -> None:
    """Filesystem config resolves correct credentials type"""
    tenant_id = str(uuid4())
    client_id = "fake_client_id"
    client_secret = "asdsadas"

    environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name"
    environment["CREDENTIALS__AZURE_CLIENT_ID"] = client_id
    environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = client_secret
    environment["CREDENTIALS__AZURE_TENANT_ID"] = tenant_id

    resolved_config = resolve_configuration(FilesystemConfiguration(bucket_url="az://my-bucket"))

    # With client id/secret/tenant present, the service principal spec must be picked.
    assert isinstance(resolved_config.credentials, AzureServicePrincipalCredentialsWithoutDefaults)

    fs, _ = fsspec_from_config(resolved_config)

    # The credentials must reach the fsspec filesystem instance unchanged.
    assert fs.tenant_id == tenant_id
    assert fs.client_id == client_id
    assert fs.client_secret == client_secret


def test_azure_filesystem_configuration_sas_token(environment: Dict[str, str]) -> None:
    """Filesystem config resolves SAS token credentials and hands them to fsspec."""
    account_name = "fake_account_name"
    sas_token = "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890"

    environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = account_name
    environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = sas_token

    resolved_config = resolve_configuration(FilesystemConfiguration(bucket_url="az://my-bucket"))

    assert isinstance(resolved_config.credentials, AzureCredentialsWithoutDefaults)

    fs, _ = fsspec_from_config(resolved_config)

    # The filesystem is expected to expose the token with a leading "?".
    assert fs.sas_token == "?" + sas_token
    assert fs.account_name == account_name


def test_azure_service_principal_fs_operations(
    az_service_principal_config: Optional[FilesystemConfiguration],
) -> None:
    """Test connecting to azure filesystem with service principal credentials"""
    fs, bucket = fsspec_from_config(az_service_principal_config)

    # A random name is used for both the directory and the file inside it.
    name = uuid4().hex
    dir_path = f"{bucket}/{name}"
    file_path = f"{dir_path}/{name}"

    # Create, list and clean up: if any step fails, the credentials do not work.
    fs.touch(file_path)
    listed = fs.ls(dir_path)
    assert file_path in listed
    fs.delete(file_path)
    fs.rmdir(dir_path)
10 changes: 2 additions & 8 deletions tests/load/filesystem/test_filesystem_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@
from dlt.common import json, pendulum
from dlt.common.configuration import resolve
from dlt.common.configuration.inject import with_config
from dlt.common.configuration.specs import (
AzureCredentials,
AzureCredentialsWithoutDefaults,
)
from dlt.common.configuration.specs import AnyAzureCredentials
from dlt.common.storages import fsspec_from_config, FilesystemConfiguration
from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files
from dlt.common.utils import custom_environ, uniq_id
Expand Down Expand Up @@ -43,10 +40,7 @@ def test_filesystem_configuration() -> None:
config = FilesystemConfiguration(bucket_url="az://root")
assert config.protocol == "az"
# print(config.resolve_credentials_type())
assert (
config.resolve_credentials_type()
== Union[AzureCredentialsWithoutDefaults, AzureCredentials]
)
assert config.resolve_credentials_type() == AnyAzureCredentials
assert dict(config) == {
"read_only": False,
"bucket_url": "az://root",
Expand Down
Loading