Azure service principal credentials (#1377)
* Support azure service principal credentials

* Add sp credentials to docs

* Test resolved fsspec instance

* Update filesystem config test

* Test connect with service credentials

* configures destination credentials

* fixes toml typo

---------

Co-authored-by: Marcin Rudolf <[email protected]>
steinitzu and rudolfix authored May 25, 2024
1 parent 67ccacd commit 3dc1874
Showing 6 changed files with 191 additions and 25 deletions.
11 changes: 10 additions & 1 deletion dlt/common/configuration/specs/__init__.py
@@ -20,7 +20,13 @@
from .connection_string_credentials import ConnectionStringCredentials
from .api_credentials import OAuth2Credentials
from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults
from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults
from .azure_credentials import (
AzureCredentials,
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentials,
AzureServicePrincipalCredentialsWithoutDefaults,
AnyAzureCredentials,
)


# backward compatibility for service account credentials
@@ -51,6 +57,9 @@
"AwsCredentialsWithoutDefaults",
"AzureCredentials",
"AzureCredentialsWithoutDefaults",
"AzureServicePrincipalCredentials",
"AzureServicePrincipalCredentialsWithoutDefaults",
"AnyAzureCredentials",
"GcpClientCredentials",
"GcpClientCredentialsWithDefault",
]
50 changes: 45 additions & 5 deletions dlt/common/configuration/specs/azure_credentials.py
@@ -1,4 +1,4 @@
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Union

from dlt.common.pendulum import pendulum
from dlt.common.typing import TSecretStrValue
@@ -7,10 +7,6 @@
CredentialsWithDefault,
configspec,
)
from dlt.common.configuration.specs.exceptions import InvalidBoto3Session
from dlt import version

import fsspec


@configspec
@@ -50,6 +46,22 @@ def on_partial(self) -> None:
self.resolve()


@configspec
class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration):
azure_storage_account_name: str = None
azure_tenant_id: str = None
azure_client_id: str = None
azure_client_secret: TSecretStrValue = None

def to_adlfs_credentials(self) -> Dict[str, Any]:
return dict(
account_name=self.azure_storage_account_name,
tenant_id=self.azure_tenant_id,
client_id=self.azure_client_id,
client_secret=self.azure_client_secret,
)


@configspec
class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault):
def on_partial(self) -> None:
@@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]:
if self.has_default_credentials():
base_kwargs["anon"] = False
return base_kwargs


@configspec
class AzureServicePrincipalCredentials(
AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault
):
def on_partial(self) -> None:
from azure.identity import DefaultAzureCredential

self._set_default_credentials(DefaultAzureCredential())
if self.azure_storage_account_name:
self.resolve()

def to_adlfs_credentials(self) -> Dict[str, Any]:
base_kwargs = super().to_adlfs_credentials()
if self.has_default_credentials():
base_kwargs["anon"] = False
return base_kwargs


AnyAzureCredentials = Union[
# Credentials without defaults come first because union types are attempted in order
# and explicit config should supersede system defaults
AzureCredentialsWithoutDefaults,
AzureServicePrincipalCredentialsWithoutDefaults,
AzureCredentials,
AzureServicePrincipalCredentials,
]
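
The order of this union matters: `dlt` tries the members in sequence when resolving credentials, so the explicit (without-defaults) specs win over the specs that can fall back to `DefaultAzureCredential`. A minimal sketch of the expected resolution behavior, mirroring the filesystem configuration test added below (env var names follow the `CREDENTIALS__*` convention used in those tests; this snippet is illustrative, not part of the commit):

```py
import os

from dlt.common.configuration import resolve_configuration
from dlt.common.configuration.specs import AzureServicePrincipalCredentialsWithoutDefaults
from dlt.common.storages.configuration import FilesystemConfiguration

# Service principal fields are present, so the AnyAzureCredentials union
# resolves to the service principal spec instead of the SAS/account-key spec.
os.environ["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "my_account"
os.environ["CREDENTIALS__AZURE_CLIENT_ID"] = "my_client_id"
os.environ["CREDENTIALS__AZURE_CLIENT_SECRET"] = "my_client_secret"
os.environ["CREDENTIALS__AZURE_TENANT_ID"] = "my_tenant_id"

config = resolve_configuration(FilesystemConfiguration(bucket_url="az://my-container"))
assert isinstance(config.credentials, AzureServicePrincipalCredentialsWithoutDefaults)
```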
11 changes: 5 additions & 6 deletions dlt/common/storages/configuration.py
@@ -10,8 +10,7 @@
GcpServiceAccountCredentials,
AwsCredentials,
GcpOAuthCredentials,
AzureCredentials,
AzureCredentialsWithoutDefaults,
AnyAzureCredentials,
BaseConfiguration,
)
from dlt.common.typing import DictStrAny
@@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration):


FileSystemCredentials = Union[
AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials
AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials
]


@@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration):
"gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials],
"s3": AwsCredentials,
"az": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials],
"az": AnyAzureCredentials,
"abfs": AnyAzureCredentials,
"adl": AnyAzureCredentials,
}

bucket_url: str = None
24 changes: 22 additions & 2 deletions docs/website/docs/dlt-ecosystem/destinations/filesystem.md
@@ -150,7 +150,13 @@ Use **Cloud Storage** admin to create a new bucket. Then assign the **Storage Ob
#### Azure Blob Storage
Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage.

Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials:
Edit the credentials in `.dlt/secrets.toml`; you'll see AWS credentials by default. Replace them with your Azure credentials.

Two forms of Azure credentials are supported:

##### SAS token credentials

Supply the storage account name and either a SAS token or a storage account key:

```toml
[destination.filesystem]
@@ -168,6 +174,20 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure
you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default.
Note that `azure_storage_account_name` is still required as it can't be inferred from the environment.
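
This fallback comes from the existing `AzureCredentials.on_partial()` hook (see the spec diff above) and is exercised by `test_azure_credentials_from_default` in the test changes below. A minimal sketch of that behavior, assuming `azure-identity` is installed; the snippet is illustrative and not part of the commit:

```py
import os

from dlt.common.configuration import resolve_configuration
from dlt.common.configuration.specs import AzureCredentials

# Only the account name is configured; key and SAS token are deliberately omitted.
os.environ["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "my_account"

# Resolution does not fail: on_partial() attaches DefaultAzureCredential,
# and the adlfs kwargs then carry anon=False instead of a key or SAS token.
config = resolve_configuration(AzureCredentials())
assert config.to_adlfs_credentials()["anon"] is False
```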

##### Service principal credentials

Supply the client ID, client secret, and tenant ID of a service principal authorized to access your container:

```toml
[destination.filesystem]
bucket_url = "az://[your_container name]" # replace with your container name

[destination.filesystem.credentials]
azure_client_id = "client_id" # please set me up!
azure_client_secret = "client_secret"
azure_tenant_id = "tenant_id" # please set me up!
```
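
The same credentials can also be passed explicitly in code instead of `secrets.toml`. A hedged sketch using dlt's destination factory (the factory call and placeholder values are illustrative and not part of this change; constructing the spec with keyword arguments matches the test fixture in this commit):

```py
import dlt
from dlt.common.configuration.specs import AzureServicePrincipalCredentials

# Placeholder values: supply your own service principal details.
credentials = AzureServicePrincipalCredentials(
    azure_storage_account_name="my_account",
    azure_client_id="my_client_id",
    azure_client_secret="my_client_secret",
    azure_tenant_id="my_tenant_id",
)

pipeline = dlt.pipeline(
    pipeline_name="azure_sp_example",
    destination=dlt.destinations.filesystem("az://my-container", credentials=credentials),
)
```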

#### Local file system
If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required)

@@ -458,4 +478,4 @@ managed in the regular way by the final destination you have configured.
You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables
in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations.

<!--@@@DLT_TUBA filesystem-->
<!--@@@DLT_TUBA filesystem-->
110 changes: 107 additions & 3 deletions tests/load/filesystem/test_azure_credentials.py
@@ -1,15 +1,24 @@
from typing import Dict
from typing import Dict, Optional
from urllib.parse import parse_qs
from uuid import uuid4

import pytest

import dlt
from dlt.common import pendulum
from dlt.common.time import ensure_pendulum_datetime
from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException
from dlt.common.configuration.specs import AzureCredentials
from tests.load.utils import ALL_FILESYSTEM_DRIVERS
from dlt.common.configuration.specs import (
AzureCredentials,
AzureServicePrincipalCredentials,
AzureServicePrincipalCredentialsWithoutDefaults,
AzureCredentialsWithoutDefaults,
)
from dlt.common.storages.configuration import FilesystemConfiguration
from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET
from tests.common.configuration.utils import environment
from tests.utils import preserve_environ, autouse_test_storage
from dlt.common.storages.fsspec_filesystem import fsspec_from_config

# mark all tests as essential, do not remove
pytestmark = pytest.mark.essential
@@ -18,6 +27,27 @@
pytest.skip("az filesystem driver not configured", allow_module_level=True)


@pytest.fixture
def az_service_principal_config() -> Optional[FilesystemConfiguration]:
"""FS config with alternate azure credentials format if available in environment
Working credentials of this type may be created as an app in Entra, which has
R/W/E access to the bucket (via ACL of particular container)
"""
credentials = AzureServicePrincipalCredentialsWithoutDefaults(
azure_tenant_id=dlt.config.get("tests.az_sp_tenant_id", str),
azure_client_id=dlt.config.get("tests.az_sp_client_id", str),
azure_client_secret=dlt.config.get("tests.az_sp_client_secret", str), # type: ignore[arg-type]
azure_storage_account_name=dlt.config.get("tests.az_sp_storage_account_name", str),
)
#
credentials = resolve_configuration(credentials, sections=("destination", "fsazureprincipal"))
cfg = FilesystemConfiguration(bucket_url=AZ_BUCKET, credentials=credentials)

return resolve_configuration(cfg)


def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None:
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name"
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890"
@@ -95,3 +125,77 @@ def test_azure_credentials_from_default(environment: Dict[str, str]) -> None:
"sas_token": None,
"anon": False,
}


def test_azure_service_principal_credentials(environment: Dict[str, str]) -> None:
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name"
environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id"
environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "fake_client_secret"
environment["CREDENTIALS__AZURE_TENANT_ID"] = "fake_tenant_id"

config = resolve_configuration(AzureServicePrincipalCredentials())

assert config.azure_client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"]
assert config.azure_client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"]
assert config.azure_tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"]

assert config.to_adlfs_credentials() == {
"account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"],
"client_id": environment["CREDENTIALS__AZURE_CLIENT_ID"],
"client_secret": environment["CREDENTIALS__AZURE_CLIENT_SECRET"],
"tenant_id": environment["CREDENTIALS__AZURE_TENANT_ID"],
}


def test_azure_filesystem_configuration_service_principal(environment: Dict[str, str]) -> None:
"""Filesystem config resolves correct credentials type"""
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name"
environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id"
environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "asdsadas"
environment["CREDENTIALS__AZURE_TENANT_ID"] = str(uuid4())

config = FilesystemConfiguration(bucket_url="az://my-bucket")

resolved_config = resolve_configuration(config)

assert isinstance(resolved_config.credentials, AzureServicePrincipalCredentialsWithoutDefaults)

fs, bucket = fsspec_from_config(resolved_config)

assert fs.tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"]
assert fs.client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"]
assert fs.client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"]


def test_azure_filesystem_configuration_sas_token(environment: Dict[str, str]) -> None:
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name"
environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = (
"sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890"
)

config = FilesystemConfiguration(bucket_url="az://my-bucket")

resolved_config = resolve_configuration(config)

assert isinstance(resolved_config.credentials, AzureCredentialsWithoutDefaults)

fs, bucket = fsspec_from_config(resolved_config)

assert fs.sas_token == "?" + environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"]
assert fs.account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"]


def test_azure_service_principal_fs_operations(
az_service_principal_config: Optional[FilesystemConfiguration],
) -> None:
"""Test connecting to azure filesystem with service principal credentials"""
config = az_service_principal_config
fs, bucket = fsspec_from_config(config)

fn = uuid4().hex
# Try some file ops to see if the credentials work
fs.touch(f"{bucket}/{fn}/{fn}")
files = fs.ls(f"{bucket}/{fn}")
assert f"{bucket}/{fn}/{fn}" in files
fs.delete(f"{bucket}/{fn}/{fn}")
fs.rmdir(f"{bucket}/{fn}")
10 changes: 2 additions & 8 deletions tests/load/filesystem/test_filesystem_common.py
@@ -12,10 +12,7 @@
from dlt.common import json, pendulum
from dlt.common.configuration import resolve
from dlt.common.configuration.inject import with_config
from dlt.common.configuration.specs import (
AzureCredentials,
AzureCredentialsWithoutDefaults,
)
from dlt.common.configuration.specs import AnyAzureCredentials
from dlt.common.storages import fsspec_from_config, FilesystemConfiguration
from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files
from dlt.common.utils import custom_environ, uniq_id
@@ -43,10 +40,7 @@ def test_filesystem_configuration() -> None:
config = FilesystemConfiguration(bucket_url="az://root")
assert config.protocol == "az"
# print(config.resolve_credentials_type())
assert (
config.resolve_credentials_type()
== Union[AzureCredentialsWithoutDefaults, AzureCredentials]
)
assert config.resolve_credentials_type() == AnyAzureCredentials
assert dict(config) == {
"read_only": False,
"bucket_url": "az://root",
