From 3dc187475546e209be7d3025f4378d43e253a5bd Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Sat, 25 May 2024 11:28:12 -0400 Subject: [PATCH] Azure service principal credentials (#1377) * Support azure service principal credentials * Add sp credentials to docs * Test resolved fsspec instance * Update filesystem config test * Test connect with service credentials * configures destination credentials * fixes toml typo --------- Co-authored-by: Marcin Rudolf --- dlt/common/configuration/specs/__init__.py | 11 +- .../configuration/specs/azure_credentials.py | 50 +++++++- dlt/common/storages/configuration.py | 11 +- .../dlt-ecosystem/destinations/filesystem.md | 24 +++- .../load/filesystem/test_azure_credentials.py | 110 +++++++++++++++++- .../load/filesystem/test_filesystem_common.py | 10 +- 6 files changed, 191 insertions(+), 25 deletions(-) diff --git a/dlt/common/configuration/specs/__init__.py b/dlt/common/configuration/specs/__init__.py index 9acf14bde3..f1d7d819ff 100644 --- a/dlt/common/configuration/specs/__init__.py +++ b/dlt/common/configuration/specs/__init__.py @@ -20,7 +20,13 @@ from .connection_string_credentials import ConnectionStringCredentials from .api_credentials import OAuth2Credentials from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults -from .azure_credentials import AzureCredentials, AzureCredentialsWithoutDefaults +from .azure_credentials import ( + AzureCredentials, + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AnyAzureCredentials, +) # backward compatibility for service account credentials @@ -51,6 +57,9 @@ "AwsCredentialsWithoutDefaults", "AzureCredentials", "AzureCredentialsWithoutDefaults", + "AzureServicePrincipalCredentials", + "AzureServicePrincipalCredentialsWithoutDefaults", + "AnyAzureCredentials", "GcpClientCredentials", "GcpClientCredentialsWithDefault", ] diff --git a/dlt/common/configuration/specs/azure_credentials.py 
b/dlt/common/configuration/specs/azure_credentials.py index 52d33ec0d3..8b8fc259f2 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Union from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue @@ -7,10 +7,6 @@ CredentialsWithDefault, configspec, ) -from dlt.common.configuration.specs.exceptions import InvalidBoto3Session -from dlt import version - -import fsspec @configspec @@ -50,6 +46,22 @@ def on_partial(self) -> None: self.resolve() +@configspec +class AzureServicePrincipalCredentialsWithoutDefaults(CredentialsConfiguration): + azure_storage_account_name: str = None + azure_tenant_id: str = None + azure_client_id: str = None + azure_client_secret: TSecretStrValue = None + + def to_adlfs_credentials(self) -> Dict[str, Any]: + return dict( + account_name=self.azure_storage_account_name, + tenant_id=self.azure_tenant_id, + client_id=self.azure_client_id, + client_secret=self.azure_client_secret, + ) + + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): def on_partial(self) -> None: @@ -67,3 +79,31 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: if self.has_default_credentials(): base_kwargs["anon"] = False return base_kwargs + + +@configspec +class AzureServicePrincipalCredentials( + AzureServicePrincipalCredentialsWithoutDefaults, CredentialsWithDefault +): + def on_partial(self) -> None: + from azure.identity import DefaultAzureCredential + + self._set_default_credentials(DefaultAzureCredential()) + if self.azure_storage_account_name: + self.resolve() + + def to_adlfs_credentials(self) -> Dict[str, Any]: + base_kwargs = super().to_adlfs_credentials() + if self.has_default_credentials(): + base_kwargs["anon"] = False + return base_kwargs + + +AnyAzureCredentials = Union[ + # Credentials without defaults come first 
because union types are attempted in order + # and explicit config should supersede system defaults + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentials, + AzureServicePrincipalCredentials, +] diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index a1838fab6e..6e100536af 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -10,8 +10,7 @@ GcpServiceAccountCredentials, AwsCredentials, GcpOAuthCredentials, - AzureCredentials, - AzureCredentialsWithoutDefaults, + AnyAzureCredentials, BaseConfiguration, ) from dlt.common.typing import DictStrAny @@ -49,7 +48,7 @@ class LoadStorageConfiguration(BaseConfiguration): FileSystemCredentials = Union[ - AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials + AwsCredentials, GcpServiceAccountCredentials, AnyAzureCredentials, GcpOAuthCredentials ] @@ -70,9 +69,9 @@ class FilesystemConfiguration(BaseConfiguration): "gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "gdrive": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "s3": AwsCredentials, - "az": Union[AzureCredentialsWithoutDefaults, AzureCredentials], - "abfs": Union[AzureCredentialsWithoutDefaults, AzureCredentials], - "adl": Union[AzureCredentialsWithoutDefaults, AzureCredentials], + "az": AnyAzureCredentials, + "abfs": AnyAzureCredentials, + "adl": AnyAzureCredentials, } bucket_url: str = None diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 4c62e172d8..3e2e08013c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -150,7 +150,13 @@ Use **Cloud Storage** admin to create a new bucket. 
Then assign the **Storage Ob #### Azure Blob Storage Run `pip install "dlt[az]"` which will install the `adlfs` package to interface with Azure Blob Storage. -Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default replace them with your Azure credentials: +Edit the credentials in `.dlt/secrets.toml`. You'll see AWS credentials by default; replace them with your Azure credentials. + +Two forms of Azure credentials are supported: + +##### SAS token credentials + +Supply the storage account name and either a SAS token or a storage account key: ```toml [destination.filesystem] @@ -168,6 +174,20 @@ If you have the correct Azure credentials set up on your machine (e.g. via azure you can omit both `azure_storage_account_key` and `azure_storage_sas_token` and `dlt` will fall back to the available default. Note that `azure_storage_account_name` is still required as it can't be inferred from the environment. +##### Service principal credentials + +Supply a client ID, a client secret, and a tenant ID for a service principal authorized to access your container: + +```toml +[destination.filesystem] +bucket_url = "az://[your_container name]" # replace with your container name + +[destination.filesystem.credentials] +azure_client_id = "client_id" # please set me up! +azure_client_secret = "client_secret" +azure_tenant_id = "tenant_id" # please set me up! +``` + #### Local file system If for any reason you want to have those files in a local folder, set up the `bucket_url` as follows (you are free to use `config.toml` for that as there are no secrets required) @@ -458,4 +478,4 @@ managed in the regular way by the final destination you have configured. You will also notice `init` files being present in the root folder and the special `dlt` folders. In the absence of the concepts of schemas and tables in blob storages and directories, `dlt` uses these special files to harmonize the behavior of the `filesystem` destination with the other implemented destinations. 
- \ No newline at end of file + diff --git a/tests/load/filesystem/test_azure_credentials.py b/tests/load/filesystem/test_azure_credentials.py index 467ba55a4f..4ee2ec46db 100644 --- a/tests/load/filesystem/test_azure_credentials.py +++ b/tests/load/filesystem/test_azure_credentials.py @@ -1,15 +1,24 @@ -from typing import Dict +from typing import Dict, Optional from urllib.parse import parse_qs +from uuid import uuid4 import pytest +import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException -from dlt.common.configuration.specs import AzureCredentials -from tests.load.utils import ALL_FILESYSTEM_DRIVERS +from dlt.common.configuration.specs import ( + AzureCredentials, + AzureServicePrincipalCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentialsWithoutDefaults, +) +from dlt.common.storages.configuration import FilesystemConfiguration +from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET from tests.common.configuration.utils import environment from tests.utils import preserve_environ, autouse_test_storage +from dlt.common.storages.fsspec_filesystem import fsspec_from_config # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -18,6 +27,27 @@ pytest.skip("az filesystem driver not configured", allow_module_level=True) +@pytest.fixture +def az_service_principal_config() -> Optional[FilesystemConfiguration]: + """FS config with alternate azure credentials format if available in environment + + Working credentials of this type may be created as an app in Entra, which has + R/W/E access to the bucket (via ACL of particular container) + + """ + credentials = AzureServicePrincipalCredentialsWithoutDefaults( + azure_tenant_id=dlt.config.get("tests.az_sp_tenant_id", str), + azure_client_id=dlt.config.get("tests.az_sp_client_id", str), + 
azure_client_secret=dlt.config.get("tests.az_sp_client_secret", str), # type: ignore[arg-type] + azure_storage_account_name=dlt.config.get("tests.az_sp_storage_account_name", str), + ) + # + credentials = resolve_configuration(credentials, sections=("destination", "fsazureprincipal")) + cfg = FilesystemConfiguration(bucket_url=AZ_BUCKET, credentials=credentials) + + return resolve_configuration(cfg) + + def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None: environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" @@ -95,3 +125,77 @@ def test_azure_credentials_from_default(environment: Dict[str, str]) -> None: "sas_token": None, "anon": False, } + + +def test_azure_service_principal_credentials(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "fake_client_secret" + environment["CREDENTIALS__AZURE_TENANT_ID"] = "fake_tenant_id" + + config = resolve_configuration(AzureServicePrincipalCredentials()) + + assert config.azure_client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert config.azure_client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + assert config.azure_tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + + assert config.to_adlfs_credentials() == { + "account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"], + "client_id": environment["CREDENTIALS__AZURE_CLIENT_ID"], + "client_secret": environment["CREDENTIALS__AZURE_CLIENT_SECRET"], + "tenant_id": environment["CREDENTIALS__AZURE_TENANT_ID"], + } + + +def test_azure_filesystem_configuration_service_principal(environment: Dict[str, str]) -> None: + """Filesystem config resolves correct credentials type""" + 
environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_CLIENT_ID"] = "fake_client_id" + environment["CREDENTIALS__AZURE_CLIENT_SECRET"] = "asdsadas" + environment["CREDENTIALS__AZURE_TENANT_ID"] = str(uuid4()) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureServicePrincipalCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.tenant_id == environment["CREDENTIALS__AZURE_TENANT_ID"] + assert fs.client_id == environment["CREDENTIALS__AZURE_CLIENT_ID"] + assert fs.client_secret == environment["CREDENTIALS__AZURE_CLIENT_SECRET"] + + +def test_azure_filesystem_configuration_sas_token(environment: Dict[str, str]) -> None: + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = ( + "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890" + ) + + config = FilesystemConfiguration(bucket_url="az://my-bucket") + + resolved_config = resolve_configuration(config) + + assert isinstance(resolved_config.credentials, AzureCredentialsWithoutDefaults) + + fs, bucket = fsspec_from_config(resolved_config) + + assert fs.sas_token == "?" 
+ environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] + assert fs.account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] + + +def test_azure_service_principal_fs_operations( + az_service_principal_config: Optional[FilesystemConfiguration], +) -> None: + """Test connecting to azure filesystem with service principal credentials""" + config = az_service_principal_config + fs, bucket = fsspec_from_config(config) + + fn = uuid4().hex + # Try some file ops to see if the credentials work + fs.touch(f"{bucket}/{fn}/{fn}") + files = fs.ls(f"{bucket}/{fn}") + assert f"{bucket}/{fn}/{fn}" in files + fs.delete(f"{bucket}/{fn}/{fn}") + fs.rmdir(f"{bucket}/{fn}") diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 3677765c9f..c069f88a15 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -12,10 +12,7 @@ from dlt.common import json, pendulum from dlt.common.configuration import resolve from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import ( - AzureCredentials, - AzureCredentialsWithoutDefaults, -) +from dlt.common.configuration.specs import AnyAzureCredentials from dlt.common.storages import fsspec_from_config, FilesystemConfiguration from dlt.common.storages.fsspec_filesystem import MTIME_DISPATCH, glob_files from dlt.common.utils import custom_environ, uniq_id @@ -43,10 +40,7 @@ def test_filesystem_configuration() -> None: config = FilesystemConfiguration(bucket_url="az://root") assert config.protocol == "az" # print(config.resolve_credentials_type()) - assert ( - config.resolve_credentials_type() - == Union[AzureCredentialsWithoutDefaults, AzureCredentials] - ) + assert config.resolve_credentials_type() == AnyAzureCredentials assert dict(config) == { "read_only": False, "bucket_url": "az://root",