From 1714801ca2f4b0de34c74fdc9fb1f826de28d45f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Sun, 15 Sep 2024 00:00:23 +0530 Subject: [PATCH] Masks secrets in traces. (#1797) * Masks secrets in traces. * tests that secrets are masked in stringified trace * generates secrets in deployments from dlt.secrets provider instead of pipeline trace * corrects masking and looks up secret value in dlt.secrets * removes secret masking and replaces credentials with None. * fixes deploy help when deploy type missing * fixes always_choose restore defaults in echo * tests deploy command with and without secrets * fixes dumping secret vals for toml in deploy --------- Co-authored-by: Marcin Rudolf --- dlt/cli/_dlt.py | 18 ++++++---- dlt/cli/deploy_command.py | 9 ++--- dlt/cli/deploy_command_helpers.py | 45 +++++++++++++++++++++-- dlt/cli/echo.py | 8 +++-- dlt/pipeline/trace.py | 8 ++--- tests/cli/test_deploy_command.py | 52 +++++++++++++++------------ tests/common/configuration/utils.py | 17 ++------- tests/pipeline/test_pipeline_trace.py | 9 ++--- tests/utils.py | 24 ++++++++++++- 9 files changed, 124 insertions(+), 66 deletions(-) diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 7513ce7c64..0a4a86b9de 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -617,13 +617,17 @@ def main() -> int: elif args.command == "deploy": try: deploy_args = vars(args) - return deploy_command_wrapper( - pipeline_script_path=deploy_args.pop("pipeline_script_path"), - deployment_method=deploy_args.pop("deployment_method"), - repo_location=deploy_args.pop("location"), - branch=deploy_args.pop("branch"), - **deploy_args, - ) + if deploy_args.get("deployment_method") is None: + print_help(deploy_cmd) + return -1 + else: + return deploy_command_wrapper( + pipeline_script_path=deploy_args.pop("pipeline_script_path"), + deployment_method=deploy_args.pop("deployment_method"), + repo_location=deploy_args.pop("location"), + branch=deploy_args.pop("branch"), + **deploy_args, + ) except (NameError, KeyError): fmt.warning( "Please install additional command line dependencies to use deploy command:" diff --git a/dlt/cli/deploy_command.py b/dlt/cli/deploy_command.py index 5a25752a6d..b48dffa881 100644 --- a/dlt/cli/deploy_command.py +++ b/dlt/cli/deploy_command.py @@ -4,7 +4,7 @@ from enum import Enum from importlib.metadata import version as pkg_version -from dlt.common.configuration.providers import SECRETS_TOML, SECRETS_TOML_KEY, StringTomlProvider +from dlt.common.configuration.providers import SECRETS_TOML, SECRETS_TOML_KEY from dlt.common.configuration.paths import make_dlt_settings_path from dlt.common.configuration.utils import serialize_value from dlt.common.git import is_dirty @@ -393,12 +393,7 @@ def _echo_instructions(self, *args: Optional[Any]) -> None: f" {SECRETS_TOML_KEY} variable." ) fmt.echo() - toml_provider = StringTomlProvider("") - for s_v in self.secret_envs: - toml_provider.set_value(s_v.key, s_v.value, None, *s_v.sections) - for s_v in self.envs: - toml_provider.set_value(s_v.key, s_v.value, None, *s_v.sections) - fmt.echo(toml_provider.dumps()) + self._echo_secrets_toml() else: raise ValueError(self.secrets_format) diff --git a/dlt/cli/deploy_command_helpers.py b/dlt/cli/deploy_command_helpers.py index 5fe46415dd..2afbfbf46e 100644 --- a/dlt/cli/deploy_command_helpers.py +++ b/dlt/cli/deploy_command_helpers.py @@ -14,8 +14,9 @@ import dlt from dlt.common import git -from dlt.common.configuration.exceptions import LookupTrace +from dlt.common.configuration.exceptions import LookupTrace, ConfigFieldMissingException from dlt.common.configuration.providers import ConfigTomlProvider, EnvironProvider +from dlt.common.configuration.providers.toml import BaseDocProvider, StringTomlProvider from dlt.common.git import get_origin, get_repo, Repo from dlt.common.configuration.specs.run_configuration import get_default_pipeline_name from dlt.common.typing import StrAny @@ -198,12 +199,50 @@ def _update_envs(self, trace: PipelineTrace) -> None: # fmt.echo(f"{resolved_value.key} in {resolved_value.sections} moved to CONFIG") def _echo_secrets(self) -> None: + display_info = False for s_v in self.secret_envs: fmt.secho("Name:", fg="green") fmt.echo(fmt.bold(self.env_prov.get_key_name(s_v.key, *s_v.sections))) - fmt.secho("Secret:", fg="green") - fmt.echo(s_v.value) + try: + fmt.secho("Secret:", fg="green") + fmt.echo(self._lookup_secret_value(s_v)) + except ConfigFieldMissingException: + fmt.secho("please set me up!", fg="red") + display_info = True fmt.echo() + if display_info: + self._display_missing_secret_info() + fmt.echo() + + def _echo_secrets_toml(self) -> None: + display_info = False + toml_provider = StringTomlProvider("") + for s_v in self.secret_envs: + try: + secret_value = self._lookup_secret_value(s_v) + except ConfigFieldMissingException: + secret_value = "please set me up!" + display_info = True + toml_provider.set_value(s_v.key, secret_value, None, *s_v.sections) + for s_v in self.envs: + toml_provider.set_value(s_v.key, s_v.value, None, *s_v.sections) + fmt.echo(toml_provider.dumps()) + if display_info: + self._display_missing_secret_info() + fmt.echo() + + def _display_missing_secret_info(self) -> None: + fmt.warning( + "We could not read and display some secrets. Starting from 1.0 version of dlt," + " those are not stored in the traces. Instead we are trying to read them from the" + " available configuration ie. secrets.toml file. Please run the deploy command from" + " the same working directory you ran your pipeline script. If you pass the" + " credentials in code we will not be able to display them here. See" + " https://dlthub.com/docs/general-usage/credentials" + ) + + def _lookup_secret_value(self, trace: LookupTrace) -> Any: + return dlt.secrets[BaseDocProvider.get_key_name(trace.key, *trace.sections)] def _echo_envs(self) -> None: for v in self.envs: diff --git a/dlt/cli/echo.py b/dlt/cli/echo.py index bd9cf24f64..302b74b076 100644 --- a/dlt/cli/echo.py +++ b/dlt/cli/echo.py @@ -15,9 +15,11 @@ def always_choose(always_choose_default: bool, always_choose_value: Any) -> Iter _always_choose_value = ALWAYS_CHOOSE_VALUE ALWAYS_CHOOSE_DEFAULT = always_choose_default ALWAYS_CHOOSE_VALUE = always_choose_value - yield - ALWAYS_CHOOSE_DEFAULT = _always_choose_default - ALWAYS_CHOOSE_VALUE = _always_choose_value + try: + yield + finally: + ALWAYS_CHOOSE_DEFAULT = _always_choose_default + ALWAYS_CHOOSE_VALUE = _always_choose_value echo = click.echo diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 2f857e5fd5..c47926e5f4 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -3,14 +3,14 @@ import os import pickle import datetime # noqa: 251 -from typing import Any, List, NamedTuple, Optional, Protocol, Sequence +from typing import Any, List, NamedTuple, Optional, Protocol, Sequence, Union import humanize from dlt.common.pendulum import pendulum from dlt.common.configuration import is_secret_hint from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.utils import _RESOLVED_TRACES +from dlt.common.configuration.utils import _RESOLVED_TRACES, ResolvedValueTrace from dlt.common.configuration.container import Container from dlt.common.exceptions import ExceptionTrace, ResourceNameNotAvailable from dlt.common.logger import suppress_and_warn @@ -280,8 +280,8 @@ def end_trace_step( resolved_values = map( lambda v: SerializableResolvedValueTrace( v.key, - v.value, - v.default_value, + None if is_secret_hint(v.hint) else v.value, + None if is_secret_hint(v.hint) else v.default_value, is_secret_hint(v.hint), v.sections, v.provider_name, diff --git a/tests/cli/test_deploy_command.py b/tests/cli/test_deploy_command.py index 685921ca6e..78a14ee914 100644 --- a/tests/cli/test_deploy_command.py +++ b/tests/cli/test_deploy_command.py @@ -19,7 +19,7 @@ from dlt.pipeline.exceptions import CannotRestorePipelineException from dlt.cli.deploy_command_helpers import get_schedule_description -from tests.utils import TEST_STORAGE_ROOT, test_storage +from tests.utils import TEST_STORAGE_ROOT, reset_providers, test_storage DEPLOY_PARAMS = [ @@ -134,28 +134,34 @@ def test_deploy_command( test_storage.delete(".dlt/secrets.toml") test_storage.atomic_rename(".dlt/secrets.toml.ci", ".dlt/secrets.toml") - # this time script will run - venv.run_script("debug_pipeline.py") - with echo.always_choose(False, always_choose_value=True): - with io.StringIO() as buf, contextlib.redirect_stdout(buf): - deploy_command.deploy_command( - "debug_pipeline.py", - deployment_method, - deploy_command.COMMAND_DEPLOY_REPO_LOCATION, - **deployment_args - ) - _out = buf.getvalue() - print(_out) - # make sure our secret and config values are all present - assert "api_key_9x3ehash" in _out - assert "dlt_data" in _out - if "schedule" in deployment_args: - assert get_schedule_description(deployment_args["schedule"]) - secrets_format = deployment_args.get("secrets_format", "env") - if secrets_format == "env": - assert "API_KEY" in _out - else: - assert "api_key = " in _out + # reset toml providers to (1) CWD (2) non existing dir so API_KEY is not found + for project_dir, api_key in [ + (None, "api_key_9x3ehash"), + (".", "please set me up!"), + ]: + with reset_providers(project_dir=project_dir): + # this time script will run + venv.run_script("debug_pipeline.py") + with echo.always_choose(False, always_choose_value=True): + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + deploy_command.deploy_command( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) + _out = buf.getvalue() + print(_out) + # make sure our secret and config values are all present + assert api_key in _out + assert "dlt_data" in _out + if "schedule" in deployment_args: + assert get_schedule_description(deployment_args["schedule"]) + secrets_format = deployment_args.get("secrets_format", "env") + if secrets_format == "env": + assert "API_KEY" in _out + else: + assert "api_key = " in _out # non existing script name with pytest.raises(NoSuchPathError): diff --git a/tests/common/configuration/utils.py b/tests/common/configuration/utils.py index d7a46e9084..677ec3d329 100644 --- a/tests/common/configuration/utils.py +++ b/tests/common/configuration/utils.py @@ -20,15 +20,11 @@ from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.container import Container -from dlt.common.configuration.providers import ( - ConfigProvider, - EnvironProvider, - ConfigTomlProvider, - SecretsTomlProvider, -) +from dlt.common.configuration.providers import ConfigProvider, EnvironProvider from dlt.common.configuration.utils import get_resolved_traces from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.typing import TSecretValue, StrAny +from tests.utils import _reset_providers @configspec @@ -118,14 +114,7 @@ def env_provider() -> Iterator[ConfigProvider]: @pytest.fixture def toml_providers() -> Iterator[ConfigProvidersContext]: - pipeline_root = "./tests/common/cases/configuration/.dlt" - ctx = ConfigProvidersContext() - ctx.providers.clear() - ctx.add_provider(EnvironProvider()) - ctx.add_provider(SecretsTomlProvider(project_dir=pipeline_root)) - ctx.add_provider(ConfigTomlProvider(project_dir=pipeline_root)) - with Container().injectable_context(ctx): - yield ctx + yield from _reset_providers("./tests/common/cases/configuration/.dlt") class MockProvider(ConfigProvider): diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 9c12519a89..433913851f 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -115,17 +115,18 @@ def data(): assert resolved.is_secret_hint is False assert resolved.default_value is None assert resolved.provider_name == "config.toml" - # dictionaries are not returned anymore + # dictionaries are not returned anymore, secrets are masked resolved = _find_resolved_value(trace.resolved_config_values, "credentials", []) assert resolved is None or isinstance(resolved.value, str) resolved = _find_resolved_value(trace.resolved_config_values, "secret_value", []) assert resolved.is_secret_hint is True - assert resolved.value == "2137" - assert resolved.default_value == "123" + assert resolved.value is None, "Credential is not masked" + assert resolved.default_value is None, "Credential is not masked" resolved = _find_resolved_value(trace.resolved_config_values, "credentials", ["databricks"]) assert resolved.is_secret_hint is True - assert resolved.value == databricks_creds + assert resolved.value is None, "Credential is not masked" assert_trace_serializable(trace) + # activate pipeline because other was running in assert trace p.activate() diff --git a/tests/utils.py b/tests/utils.py index e90ac5a626..813deea69f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +import contextlib import multiprocessing import os import platform @@ -12,7 +13,12 @@ import dlt from dlt.common.configuration.container import Container -from dlt.common.configuration.providers import DictionaryProvider +from dlt.common.configuration.providers import ( + DictionaryProvider, + EnvironProvider, + SecretsTomlProvider, + ConfigTomlProvider, +) from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs import RunConfiguration from dlt.common.configuration.specs.config_providers_context import ( @@ -382,3 +388,19 @@ def assert_query_data( # the second is load id if info: assert row[1] in info.loads_ids + + +@contextlib.contextmanager +def reset_providers(project_dir: str) -> Iterator[ConfigProvidersContext]: + """Context manager injecting standard set of providers where toml providers are initialized from `project_dir`""" + return _reset_providers(project_dir) + + +def _reset_providers(project_dir: str) -> Iterator[ConfigProvidersContext]: + ctx = ConfigProvidersContext() + ctx.providers.clear() + ctx.add_provider(EnvironProvider()) + ctx.add_provider(SecretsTomlProvider(project_dir=project_dir)) + ctx.add_provider(ConfigTomlProvider(project_dir=project_dir)) + with Container().injectable_context(ctx): + yield ctx