From daad9dfa1c506762ba46d3b2f829e92383c22979 Mon Sep 17 00:00:00 2001
From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com>
Date: Wed, 29 May 2024 11:50:27 -0400
Subject: [PATCH] Redact storage options, as this may include user keys. (#322)

* Redact storage options, as this may include user keys.

* Add motivational comment.

* Redact on a copy so the reader keeps its real storage options.
---
 src/hipscat_import/catalog/file_readers.py        | 15 ++++++++++++++-
 tests/hipscat_import/catalog/test_file_readers.py | 13 ++++++++++++-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hipscat_import/catalog/file_readers.py
index 9119f630..f3f27bf0 100644
--- a/src/hipscat_import/catalog/file_readers.py
+++ b/src/hipscat_import/catalog/file_readers.py
@@ -81,9 +81,22 @@ def read(self, input_file, read_columns=None):
     def provenance_info(self) -> dict:
         """Create dictionary of parameters for provenance tracking.
 
+        If any `storage_options` have been provided as kwargs, we will replace the
+        value with ``REDACTED`` for the purpose of writing to provenance info, as it
+        may contain user names or API keys.
+
+        The redaction is applied to a copy, so the reader's own ``kwargs`` (and the
+        real ``storage_options`` needed for later reads) are left untouched.
+
         Returns:
             dictionary with all argument_name -> argument_value as key -> value pairs.
         """
-        return {"input_reader_type": type(self).__name__, **vars(self)}
+        # vars(self) is the live instance __dict__; copy before redacting so the
+        # reader can still authenticate after provenance has been generated.
+        all_args = dict(vars(self))
+        kwargs = all_args.get("kwargs")
+        if isinstance(kwargs, dict) and "storage_options" in kwargs:
+            all_args["kwargs"] = {**kwargs, "storage_options": "REDACTED"}
+        return {"input_reader_type": type(self).__name__, **all_args}
 
     def regular_file_exists(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **_kwargs):
diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py
index c7f76176..677b4a7d 100644
--- a/tests/hipscat_import/catalog/test_file_readers.py
+++ b/tests/hipscat_import/catalog/test_file_readers.py
@@ -220,12 +220,23 @@ def test_csv_reader_provenance_info(tmp_path, basic_catalog_info):
             "empty": "Int64",
             "numeric": int,
         },
+        storage_options={"user_name": "user_pii", "user_key": "SECRETS!"},
     )
     provenance_info = reader.provenance_info()
-    catalog_base_dir = os.path.join(tmp_path, "test_catalog")
+    catalog_base_dir = tmp_path / "test_catalog"
     os.makedirs(catalog_base_dir)
     io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)
 
+    with open(catalog_base_dir / "provenance_info.json", "r", encoding="utf-8") as file:
+        data = file.read()
+        assert "test_catalog" in data
+        assert "REDACTED" in data
+        assert "user_pii" not in data
+        assert "SECRETS" not in data
+
+    # Redaction must not clobber the options the reader itself still needs.
+    assert reader.kwargs["storage_options"] == {"user_name": "user_pii", "user_key": "SECRETS!"}
+
 
 def test_parquet_reader(parquet_shards_shard_44_0):
     """Verify we can read the parquet file into a single data frame."""