From daad9dfa1c506762ba46d3b2f829e92383c22979 Mon Sep 17 00:00:00 2001
From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com>
Date: Wed, 29 May 2024 11:50:27 -0400
Subject: [PATCH] Redact storage options, as this may include user keys. (#322)

* Redact storage options, as this may include user keys.

* Add motivational comment.
---
 src/hipscat_import/catalog/file_readers.py        |  7 +++++++
 tests/hipscat_import/catalog/test_file_readers.py | 10 +++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hipscat_import/catalog/file_readers.py
index 9119f630..f3f27bf0 100644
--- a/src/hipscat_import/catalog/file_readers.py
+++ b/src/hipscat_import/catalog/file_readers.py
@@ -81,9 +81,19 @@ def read(self, input_file, read_columns=None):
     def provenance_info(self) -> dict:
         """Create dictionary of parameters for provenance tracking.
 
+        If any `storage_options` have been provided as kwargs, we will replace the
+        value with ``REDACTED`` for the purpose of writing to provenance info, as it
+        may contain user names or API keys. The redaction is applied to a copy of the
+        arguments, so the reader's own ``kwargs`` are left untouched.
+
         Returns:
             dictionary with all argument_name -> argument_value as key -> value pairs.
         """
-        return {"input_reader_type": type(self).__name__, **vars(self)}
+        # Redact on a copy: ``vars(self)`` is the live instance ``__dict__``, so assigning
+        # into it would replace the reader's real storage options as a side effect.
+        all_args = dict(vars(self))
+        if "kwargs" in all_args and "storage_options" in all_args["kwargs"]:
+            all_args["kwargs"] = {**all_args["kwargs"], "storage_options": "REDACTED"}
+        return {"input_reader_type": type(self).__name__, **all_args}
 
     def regular_file_exists(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **_kwargs):
diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py
index c7f76176..677b4a7d 100644
--- a/tests/hipscat_import/catalog/test_file_readers.py
+++ b/tests/hipscat_import/catalog/test_file_readers.py
@@ -220,12 +220,20 @@ def test_csv_reader_provenance_info(tmp_path, basic_catalog_info):
             "empty": "Int64",
             "numeric": int,
         },
+        storage_options={"user_name": "user_pii", "user_key": "SECRETS!"},
     )
     provenance_info = reader.provenance_info()
-    catalog_base_dir = os.path.join(tmp_path, "test_catalog")
+    catalog_base_dir = tmp_path / "test_catalog"
     os.makedirs(catalog_base_dir)
     io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)
 
+    with open(catalog_base_dir / "provenance_info.json", "r", encoding="utf-8") as file:
+        data = file.read()
+        assert "test_catalog" in data
+        assert "REDACTED" in data
+        assert "user_pii" not in data
+        assert "SECRETS" not in data
+
 
 def test_parquet_reader(parquet_shards_shard_44_0):
     """Verify we can read the parquet file into a single data frame."""