From a85005018448d551bdc9c7bd3c2c9d00e268aa0f Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Wed, 20 Nov 2024 11:49:25 -0500
Subject: [PATCH] feat(deid): keep some fields from
 DiagnosticReport.presentedForm

Specifically, treat it the same as we treat DocumentReference.content.attachment.
- Strip data & url into _data & _url data-absent-reason extensions.
- But keep all the interesting metadata like contentType and language.
- Add Binary scope when requesting DiagnosticReport scopes.
---
 cumulus_etl/deid/ms-config.json               |  2 +-
 cumulus_etl/deid/scrubber.py                  | 12 +++---
 cumulus_etl/fhir/fhir_client.py               |  5 ++-
 .../data/mstool/input/DiagnosticReport.ndjson |  2 +-
 .../mstool/output/DiagnosticReport.ndjson     |  3 +-
 tests/deid/test_deid_scrubber.py              | 39 +++++++++++++++++++
 6 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/cumulus_etl/deid/ms-config.json b/cumulus_etl/deid/ms-config.json
index e88dcacc..4bc32cde 100644
--- a/cumulus_etl/deid/ms-config.json
+++ b/cumulus_etl/deid/ms-config.json
@@ -172,7 +172,7 @@
     {"path": "DiagnosticReport.media.link", "method": "keep"},
     // Skip DiagnosticReport.conclusion
     {"path": "DiagnosticReport.conclusionCode", "method": "keep"},
-    // Skip DiagnosticReport.presentedForm (can add back later when/if we want to run NLP on it)
+    {"path": "DiagnosticReport.presentedForm", "method": "keep"}, // data & url will be masked later, after running NLP on it
 
     // ** DocumentReference: https://www.hl7.org/fhir/R4/documentreference.html **
     // Skip DocumentReference.masterIdentifier
diff --git a/cumulus_etl/deid/scrubber.py b/cumulus_etl/deid/scrubber.py
index cfb9e0b5..58a6e7d3 100644
--- a/cumulus_etl/deid/scrubber.py
+++ b/cumulus_etl/deid/scrubber.py
@@ -436,12 +436,14 @@ def _check_text(self, key: str, value: Any) -> Any:
     @staticmethod
     def _check_attachments(resource_type: str, node_path: str, key: str, value: Any) -> Any:
         """Strip any attachment data"""
-        if (
-            resource_type == "DocumentReference"
-            and node_path == "root.content.attachment"
-            and key in {"data", "url"}
+        if any(
+            (
+                (resource_type == "DiagnosticReport" and node_path == "root.presentedForm"),
+                (resource_type == "DocumentReference" and node_path == "root.content.attachment"),
+            )
         ):
-            raise MaskValue
+            if key in {"data", "url"}:
+                raise MaskValue
 
         return value
 
diff --git a/cumulus_etl/fhir/fhir_client.py b/cumulus_etl/fhir/fhir_client.py
index 5b11b559..9a0fb7f2 100644
--- a/cumulus_etl/fhir/fhir_client.py
+++ b/cumulus_etl/fhir/fhir_client.py
@@ -283,8 +283,9 @@ def create_fhir_client_for_cli(
         raise SystemExit(errors.ARGS_INVALID) from exc
 
     client_resources = set(resources)
-    if "DocumentReference" in client_resources:
-        # A DocumentReference scope implies a Binary scope as well, since we'll usually need to download attachments
+    if {"DiagnosticReport", "DocumentReference"} & client_resources:
+        # Resources with attachments imply a Binary scope as well,
+        # since we'll usually need to download the referenced content.
         client_resources.add("Binary")
 
     return FhirClient(
diff --git a/tests/data/mstool/input/DiagnosticReport.ndjson b/tests/data/mstool/input/DiagnosticReport.ndjson
index 184337f4..4536f824 100644
--- a/tests/data/mstool/input/DiagnosticReport.ndjson
+++ b/tests/data/mstool/input/DiagnosticReport.ndjson
@@ -20,5 +20,5 @@
   }],
   "conclusion" : "dropped",
   "conclusionCode" : [{ "text": "kept" }],
-  "presentedForm" : [{ "title": "dropped" }]
+  "presentedForm" : [{ "data": "xxx", "title": "dropped" }]
 }
\ No newline at end of file
diff --git a/tests/data/mstool/output/DiagnosticReport.ndjson b/tests/data/mstool/output/DiagnosticReport.ndjson
index 9bba3b06..a135430e 100644
--- a/tests/data/mstool/output/DiagnosticReport.ndjson
+++ b/tests/data/mstool/output/DiagnosticReport.ndjson
@@ -23,5 +23,6 @@
   "media" : [{
     "link" : { "reference": "Media/x" }
   }],
-  "conclusionCode" : [{ "text": "kept" }]
+  "conclusionCode" : [{ "text": "kept" }],
+  "presentedForm" : [{ "data": "xxx" }]
 }
\ No newline at end of file
diff --git a/tests/deid/test_deid_scrubber.py b/tests/deid/test_deid_scrubber.py
index c26d46cb..e9afdc76 100644
--- a/tests/deid/test_deid_scrubber.py
+++ b/tests/deid/test_deid_scrubber.py
@@ -66,6 +66,45 @@ def test_condition(self):
             f"Encounter/{scrubber.codebook.fake_id('Encounter', '67890')}",
         )
 
+    def test_diagnosticreport(self):
+        """Verify a basic DiagnosticReport has attachments stripped"""
+        report = {
+            "resourceType": "DiagnosticReport",
+            "id": "dr1",
+            "presentedForm": [
+                {
+                    "data": "blarg",
+                    "language": "en",
+                    "size": 5,
+                },
+                {
+                    "url": "https://example.com/",
+                    "contentType": "text/plain",
+                },
+            ],
+        }
+
+        scrubber = Scrubber()
+        self.assertTrue(scrubber.scrub_resource(report))
+        self.assertEqual(
+            report,
+            {
+                "resourceType": "DiagnosticReport",
+                "id": scrubber.codebook.fake_id("DiagnosticReport", "dr1"),
+                "presentedForm": [
+                    {
+                        "_data": MASKED_EXTENSION,
+                        "language": "en",
+                        "size": 5,
+                    },
+                    {
+                        "_url": MASKED_EXTENSION,
+                        "contentType": "text/plain",
+                    },
+                ],
+            },
+        )
+
     def test_documentreference(self):
         """Test DocumentReference, which is interesting because of its list of encounters and attachments"""
         docref = {