From a85005018448d551bdc9c7bd3c2c9d00e268aa0f Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Wed, 20 Nov 2024 11:49:25 -0500 Subject: [PATCH] feat(deid): keep some fields from DiagnosticReport.presentedForm Specifically, treat it the same as we treat DocumentReference.content. - Strip data & url into _data & _url data-absent-reason extensions. - But keep all the interesting metadata like contentType and language. - Add Binary scope when requesting DiagnosticReport scopes. --- cumulus_etl/deid/ms-config.json | 2 +- cumulus_etl/deid/scrubber.py | 12 +++--- cumulus_etl/fhir/fhir_client.py | 5 ++- .../data/mstool/input/DiagnosticReport.ndjson | 2 +- .../mstool/output/DiagnosticReport.ndjson | 3 +- tests/deid/test_deid_scrubber.py | 39 +++++++++++++++++++ 6 files changed, 53 insertions(+), 10 deletions(-) diff --git a/cumulus_etl/deid/ms-config.json b/cumulus_etl/deid/ms-config.json index e88dcacc..4bc32cde 100644 --- a/cumulus_etl/deid/ms-config.json +++ b/cumulus_etl/deid/ms-config.json @@ -172,7 +172,7 @@ {"path": "DiagnosticReport.media.link", "method": "keep"}, // Skip DiagnosticReport.conclusion {"path": "DiagnosticReport.conclusionCode", "method": "keep"}, - // Skip DiagnosticReport.presentedForm (can add back later when/if we want to run NLP on it) + {"path": "DiagnosticReport.presentedForm", "method": "keep"}, // will be dropped later after running NLP on it // ** DocumentReference: https://www.hl7.org/fhir/R4/documentreference.html ** // Skip DocumentReference.masterIdentifier diff --git a/cumulus_etl/deid/scrubber.py b/cumulus_etl/deid/scrubber.py index cfb9e0b5..58a6e7d3 100644 --- a/cumulus_etl/deid/scrubber.py +++ b/cumulus_etl/deid/scrubber.py @@ -436,12 +436,14 @@ def _check_text(self, key: str, value: Any) -> Any: @staticmethod def _check_attachments(resource_type: str, node_path: str, key: str, value: Any) -> Any: """Strip any attachment data""" - if ( - resource_type == "DocumentReference" - and node_path == "root.content.attachment" - and key in {"data", "url"} + if any( + ( + (resource_type == "DiagnosticReport" and node_path == "root.presentedForm"), + (resource_type == "DocumentReference" and node_path == "root.content.attachment"), + ) ): - raise MaskValue + if key in {"data", "url"}: + raise MaskValue return value diff --git a/cumulus_etl/fhir/fhir_client.py b/cumulus_etl/fhir/fhir_client.py index 5b11b559..9a0fb7f2 100644 --- a/cumulus_etl/fhir/fhir_client.py +++ b/cumulus_etl/fhir/fhir_client.py @@ -283,8 +283,9 @@ def create_fhir_client_for_cli( raise SystemExit(errors.ARGS_INVALID) from exc client_resources = set(resources) - if "DocumentReference" in client_resources: - # A DocumentReference scope implies a Binary scope as well, since we'll usually need to download attachments + if {"DiagnosticReport", "DocumentReference"} & client_resources: + # Resources with attachments imply a Binary scope as well, + # since we'll usually need to download the referenced content. client_resources.add("Binary") return FhirClient( diff --git a/tests/data/mstool/input/DiagnosticReport.ndjson b/tests/data/mstool/input/DiagnosticReport.ndjson index 184337f4..4536f824 100644 --- a/tests/data/mstool/input/DiagnosticReport.ndjson +++ b/tests/data/mstool/input/DiagnosticReport.ndjson @@ -20,5 +20,5 @@ }], "conclusion" : "dropped", "conclusionCode" : [{ "text": "kept" }], - "presentedForm" : [{ "title": "dropped" }] + "presentedForm" : [{ "data": "xxx", "title": "dropped" }] } \ No newline at end of file diff --git a/tests/data/mstool/output/DiagnosticReport.ndjson b/tests/data/mstool/output/DiagnosticReport.ndjson index 9bba3b06..a135430e 100644 --- a/tests/data/mstool/output/DiagnosticReport.ndjson +++ b/tests/data/mstool/output/DiagnosticReport.ndjson @@ -23,5 +23,6 @@ "media" : [{ "link" : { "reference": "Media/x" } }], - "conclusionCode" : [{ "text": "kept" }] + "conclusionCode" : [{ "text": "kept" }], + "presentedForm" : [{ "data": "xxx" }] } \ No newline at end of file diff --git a/tests/deid/test_deid_scrubber.py b/tests/deid/test_deid_scrubber.py index c26d46cb..e9afdc76 100644 --- a/tests/deid/test_deid_scrubber.py +++ b/tests/deid/test_deid_scrubber.py @@ -66,6 +66,45 @@ def test_condition(self): f"Encounter/{scrubber.codebook.fake_id('Encounter', '67890')}", ) + def test_diagnosticreport(self): + """Verify a basic DiagnosticReport has attachments stripped""" + report = { + "resourceType": "DiagnosticReport", + "id": "dr1", + "presentedForm": [ + { + "data": "blarg", + "language": "en", + "size": 5, + }, + { + "url": "https://example.com/", + "contentType": "text/plain", + }, + ], + } + + scrubber = Scrubber() + self.assertTrue(scrubber.scrub_resource(report)) + self.assertEqual( + report, + { + "resourceType": "DiagnosticReport", + "id": scrubber.codebook.fake_id("DiagnosticReport", "dr1"), + "presentedForm": [ + { + "_data": MASKED_EXTENSION, + "language": "en", + "size": 5, + }, + { + "_url": MASKED_EXTENSION, + "contentType": "text/plain", + }, + ], + }, + ) + def test_documentreference(self): """Test DocumentReference, which is interesting because of its list of encounters and attachments""" docref = {