From 4b782f2ac8678f72439fdba4f26e3d41dd274b9b Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Thu, 7 Nov 2024 10:11:21 -0500 Subject: [PATCH] feat: mark data/url fields inside DocRef attachments as absent When we strip the data & url fields from DocumentReferences, we previously just deleted the fields and moved on. But in order to be able to do some QA analysis on whether those fields were provided in the first place, it's nice to have a record of them. So whenever we delete those fields, we also now leave a data-absent extension in place with the value "masked". --- cumulus_etl/deid/scrubber.py | 27 ++++++- .../documentreference.000.ndjson | 4 +- .../documentreference.000.ndjson | 2 +- .../documentreference.001.ndjson | 2 +- .../documentreference.000.ndjson | 4 +- tests/deid/test_deid_scrubber.py | 80 +++++++++++++++++-- 6 files changed, 104 insertions(+), 15 deletions(-) diff --git a/cumulus_etl/deid/scrubber.py b/cumulus_etl/deid/scrubber.py index fac259f9..0356a8ea 100644 --- a/cumulus_etl/deid/scrubber.py +++ b/cumulus_etl/deid/scrubber.py @@ -23,6 +23,10 @@ class SkipValue(Exception): pass +class MaskValue(SkipValue): + pass + + class Scrubber: """ Manages de-identification for FHIR resources. @@ -160,6 +164,9 @@ def _scrub_node( inside_extension=inside_extension, ) ) + except MaskValue: + # TODO: (not needed yet) support masking values inside array fields + self._add_data_absent_extension(node, f"_{key}") except SkipValue: pass @@ -224,6 +231,24 @@ def _print_extension_table(self, title: str, table: ExtensionCount) -> None: indented = rich.padding.Padding.indent(tree, 1) rich.get_console().print(indented) + def _add_data_absent_extension(self, node: dict, parent: str) -> None: + element = node.setdefault(parent, {}) + extensions = element.setdefault("extension", []) + + # Check if the value is already marked as absent for any reason - leave it in place. + # (though that would be weird, since the field was present or we wouldn't be in this path) + for extension in extensions: + if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/data-absent-reason": + return + + # See https://hl7.org/fhir/extensions/StructureDefinition-data-absent-reason.html + extensions.append( + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "masked", + } + ) + ############################################################################### # # Individual checkers @@ -402,7 +427,7 @@ def _check_attachments(resource_type: str, node_path: str, key: str, value: Any) and node_path == "root.content.attachment" and key in {"data", "url"} ): - raise SkipValue + raise MaskValue return value diff --git a/tests/data/i2b2/output/documentreference/documentreference.000.ndjson b/tests/data/i2b2/output/documentreference/documentreference.000.ndjson index 139b538a..9090bfde 100644 --- a/tests/data/i2b2/output/documentreference/documentreference.000.ndjson +++ b/tests/data/i2b2/output/documentreference/documentreference.000.ndjson @@ -1,2 +1,2 @@ -{"resourceType":"DocumentReference","id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0","subject":{"reference":"Patient\/26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb"},"context":{"encounter":[{"reference":"Encounter\/5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f"}],"period":{"start":"2021-06-23","end":"2021-06-24"}},"type":{"coding":[{"code":"NOTE:149798455","system":"http:\/\/cumulus.smarthealthit.org\/i2b2","display":"Admission MD"}]},"status":"current","content":[{"attachment":{"contentType":"text\/plain"}}]} -{"resourceType":"DocumentReference","id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588","subject":{"reference":"Patient\/49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7"},"context":{"encounter":[{"reference":"Encounter\/fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687"}],"period":{"start":"2021-06-24","end":"2021-06-25"}},"type":{"coding":[{"code":"NOTE:149798455","system":"http:\/\/cumulus.smarthealthit.org\/i2b2","display":"Admission MD"}]},"status":"current","content":[{"attachment":{"contentType":"text\/plain"}}]} +{"resourceType": "DocumentReference", "id": "228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0", "subject": {"reference": "Patient/26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb"}, "context": {"encounter": [{"reference": "Encounter/5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f"}], "period": {"start": "2021-06-23", "end": "2021-06-24"}}, "type": {"coding": [{"code": "NOTE:149798455", "system": "http://cumulus.smarthealthit.org/i2b2", "display": "Admission MD"}]}, "status": "current", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}]} +{"resourceType": "DocumentReference", "id": "dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588", "subject": {"reference": "Patient/49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7"}, "context": {"encounter": [{"reference": "Encounter/fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687"}], "period": {"start": "2021-06-24", "end": "2021-06-25"}}, "type": {"coding": [{"code": "NOTE:149798455", "system": "http://cumulus.smarthealthit.org/i2b2", "display": "Admission MD"}]}, "status": "current", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}]} diff --git a/tests/data/simple/batched-output/documentreference/documentreference.000.ndjson b/tests/data/simple/batched-output/documentreference/documentreference.000.ndjson index b289375e..d732b323 100644 --- a/tests/data/simple/batched-output/documentreference/documentreference.000.ndjson +++ b/tests/data/simple/batched-output/documentreference/documentreference.000.ndjson @@ -1 +1 @@ -{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/d30aad4b-4503-8e22-0bc4-621b94398520"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/118dc10e-7745-20d7-e98d-7c358a84c15c"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"} +{"resourceType": "DocumentReference", "id": "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/d30aad4b-4503-8e22-0bc4-621b94398520"}], "period": {"end": "2021-06-24", "start": "2021-06-23"}}, "status": "current", "subject": {"reference": "Patient/118dc10e-7745-20d7-e98d-7c358a84c15c"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}} diff --git a/tests/data/simple/batched-output/documentreference/documentreference.001.ndjson b/tests/data/simple/batched-output/documentreference/documentreference.001.ndjson index 5f1a69c2..ec78ea8d 100644 --- a/tests/data/simple/batched-output/documentreference/documentreference.001.ndjson +++ b/tests/data/simple/batched-output/documentreference/documentreference.001.ndjson @@ -1 +1 @@ -{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"} +{"resourceType": "DocumentReference", "id": "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}], "period": {"end": "2021-06-25", "start": "2021-06-24"}}, "status": "current", "subject": {"reference": "Patient/1de9ea66-70d3-da1f-c735-df5ef7697fb9"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}} diff --git a/tests/data/simple/output/documentreference/documentreference.000.ndjson b/tests/data/simple/output/documentreference/documentreference.000.ndjson index 8fa2914d..2adcb213 100644 --- a/tests/data/simple/output/documentreference/documentreference.000.ndjson +++ b/tests/data/simple/output/documentreference/documentreference.000.ndjson @@ -1,2 +1,2 @@ -{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/d30aad4b-4503-8e22-0bc4-621b94398520"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/118dc10e-7745-20d7-e98d-7c358a84c15c"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"} -{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"} +{"resourceType": "DocumentReference", "id": "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/d30aad4b-4503-8e22-0bc4-621b94398520"}], "period": {"end": "2021-06-24", "start": "2021-06-23"}}, "status": "current", "subject": {"reference": "Patient/118dc10e-7745-20d7-e98d-7c358a84c15c"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}} +{"resourceType": "DocumentReference", "id": "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}], "period": {"end": "2021-06-25", "start": "2021-06-24"}}, "status": "current", "subject": {"reference": "Patient/1de9ea66-70d3-da1f-c735-df5ef7697fb9"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}} diff --git a/tests/deid/test_deid_scrubber.py b/tests/deid/test_deid_scrubber.py index dcecfaf3..5ab881ae 100644 --- a/tests/deid/test_deid_scrubber.py +++ b/tests/deid/test_deid_scrubber.py @@ -58,13 +58,36 @@ def test_condition(self): def test_documentreference(self): """Test DocumentReference, which is interesting because of its list of encounters and attachments""" - docref = i2b2_mock_data.documentreference() - self.assertEqual("345", docref["id"]) - self.assertEqual("Patient/12345", docref["subject"]["reference"]) - self.assertEqual(1, len(docref["context"]["encounter"])) - self.assertEqual("Encounter/67890", docref["context"]["encounter"][0]["reference"]) - self.assertEqual(1, len(docref["content"])) - self.assertIsNotNone(docref["content"][0]["attachment"]["data"]) + docref = { + "resourceType": "DocumentReference", + "id": "345", + "subject": {"reference": "Patient/12345"}, + "context": { + "encounter": [{"reference": "Encounter/67890"}], + }, + "content": [ + { + "attachment": { + "data": "aGVsbG8gd29ybGQ=", + "url": "https://example.com/hello-world", + }, + }, + { + "attachment": { + "data": "xxx", + "_data": { + "extension": [ + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "error", + } + ], + }, + "url": "https://example.com/hello-world", + }, + }, + ], + } scrubber = Scrubber() self.assertTrue(scrubber.scrub_resource(docref)) @@ -77,7 +100,48 @@ def test_documentreference(self): docref["context"]["encounter"][0]["reference"], f"Encounter/{scrubber.codebook.fake_id('Encounter', '67890')}", ) - self.assertNotIn("data", docref["content"][0]["attachment"]) + self.assertEqual( + docref["content"][0]["attachment"], + { + "_data": { + "extension": [ + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "masked", + } + ] + }, + "_url": { + "extension": [ + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "masked", + } + ] + }, + }, + ) + self.assertEqual( + docref["content"][1]["attachment"], + { + "_data": { + "extension": [ + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "error", # we left this reason in place + } + ] + }, + "_url": { + "extension": [ + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "masked", + } + ] + }, + }, + ) def test_contained_reference(self): """Verify that we leave contained references contained but scrubbed"""