Skip to content

Commit

Permalink
feat: mark data/url fields inside DocRef attachments as absent
Browse files Browse the repository at this point in the history
Previously, when we stripped the data & url fields from DocumentReferences,
we just deleted the fields and moved on.

But in order to be able to do some QA analysis on whether those fields
were provided in the first place, it's nice to have a record of them.

So whenever we delete those fields, we now also leave a data-absent-reason
extension in place with the value "masked".
  • Loading branch information
mikix committed Nov 7, 2024
1 parent 352ab34 commit 4b782f2
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 15 deletions.
27 changes: 26 additions & 1 deletion cumulus_etl/deid/scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ class SkipValue(Exception):
pass


class MaskValue(SkipValue):
    """
    Signal that a value should be dropped and recorded as masked.

    Because this derives from SkipValue, any handler that catches SkipValue also catches
    this exception. _scrub_node handles it separately first, so that it can add a
    data-absent-reason extension on the sibling "_<key>" element of the node.
    """


class Scrubber:
"""
Manages de-identification for FHIR resources.
Expand Down Expand Up @@ -160,6 +164,9 @@ def _scrub_node(
inside_extension=inside_extension,
)
)
except MaskValue:
# TODO: (not needed yet) support masking values inside array fields
self._add_data_absent_extension(node, f"_{key}")
except SkipValue:
pass

Expand Down Expand Up @@ -224,6 +231,24 @@ def _print_extension_table(self, title: str, table: ExtensionCount) -> None:
indented = rich.padding.Padding.indent(tree, 1)
rich.get_console().print(indented)

def _add_data_absent_extension(self, node: dict, parent: str) -> None:
element = node.setdefault(parent, {})
extensions = element.setdefault("extension", [])

# Check if the value is already marked as absent for any reason - leave it in place.
# (though that would be weird, since the field was present or we wouldn't be in this path)
for extension in extensions:
if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/data-absent-reason":
return

# See https://hl7.org/fhir/extensions/StructureDefinition-data-absent-reason.html
extensions.append(
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
)

###############################################################################
#
# Individual checkers
Expand Down Expand Up @@ -402,7 +427,7 @@ def _check_attachments(resource_type: str, node_path: str, key: str, value: Any)
and node_path == "root.content.attachment"
and key in {"data", "url"}
):
raise SkipValue
raise MaskValue

return value

Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"resourceType":"DocumentReference","id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0","subject":{"reference":"Patient\/26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb"},"context":{"encounter":[{"reference":"Encounter\/5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f"}],"period":{"start":"2021-06-23","end":"2021-06-24"}},"type":{"coding":[{"code":"NOTE:149798455","system":"http:\/\/cumulus.smarthealthit.org\/i2b2","display":"Admission MD"}]},"status":"current","content":[{"attachment":{"contentType":"text\/plain"}}]}
{"resourceType":"DocumentReference","id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588","subject":{"reference":"Patient\/49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7"},"context":{"encounter":[{"reference":"Encounter\/fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687"}],"period":{"start":"2021-06-24","end":"2021-06-25"}},"type":{"coding":[{"code":"NOTE:149798455","system":"http:\/\/cumulus.smarthealthit.org\/i2b2","display":"Admission MD"}]},"status":"current","content":[{"attachment":{"contentType":"text\/plain"}}]}
{"resourceType": "DocumentReference", "id": "228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0", "subject": {"reference": "Patient/26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb"}, "context": {"encounter": [{"reference": "Encounter/5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f"}], "period": {"start": "2021-06-23", "end": "2021-06-24"}}, "type": {"coding": [{"code": "NOTE:149798455", "system": "http://cumulus.smarthealthit.org/i2b2", "display": "Admission MD"}]}, "status": "current", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}]}
{"resourceType": "DocumentReference", "id": "dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588", "subject": {"reference": "Patient/49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7"}, "context": {"encounter": [{"reference": "Encounter/fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687"}], "period": {"start": "2021-06-24", "end": "2021-06-25"}}, "type": {"coding": [{"code": "NOTE:149798455", "system": "http://cumulus.smarthealthit.org/i2b2", "display": "Admission MD"}]}, "status": "current", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}]}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/d30aad4b-4503-8e22-0bc4-621b94398520"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/118dc10e-7745-20d7-e98d-7c358a84c15c"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
{"resourceType": "DocumentReference", "id": "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/d30aad4b-4503-8e22-0bc4-621b94398520"}], "period": {"end": "2021-06-24", "start": "2021-06-23"}}, "status": "current", "subject": {"reference": "Patient/118dc10e-7745-20d7-e98d-7c358a84c15c"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
{"resourceType": "DocumentReference", "id": "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}], "period": {"end": "2021-06-25", "start": "2021-06-24"}}, "status": "current", "subject": {"reference": "Patient/1de9ea66-70d3-da1f-c735-df5ef7697fb9"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/d30aad4b-4503-8e22-0bc4-621b94398520"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/118dc10e-7745-20d7-e98d-7c358a84c15c"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
{"resourceType": "DocumentReference", "id": "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/d30aad4b-4503-8e22-0bc4-621b94398520"}], "period": {"end": "2021-06-24", "start": "2021-06-23"}}, "status": "current", "subject": {"reference": "Patient/118dc10e-7745-20d7-e98d-7c358a84c15c"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
{"resourceType": "DocumentReference", "id": "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}], "period": {"end": "2021-06-25", "start": "2021-06-24"}}, "status": "current", "subject": {"reference": "Patient/1de9ea66-70d3-da1f-c735-df5ef7697fb9"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
80 changes: 72 additions & 8 deletions tests/deid/test_deid_scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,36 @@ def test_condition(self):

def test_documentreference(self):
"""Test DocumentReference, which is interesting because of its list of encounters and attachments"""
docref = i2b2_mock_data.documentreference()
self.assertEqual("345", docref["id"])
self.assertEqual("Patient/12345", docref["subject"]["reference"])
self.assertEqual(1, len(docref["context"]["encounter"]))
self.assertEqual("Encounter/67890", docref["context"]["encounter"][0]["reference"])
self.assertEqual(1, len(docref["content"]))
self.assertIsNotNone(docref["content"][0]["attachment"]["data"])
docref = {
"resourceType": "DocumentReference",
"id": "345",
"subject": {"reference": "Patient/12345"},
"context": {
"encounter": [{"reference": "Encounter/67890"}],
},
"content": [
{
"attachment": {
"data": "aGVsbG8gd29ybGQ=",
"url": "https://example.com/hello-world",
},
},
{
"attachment": {
"data": "xxx",
"_data": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "error",
}
],
},
"url": "https://example.com/hello-world",
},
},
],
}

scrubber = Scrubber()
self.assertTrue(scrubber.scrub_resource(docref))
Expand All @@ -77,7 +100,48 @@ def test_documentreference(self):
docref["context"]["encounter"][0]["reference"],
f"Encounter/{scrubber.codebook.fake_id('Encounter', '67890')}",
)
self.assertNotIn("data", docref["content"][0]["attachment"])
self.assertEqual(
docref["content"][0]["attachment"],
{
"_data": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
},
"_url": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
},
},
)
self.assertEqual(
docref["content"][1]["attachment"],
{
"_data": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "error", # we left this reason in place
}
]
},
"_url": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
},
},
)

def test_contained_reference(self):
"""Verify that we leave contained references contained but scrubbed"""
Expand Down

0 comments on commit 4b782f2

Please sign in to comment.