From ecf2d9a668d0fb6210ab929a30356093a0112dcf Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Wed, 25 Oct 2023 10:51:52 -0400 Subject: [PATCH] feat: add ability to fold in external annotations This adds new config syntax like: annotators: icd10: filename: icd.csv And then we'll read symptoms from your CSV file and pretend there is a new annotator with the name 'icd10' --- README.md | 19 ++++++ chart_review/cohort.py | 5 ++ chart_review/config.py | 9 ++- chart_review/covid_symptom/config.yaml | 10 +-- chart_review/external.py | 71 +++++++++++++++++++++ tests/data/external/config.yaml | 4 ++ tests/data/external/icd.csv | 5 ++ tests/data/external/labelstudio-export.json | 34 ++++++++++ tests/test_external.py | 34 ++++++++++ 9 files changed, 184 insertions(+), 7 deletions(-) create mode 100644 chart_review/external.py create mode 100644 tests/data/external/config.yaml create mode 100644 tests/data/external/icd.csv create mode 100644 tests/data/external/labelstudio-export.json create mode 100644 tests/test_external.py diff --git a/README.md b/README.md index 941da11..0e6f9b2 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,25 @@ Pass `--help` to see more options. * `annotator1_vs_2: [list, of, notes]` * `annotator2_vs_3: corpus` +#### External Annotations + +You may have annotations from NLP or coded FHIR data that you want to compare against. +Easy! + +Set up your config to point at a CSV file in your project folder that holds a header row (always skipped) followed by two columns: +- DocRef ID (real or anonymous) +- Label (one label per row; repeat the DocRef ID on a new row for each additional label) + +```yaml +annotators: + human: 1 + external_nlp: + filename: my_nlp.csv +``` + +When `chart-review` runs, it will inject the external annotations and match up the DocRef IDs +to Label Studio notes based on metadata in your Label Studio export. 
+ --- **BASE COHORT METHODS** diff --git a/chart_review/cohort.py b/chart_review/cohort.py index 4620e4d..be592df 100644 --- a/chart_review/cohort.py +++ b/chart_review/cohort.py @@ -3,6 +3,7 @@ from chart_review.common import guard_str, guard_iter, guard_in from chart_review import common from chart_review import config +from chart_review import external from chart_review import simplify from chart_review import mentions from chart_review import agree @@ -33,6 +34,10 @@ def __init__(self, project_dir: str): compat['annotations'][int(k)] = saved['annotations'][k] self.annotations = compat + # Load external annotations (i.e. from NLP tags or ICD10 codes) + for name, value in self.config.external_annotations.items(): + self.annotations = external.merge_external(self.annotations, saved, project_dir, name, value) + def path(self, filename): return os.path.join(self.project_dir, filename) diff --git a/chart_review/config.py b/chart_review/config.py index 6833329..410805f 100644 --- a/chart_review/config.py +++ b/chart_review/config.py @@ -36,8 +36,13 @@ def __init__(self, project_dir: str): # is stored in Label Studio. So that's what we return from this method. # But as humans writing config files, it's more natural to think of "name -> id". # So that's what we keep in the config, and we just reverse it here for convenience. 
- orig_annotators = self._data.get("annotators", {}) - self.annotators = dict(map(reversed, orig_annotators.items())) + self.annotators: dict[int, str] = {} + self.external_annotations = {} + for name, value in self._data.get("annotators", {}).items(): + if isinstance(value, int): # real annotation layer in Label Studio + self.annotators[value] = name + else: # fake/external annotation layer that we will inject + self.external_annotations[name] = value ### Note ranges # Handle some extra syntax like 1-3 == [1, 2, 3] diff --git a/chart_review/covid_symptom/config.yaml b/chart_review/covid_symptom/config.yaml index ab98a0c..43158e7 100644 --- a/chart_review/covid_symptom/config.yaml +++ b/chart_review/covid_symptom/config.yaml @@ -15,11 +15,11 @@ labels: - Sore throat annotators: - andy = 2 - amy = 3 - alon = 6 - ctakes = 7 # mcmurry.andy - icd10 = 0 + andy: 2 + amy: 3 + alon: 6 + ctakes: 7 # mcmurry.andy + icd10: 0 ranges: corpus: 782-1006 diff --git a/chart_review/external.py b/chart_review/external.py new file mode 100644 index 0000000..fbfd0c4 --- /dev/null +++ b/chart_review/external.py @@ -0,0 +1,71 @@ +"""Match external document references & symptoms to Label Studio data""" + +import csv +import os +import sys + +from chart_review import simplify + + +def _load_csv_symptoms(filename: str) -> dict[str, list[str]]: + """ + Loads a csv and returns a list of symptoms per docref. + + CSV format is two columns, where the first is docref id and the second is a single symptom. 
+ Returns docref_id -> list of symptoms for that ID + """ + docref_to_symptoms = {} + + with open(filename, "r", newline='', encoding="utf8") as csvfile: + reader = csv.reader(csvfile) + next(reader, None) # skip header row + for row in reader: # row should be [docref_id, symptom] + docref_id = row[0] + symptom_list = docref_to_symptoms.setdefault(docref_id, []) + symptom_list.append(row[1]) + + return docref_to_symptoms + + +def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> int | None: + """Looks at the metadata in Label Studio and grabs the note ID that holds the provided docref""" + for row in exported_json: + mappings = row.get("data", {}).get("docref_mappings", {}) + for key, value in mappings.items(): + # Allow either an anonymous ID or the real ID -- collisions seem very unlikely + # (i.e. real IDs aren't going to be formatted like our long anonymous ID hash) + if key == docref_id or value == docref_id: + return int(row["id"]) + return None + + +def merge_external(simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict) -> dict: + """Loads an external csv file annotator and merges them into an existing simple dict""" + if filename := config.get("filename"): + full_filename = os.path.join(project_dir, filename) + symptom_map = _load_csv_symptoms(full_filename) + else: + sys.exit(f"Did not understand config for external annotator '{name}'") + + # Inspect exported json to see if it has the metadata we'll need. + for row in exported_json: + if "docref_mappings" not in row.get("data", {}): + sys.exit( + f"Your Label Studio export does not include DocRef ID mapping metadata!\n" + f"Consider re-uploading your notes using Cumulus ETL's chart-review command." 
+ ) + break # just inspect one + + # Convert each docref_id into an LS id: + external_simple = {"files": {}, "annotations": {}} + for docref_id, symptom_list in symptom_map.items(): + ls_id = _docref_id_to_label_studio_id(exported_json, docref_id) + if ls_id is None: + continue + + external_simple["files"][ls_id] = ls_id + annotation_list = external_simple["annotations"].setdefault(ls_id, {}).setdefault(name, []) + annotation_list.append({"labels": symptom_list}) + + # Merge into existing simple dictionary + return simplify.merge_simple(simple, external_simple) diff --git a/tests/data/external/config.yaml b/tests/data/external/config.yaml new file mode 100644 index 0000000..f8e1d1e --- /dev/null +++ b/tests/data/external/config.yaml @@ -0,0 +1,4 @@ +annotators: + human: 1 + icd10: + filename: icd.csv diff --git a/tests/data/external/icd.csv b/tests/data/external/icd.csv new file mode 100644 index 0000000..8014c9e --- /dev/null +++ b/tests/data/external/icd.csv @@ -0,0 +1,5 @@ +docref_id,symptom +"ABC","happy" +"ABC","tired" +"Anon-ABC","hungry" +"Unmatched","lost" diff --git a/tests/data/external/labelstudio-export.json b/tests/data/external/labelstudio-export.json new file mode 100644 index 0000000..5fba0b9 --- /dev/null +++ b/tests/data/external/labelstudio-export.json @@ -0,0 +1,34 @@ +[ + { + "id": 1, + "annotations": [ + { + "id": 101, + "completed_by": 1, + "result": [ + { + "value": { + "text": "woo", + "labels": [ + "happy" + ] + } + }, + { + "value": { + "text": "sigh", + "labels": [ + "sad" + ] + } + } + ] + } + ], + "data": { + "docref_mappings": { + "ABC": "Anon-ABC" + } + } + } +] \ No newline at end of file diff --git a/tests/test_external.py b/tests/test_external.py new file mode 100644 index 0000000..69bb17c --- /dev/null +++ b/tests/test_external.py @@ -0,0 +1,34 @@ +"""Tests for external.py""" + +import os +import shutil +import tempfile +import unittest + +from chart_review import cohort + +DATA_DIR = os.path.join(os.path.dirname(__file__), 
"data") + + +class TestExternal(unittest.TestCase): + """Test case for basic external ID merging""" + + def setUp(self): + super().setUp() + self.maxDiff = None + + def test_basic_read(self): + with tempfile.TemporaryDirectory() as tmpdir: + shutil.copytree(f"{DATA_DIR}/external", tmpdir, dirs_exist_ok=True) + reader = cohort.CohortReader(tmpdir) + + self.assertEqual({ + "files": {1: 1}, + "annotations": { + 1: { + "human": [{"labels": ["happy"], "text": "woo"}, {"labels": ["sad"], "text": "sigh"}], + # icd10 labels are split into two lists, because we used two different docrefs (anon & real) + "icd10": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}], + }, + } + }, reader.annotations)