Skip to content

Commit

Permalink
Merge pull request #3 from smart-on-fhir/mikix/custom-layers
Browse files Browse the repository at this point in the history
feat: add ability to fold in external annotations
  • Loading branch information
mikix authored Oct 26, 2023
2 parents 8aab0c2 + ecf2d9a commit baa8361
Show file tree
Hide file tree
Showing 9 changed files with 184 additions and 7 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ Pass `--help` to see more options.
* `annotator1_vs_2: [list, of, notes]`
* `annotator2_vs_3: corpus`

#### External Annotations

You may have annotations from NLP or coded FHIR data that you want to compare against your human annotations.
Easy!

Set up your config to point at a CSV file in your project folder that holds two columns:
- DocRef ID (real or anonymous)
- Label

```yaml
annotators:
human: 1
external_nlp:
filename: my_nlp.csv
```

When `chart-review` runs, it will inject the external annotations and match up the DocRef IDs
to Label Studio notes based on metadata in your Label Studio export.

---
**BASE COHORT METHODS**

Expand Down
5 changes: 5 additions & 0 deletions chart_review/cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from chart_review.common import guard_str, guard_iter, guard_in
from chart_review import common
from chart_review import config
from chart_review import external
from chart_review import simplify
from chart_review import mentions
from chart_review import agree
Expand Down Expand Up @@ -33,6 +34,10 @@ def __init__(self, project_dir: str):
compat['annotations'][int(k)] = saved['annotations'][k]
self.annotations = compat

# Load external annotations (i.e. from NLP tags or ICD10 codes)
for name, value in self.config.external_annotations.items():
self.annotations = external.merge_external(self.annotations, saved, project_dir, name, value)

def path(self, filename):
return os.path.join(self.project_dir, filename)

Expand Down
9 changes: 7 additions & 2 deletions chart_review/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,13 @@ def __init__(self, project_dir: str):
# is stored in Label Studio. So that's what we return from this method.
# But as humans writing config files, it's more natural to think of "name -> id".
# So that's what we keep in the config, and we just reverse it here for convenience.
orig_annotators = self._data.get("annotators", {})
self.annotators = dict(map(reversed, orig_annotators.items()))
self.annotators: dict[int, str] = {}
self.external_annotations = {}
for name, value in self._data.get("annotators", {}).items():
if isinstance(value, int): # real annotation layer in Label Studio
self.annotators[value] = name
else: # fake/external annotation layer that we will inject
self.external_annotations[name] = value

### Note ranges
# Handle some extra syntax like 1-3 == [1, 2, 3]
Expand Down
10 changes: 5 additions & 5 deletions chart_review/covid_symptom/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ labels:
- Sore throat

annotators:
andy = 2
amy = 3
alon = 6
ctakes = 7 # mcmurry.andy
icd10 = 0
andy: 2
amy: 3
alon: 6
ctakes: 7 # mcmurry.andy
icd10: 0

ranges:
corpus: 782-1006
Expand Down
71 changes: 71 additions & 0 deletions chart_review/external.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Match external document references & symptoms to Label Studio data"""

import csv
import os
import sys

from chart_review import simplify


def _load_csv_symptoms(filename: str) -> dict[str, list[str]]:
"""
Loads a csv and returns a list of symptoms per docref.
CSV format is two columns, where the first is docref id and the second is a single symptom.
Returns docref_id -> list of symptoms for that ID
"""
docref_to_symptoms = {}

with open(filename, "r", newline='', encoding="utf8") as csvfile:
reader = csv.reader(csvfile)
next(reader, None) # skip header row
for row in reader: # row should be [docref_id, symptom]
docref_id = row[0]
symptom_list = docref_to_symptoms.setdefault(docref_id, [])
symptom_list.append(row[1])

return docref_to_symptoms


def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> int | None:
"""Looks at the metadata in Label Studio and grabs the note ID that holds the provided docref"""
for row in exported_json:
mappings = row.get("data", {}).get("docref_mappings", {})
for key, value in mappings.items():
# Allow either an anonymous ID or the real ID -- collisions seem very unlikely
# (i.e. real IDs aren't going to be formatted like our long anonymous ID hash)
if key == docref_id or value == docref_id:
return int(row["id"])
return None


def merge_external(simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict) -> dict:
    """Loads an external csv file annotator and merges them into an existing simple dict"""
    filename = config.get("filename")
    if not filename:
        sys.exit(f"Did not understand config for external annotator '{name}'")
    symptom_map = _load_csv_symptoms(os.path.join(project_dir, filename))

    # Inspect exported json to see if it has the metadata we'll need.
    # (only the first entry is checked, as a representative sample)
    if exported_json and "docref_mappings" not in exported_json[0].get("data", {}):
        sys.exit(
            f"Your Label Studio export does not include DocRef ID mapping metadata!\n"
            f"Consider re-uploading your notes using Cumulus ETL's chart-review command."
        )

    # Convert each docref_id into an LS id:
    external_simple = {"files": {}, "annotations": {}}
    for docref_id, symptoms in symptom_map.items():
        ls_id = _docref_id_to_label_studio_id(exported_json, docref_id)
        if ls_id is None:
            continue

        external_simple["files"][ls_id] = ls_id
        note_annotations = external_simple["annotations"].setdefault(ls_id, {})
        note_annotations.setdefault(name, []).append({"labels": symptoms})

    # Merge into existing simple dictionary
    return simplify.merge_simple(simple, external_simple)
4 changes: 4 additions & 0 deletions tests/data/external/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
annotators:
human: 1
icd10:
filename: icd.csv
5 changes: 5 additions & 0 deletions tests/data/external/icd.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
docref_id,symptom
"ABC","happy"
"ABC","tired"
"Anon-ABC","hungry"
"Unmatched","lost"
34 changes: 34 additions & 0 deletions tests/data/external/labelstudio-export.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[
{
"id": 1,
"annotations": [
{
"id": 101,
"completed_by": 1,
"result": [
{
"value": {
"text": "woo",
"labels": [
"happy"
]
}
},
{
"value": {
"text": "sigh",
"labels": [
"sad"
]
}
}
]
}
],
"data": {
"docref_mappings": {
"ABC": "Anon-ABC"
}
}
}
]
34 changes: 34 additions & 0 deletions tests/test_external.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Tests for external.py"""

import os
import shutil
import tempfile
import unittest

from chart_review import cohort

DATA_DIR = os.path.join(os.path.dirname(__file__), "data")


class TestExternal(unittest.TestCase):
    """Test case for basic external ID merging"""

    def setUp(self):
        super().setUp()
        self.maxDiff = None

    def test_basic_read(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            shutil.copytree(f"{DATA_DIR}/external", tmpdir, dirs_exist_ok=True)
            reader = cohort.CohortReader(tmpdir)

            expected = {
                "files": {1: 1},
                "annotations": {
                    1: {
                        "human": [
                            {"labels": ["happy"], "text": "woo"},
                            {"labels": ["sad"], "text": "sigh"},
                        ],
                        # icd10 labels are split into two lists, because we used two
                        # different docrefs (anon & real)
                        "icd10": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
                    },
                },
            }
            self.assertEqual(expected, reader.annotations)

0 comments on commit baa8361

Please sign in to comment.