From ecf2d9a668d0fb6210ab929a30356093a0112dcf Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Wed, 25 Oct 2023 10:51:52 -0400
Subject: [PATCH] feat: add ability to fold in external annotations

This adds new config syntax like:

annotators:
  icd10:
    filename: icd.csv

And then we'll read symptoms from your CSV file and pretend there
is a new annotator with the name 'icd10'.
---
 README.md                                   | 19 ++++++
 chart_review/cohort.py                      |  5 ++
 chart_review/config.py                      |  9 ++-
 chart_review/covid_symptom/config.yaml      | 10 +--
 chart_review/external.py                    | 71 +++++++++++++++++++++
 tests/data/external/config.yaml             |  4 ++
 tests/data/external/icd.csv                 |  5 ++
 tests/data/external/labelstudio-export.json | 34 ++++++++++
 tests/test_external.py                      | 34 ++++++++++
 9 files changed, 184 insertions(+), 7 deletions(-)
 create mode 100644 chart_review/external.py
 create mode 100644 tests/data/external/config.yaml
 create mode 100644 tests/data/external/icd.csv
 create mode 100644 tests/data/external/labelstudio-export.json
 create mode 100644 tests/test_external.py

diff --git a/README.md b/README.md
index 941da11..0e6f9b2 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,25 @@ Pass `--help` to see more options.
 * `annotator1_vs_2: [list, of, notes]`
 * `annotator2_vs_3: corpus`
 
+#### External Annotations
+
+You may have annotations from NLP or coded FHIR data that you want to compare against.
+Easy!
+
+Set up your config to point at a CSV file in your project folder that holds a header row plus two columns:
+- DocRef ID (real or anonymous)
+- Label
+
+```yaml
+annotators:
+  human: 1
+  external_nlp:
+    filename: my_nlp.csv
+```
+
+When `chart-review` runs, it will inject the external annotations and match up the DocRef IDs
+to Label Studio notes based on metadata in your Label Studio export.
+
 ---
 **BASE COHORT METHODS**
 
diff --git a/chart_review/cohort.py b/chart_review/cohort.py
index 4620e4d..be592df 100644
--- a/chart_review/cohort.py
+++ b/chart_review/cohort.py
@@ -3,6 +3,7 @@
 from chart_review.common import guard_str, guard_iter, guard_in
 from chart_review import common
 from chart_review import config
+from chart_review import external
 from chart_review import simplify
 from chart_review import mentions
 from chart_review import agree
@@ -33,6 +34,10 @@ def __init__(self, project_dir: str):
                 compat['annotations'][int(k)] = saved['annotations'][k]
             self.annotations = compat
 
+        # Load external annotations (i.e. from NLP tags or ICD10 codes)
+        for name, value in self.config.external_annotations.items():
+            self.annotations = external.merge_external(self.annotations, saved, project_dir, name, value)
+
     def path(self, filename):
         return os.path.join(self.project_dir, filename)
 
diff --git a/chart_review/config.py b/chart_review/config.py
index 6833329..410805f 100644
--- a/chart_review/config.py
+++ b/chart_review/config.py
@@ -36,8 +36,13 @@ def __init__(self, project_dir: str):
         # is stored in Label Studio. So that's what we return from this method.
         # But as humans writing config files, it's more natural to think of "name -> id".
         # So that's what we keep in the config, and we just reverse it here for convenience.
-        orig_annotators = self._data.get("annotators", {})
-        self.annotators = dict(map(reversed, orig_annotators.items()))
+        self.annotators: dict[int, str] = {}
+        self.external_annotations = {}
+        for name, value in self._data.get("annotators", {}).items():
+            if isinstance(value, int):  # real annotation layer in Label Studio
+                self.annotators[value] = name
+            else:  # fake/external annotation layer that we will inject
+                self.external_annotations[name] = value
 
         ### Note ranges
         # Handle some extra syntax like 1-3 == [1, 2, 3]
diff --git a/chart_review/covid_symptom/config.yaml b/chart_review/covid_symptom/config.yaml
index ab98a0c..43158e7 100644
--- a/chart_review/covid_symptom/config.yaml
+++ b/chart_review/covid_symptom/config.yaml
@@ -15,11 +15,11 @@ labels:
   - Sore throat
 
 annotators:
-  andy = 2
-  amy = 3
-  alon = 6
-  ctakes = 7  # mcmurry.andy
-  icd10 = 0
+  andy: 2
+  amy: 3
+  alon: 6
+  ctakes: 7  # mcmurry.andy
+  icd10: 0
 
 ranges:
   corpus: 782-1006
diff --git a/chart_review/external.py b/chart_review/external.py
new file mode 100644
index 0000000..fbfd0c4
--- /dev/null
+++ b/chart_review/external.py
@@ -0,0 +1,71 @@
+"""Match external document references & symptoms to Label Studio data"""
+
+import csv
+import os
+import sys
+
+from chart_review import simplify
+
+
+def _load_csv_symptoms(filename: str) -> dict[str, list[str]]:
+    """
+    Loads a csv and returns a list of symptoms per docref.
+
+    CSV format is a header row, then two columns: docref id and a single symptom.
+    Rows with fewer than two columns (like blank lines) are skipped instead of crashing.
+    Returns docref_id -> list of symptoms for that ID
+    """
+    docref_to_symptoms: dict[str, list[str]] = {}
+    with open(filename, "r", newline="", encoding="utf8") as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)  # skip header row
+        for row in reader:  # row should be [docref_id, symptom]
+            if len(row) < 2:  # csv.reader yields [] for a blank line
+                continue
+            docref_to_symptoms.setdefault(row[0], []).append(row[1])
+
+    return docref_to_symptoms
+
+
+def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> int | None:
+    """Finds the Label Studio note ID whose docref mapping metadata mentions the given docref"""
+    for note in exported_json:
+        docref_map = note.get("data", {}).get("docref_mappings", {})
+        # Match on either side of each mapping pair, so callers may pass a real ID or an
+        # anonymous ID. Collisions between the two seem very unlikely (i.e. real IDs aren't
+        # going to be formatted like our long anonymous ID hash).
+        if any(docref_id in pair for pair in docref_map.items()):
+            return int(note["id"])
+    return None
+
+
+def merge_external(simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict) -> dict:
+    """
+    Loads an external csv file annotator and merges it into an existing simple dict.
+
+    Exits with a message if config has no "filename" or the first exported note lacks docref mappings.
+    """
+    filename = config.get("filename") if isinstance(config, dict) else None  # raw YAML may be str/None
+    if not filename:
+        sys.exit(f"Did not understand config for external annotator '{name}'")
+    symptom_map = _load_csv_symptoms(os.path.join(project_dir, filename))
+
+    # Inspect exported json to see if it has the metadata we'll need.
+    for row in exported_json:
+        if "docref_mappings" not in row.get("data", {}):
+            sys.exit(
+                "Your Label Studio export does not include DocRef ID mapping metadata!\n"
+                "Consider re-uploading your notes using Cumulus ETL's chart-review command."
+            )
+        break  # just inspect one
+
+    # Convert each docref_id into an LS id (docrefs that match no exported note are skipped):
+    external_simple = {"files": {}, "annotations": {}}
+    for docref_id, symptom_list in symptom_map.items():
+        ls_id = _docref_id_to_label_studio_id(exported_json, docref_id)
+        if ls_id is not None:
+            external_simple["files"][ls_id] = ls_id
+            note_annotations = external_simple["annotations"].setdefault(ls_id, {})
+            note_annotations.setdefault(name, []).append({"labels": symptom_list})
+
+    return simplify.merge_simple(simple, external_simple)
diff --git a/tests/data/external/config.yaml b/tests/data/external/config.yaml
new file mode 100644
index 0000000..f8e1d1e
--- /dev/null
+++ b/tests/data/external/config.yaml
@@ -0,0 +1,4 @@
+annotators:
+  human: 1
+  icd10:
+    filename: icd.csv
diff --git a/tests/data/external/icd.csv b/tests/data/external/icd.csv
new file mode 100644
index 0000000..8014c9e
--- /dev/null
+++ b/tests/data/external/icd.csv
@@ -0,0 +1,5 @@
+docref_id,symptom
+"ABC","happy"
+"ABC","tired"
+"Anon-ABC","hungry"
+"Unmatched","lost"
diff --git a/tests/data/external/labelstudio-export.json b/tests/data/external/labelstudio-export.json
new file mode 100644
index 0000000..5fba0b9
--- /dev/null
+++ b/tests/data/external/labelstudio-export.json
@@ -0,0 +1,34 @@
+[
+  {
+    "id": 1,
+    "annotations": [
+      {
+        "id": 101,
+        "completed_by": 1,
+        "result": [
+          {
+            "value": {
+              "text": "woo",
+              "labels": [
+                "happy"
+              ]
+            }
+          },
+          {
+            "value": {
+              "text": "sigh",
+              "labels": [
+                "sad"
+              ]
+            }
+          }
+        ]
+      }
+    ],
+    "data": {
+      "docref_mappings": {
+        "ABC": "Anon-ABC"
+      }
+    }
+  }
+]
\ No newline at end of file
diff --git a/tests/test_external.py b/tests/test_external.py
new file mode 100644
index 0000000..69bb17c
--- /dev/null
+++ b/tests/test_external.py
@@ -0,0 +1,34 @@
+"""Tests for external.py"""
+
+import os
+import shutil
+import tempfile
+import unittest
+
+from chart_review import cohort
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+class TestExternal(unittest.TestCase):
+    """Checks that annotations from an external CSV get merged in as a new annotator"""
+
+    def setUp(self):
+        super().setUp()
+        self.maxDiff = None  # show full dict diffs on failure
+
+    def test_basic_read(self):
+        """Loads the "external" fixture project and checks the merged annotations"""
+        human_mentions = [{"labels": ["happy"], "text": "woo"}, {"labels": ["sad"], "text": "sigh"}]
+        # Two lists, because the CSV refers to the same note by two docref IDs (real & anon).
+        # The "Unmatched" CSV row matches no note, so its label is dropped.
+        icd10_mentions = [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}]
+        expected = {
+            "files": {1: 1},
+            "annotations": {1: {"human": human_mentions, "icd10": icd10_mentions}},
+        }
+
+        # Work on a copy of the fixture project rather than the checked-in test data
+        with tempfile.TemporaryDirectory() as tmpdir:
+            shutil.copytree(f"{DATA_DIR}/external", tmpdir, dirs_exist_ok=True)
+            self.assertEqual(expected, cohort.CohortReader(tmpdir).annotations)