Skip to content

Commit

Permalink
Merge pull request #3 from smart-on-fhir/mikix/custom-layers
Browse files Browse the repository at this point in the history
feat: add ability to fold in external annotations
  • Loading branch information
mikix authored Oct 26, 2023
2 parents 8aab0c2 + ecf2d9a commit baa8361
Show file tree
Hide file tree
Showing 9 changed files with 184 additions and 7 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ Pass `--help` to see more options.
* `annotator1_vs_2: [list, of, notes]`
* `annotator2_vs_3: corpus`

#### External Annotations

You may have annotations from NLP or coded FHIR data that you want to compare against your human annotations.
Easy!

Set up your config to point at a CSV file in your project folder that holds two columns:
- DocRef ID (real or anonymous)
- Label

```yaml
annotators:
human: 1
external_nlp:
filename: my_nlp.csv
```

When `chart-review` runs, it will inject the external annotations and match up the DocRef IDs
to Label Studio notes based on metadata in your Label Studio export.

---
**BASE COHORT METHODS**

Expand Down
5 changes: 5 additions & 0 deletions chart_review/cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from chart_review.common import guard_str, guard_iter, guard_in
from chart_review import common
from chart_review import config
from chart_review import external
from chart_review import simplify
from chart_review import mentions
from chart_review import agree
Expand Down Expand Up @@ -33,6 +34,10 @@ def __init__(self, project_dir: str):
compat['annotations'][int(k)] = saved['annotations'][k]
self.annotations = compat

# Load external annotations (i.e. from NLP tags or ICD10 codes)
for name, value in self.config.external_annotations.items():
self.annotations = external.merge_external(self.annotations, saved, project_dir, name, value)

def path(self, filename):
return os.path.join(self.project_dir, filename)

Expand Down
9 changes: 7 additions & 2 deletions chart_review/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,13 @@ def __init__(self, project_dir: str):
# is stored in Label Studio. So that's what we return from this method.
# But as humans writing config files, it's more natural to think of "name -> id".
# So that's what we keep in the config, and we just reverse it here for convenience.
orig_annotators = self._data.get("annotators", {})
self.annotators = dict(map(reversed, orig_annotators.items()))
self.annotators: dict[int, str] = {}
self.external_annotations = {}
for name, value in self._data.get("annotators", {}).items():
if isinstance(value, int): # real annotation layer in Label Studio
self.annotators[value] = name
else: # fake/external annotation layer that we will inject
self.external_annotations[name] = value

### Note ranges
# Handle some extra syntax like 1-3 == [1, 2, 3]
Expand Down
10 changes: 5 additions & 5 deletions chart_review/covid_symptom/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ labels:
- Sore throat

annotators:
andy = 2
amy = 3
alon = 6
ctakes = 7 # mcmurry.andy
icd10 = 0
andy: 2
amy: 3
alon: 6
ctakes: 7 # mcmurry.andy
icd10: 0

ranges:
corpus: 782-1006
Expand Down
71 changes: 71 additions & 0 deletions chart_review/external.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Match external document references & symptoms to Label Studio data"""

import csv
import os
import sys

from chart_review import simplify


def _load_csv_symptoms(filename: str) -> dict[str, list[str]]:
"""
Loads a csv and returns a list of symptoms per docref.
CSV format is two columns, where the first is docref id and the second is a single symptom.
Returns docref_id -> list of symptoms for that ID
"""
docref_to_symptoms = {}

with open(filename, "r", newline='', encoding="utf8") as csvfile:
reader = csv.reader(csvfile)
next(reader, None) # skip header row
for row in reader: # row should be [docref_id, symptom]
docref_id = row[0]
symptom_list = docref_to_symptoms.setdefault(docref_id, [])
symptom_list.append(row[1])

return docref_to_symptoms


def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> int | None:
"""Looks at the metadata in Label Studio and grabs the note ID that holds the provided docref"""
for row in exported_json:
mappings = row.get("data", {}).get("docref_mappings", {})
for key, value in mappings.items():
# Allow either an anonymous ID or the real ID -- collisions seem very unlikely
# (i.e. real IDs aren't going to be formatted like our long anonymous ID hash)
if key == docref_id or value == docref_id:
return int(row["id"])
return None


def merge_external(simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict) -> dict:
    """Loads an external csv file annotator and merges them into an existing simple dict"""
    filename = config.get("filename")
    if not filename:
        sys.exit(f"Did not understand config for external annotator '{name}'")
    symptom_map = _load_csv_symptoms(os.path.join(project_dir, filename))

    # Inspect exported json to see if it has the metadata we'll need.
    # (only the first entry is checked, as a representative sample)
    if exported_json and "docref_mappings" not in exported_json[0].get("data", {}):
        sys.exit(
            f"Your Label Studio export does not include DocRef ID mapping metadata!\n"
            f"Consider re-uploading your notes using Cumulus ETL's chart-review command."
        )

    # Convert each docref_id into an LS id:
    external_simple = {"files": {}, "annotations": {}}
    for docref_id, symptoms in symptom_map.items():
        ls_id = _docref_id_to_label_studio_id(exported_json, docref_id)
        if ls_id is None:
            continue

        external_simple["files"][ls_id] = ls_id
        note_annotations = external_simple["annotations"].setdefault(ls_id, {})
        note_annotations.setdefault(name, []).append({"labels": symptoms})

    # Merge into existing simple dictionary
    return simplify.merge_simple(simple, external_simple)
4 changes: 4 additions & 0 deletions tests/data/external/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
annotators:
human: 1
icd10:
filename: icd.csv
5 changes: 5 additions & 0 deletions tests/data/external/icd.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
docref_id,symptom
"ABC","happy"
"ABC","tired"
"Anon-ABC","hungry"
"Unmatched","lost"
34 changes: 34 additions & 0 deletions tests/data/external/labelstudio-export.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[
{
"id": 1,
"annotations": [
{
"id": 101,
"completed_by": 1,
"result": [
{
"value": {
"text": "woo",
"labels": [
"happy"
]
}
},
{
"value": {
"text": "sigh",
"labels": [
"sad"
]
}
}
]
}
],
"data": {
"docref_mappings": {
"ABC": "Anon-ABC"
}
}
}
]
34 changes: 34 additions & 0 deletions tests/test_external.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Tests for external.py"""

import os
import shutil
import tempfile
import unittest

from chart_review import cohort

DATA_DIR = os.path.join(os.path.dirname(__file__), "data")


class TestExternal(unittest.TestCase):
    """Test case for basic external ID merging"""

    def setUp(self):
        super().setUp()
        self.maxDiff = None

    def test_basic_read(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            shutil.copytree(f"{DATA_DIR}/external", tmpdir, dirs_exist_ok=True)
            reader = cohort.CohortReader(tmpdir)

            expected = {
                "files": {1: 1},
                "annotations": {
                    1: {
                        "human": [
                            {"labels": ["happy"], "text": "woo"},
                            {"labels": ["sad"], "text": "sigh"},
                        ],
                        # icd10 labels are split into two lists, because we used two
                        # different docrefs (anon & real)
                        "icd10": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
                    },
                },
            }
            self.assertEqual(expected, reader.annotations)

0 comments on commit baa8361

Please sign in to comment.