Merge pull request #37 from smart-on-fhir/mikix/info-ids

feat(info): add --ids command to print ID mappings
smart-on-fhir · Jun 7, 2024 · 2138b29 · 2138b29
2 parents f31f098 + 66a47fb
commit 2138b29
Show file tree

Hide file tree

Showing 8 changed files with 326 additions and 113 deletions.
diff --git a/chart_review/cli.py b/chart_review/cli.py
@@ -3,92 +3,20 @@
 import argparse
 import sys
 
-from chart_review import cohort, config
-from chart_review.commands.accuracy import accuracy
-from chart_review.commands.info import info
-
-
-###############################################################################
-#
-# CLI Helpers
-#
-###############################################################################
-
-
-def add_project_args(parser: argparse.ArgumentParser) -> None:
-    parser.add_argument(
-        "--project-dir",
-        default=".",
-        metavar="DIR",
-        help=(
-            "Directory holding project files, "
-            "like labelstudio-export.json (default: current dir)"
-        ),
-    )
-    parser.add_argument(
-        "--config", "-c", metavar="PATH", help="Config file (default: [project-dir]/config.yaml)"
-    )
+from chart_review.commands import accuracy, info
 
 
 def define_parser() -> argparse.ArgumentParser:
     """Fills out an argument parser with all the CLI options."""
     parser = argparse.ArgumentParser()
     subparsers = parser.add_subparsers(required=True)
 
-    add_accuracy_subparser(subparsers)
-    add_info_subparser(subparsers)
+    accuracy.make_subparser(subparsers.add_parser("accuracy"))
+    info.make_subparser(subparsers.add_parser("info"))
 
     return parser
 
 
-###############################################################################
-#
-# Accuracy
-#
-###############################################################################
-
-
-def add_accuracy_subparser(subparsers) -> None:
-    parser = subparsers.add_parser("accuracy")
-    add_project_args(parser)
-    parser.add_argument("--save", action="store_true", default=False)
-    parser.add_argument("truth_annotator")
-    parser.add_argument("annotator")
-    parser.set_defaults(func=run_accuracy)
-
-
-def run_accuracy(args: argparse.Namespace) -> None:
-    proj_config = config.ProjectConfig(args.project_dir, config_path=args.config)
-    reader = cohort.CohortReader(proj_config)
-    accuracy(reader, args.truth_annotator, args.annotator, save=args.save)
-
-
-###############################################################################
-#
-# Info
-#
-###############################################################################
-
-
-def add_info_subparser(subparsers) -> None:
-    parser = subparsers.add_parser("info")
-    add_project_args(parser)
-    parser.set_defaults(func=run_info)
-
-
-def run_info(args: argparse.Namespace) -> None:
-    proj_config = config.ProjectConfig(args.project_dir, config_path=args.config)
-    reader = cohort.CohortReader(proj_config)
-    info(reader)
-
-
-###############################################################################
-#
-# Main CLI entrypoints
-#
-###############################################################################
-
-
 def main_cli(argv: list[str] = None) -> None:
     """Main entrypoint that wraps all the core program logic"""
     try:

diff --git a/chart_review/cli_utils.py b/chart_review/cli_utils.py
@@ -0,0 +1,19 @@
+"""Helper methods for CLI parsing."""
+
+import argparse
+
+
+def add_project_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Adds the shared project-location options to the given parser.
+
+    Registers `--project-dir` (default: ".") and `--config` / `-c` (no default set here)
+    inside an argument group titled "configuration".
+
+    :param parser: the parser to add the options to
+    """
+    group = parser.add_argument_group("configuration")
+    group.add_argument(
+        "--project-dir",
+        default=".",
+        metavar="DIR",
+        help=(
+            "Directory holding project files, "
+            "like labelstudio-export.json (default: current dir)"
+        ),
+    )
+    # No default is set for --config here; the "[project-dir]/config.yaml" fallback named in
+    # the help text is presumably applied by whoever consumes args.config - TODO confirm.
+    group.add_argument(
+        "--config", "-c", metavar="PATH", help="Config file (default: [project-dir]/config.yaml)"
+    )
diff --git a/chart_review/cohort.py b/chart_review/cohort.py
@@ -25,12 +25,12 @@ def __init__(self, proj_config: config.ProjectConfig):
         self.project_dir = self.config.project_dir
 
         # Load exported annotations
-        saved = common.read_json(self.config.path("labelstudio-export.json"))
-        self.annotations = simplify.simplify_export(saved, self.config)
+        self.ls_export = common.read_json(self.config.path("labelstudio-export.json"))
+        self.annotations = simplify.simplify_export(self.ls_export, self.config)
 
         # Load external annotations (i.e. from NLP tags or ICD10 codes)
         for name, value in self.config.external_annotations.items():
-            external.merge_external(self.annotations, saved, self.project_dir, name, value)
+            external.merge_external(self.annotations, self.ls_export, self.project_dir, name, value)
 
         # Consolidate/expand mentions based on config
         simplify.simplify_mentions(
@@ -40,16 +40,20 @@ def __init__(self, proj_config: config.ProjectConfig):
         )
 
         # Calculate the final set of note ranges for each annotator
-        self.note_range = self._collect_note_ranges(saved)
+        self.note_range, self.ignored_notes = self._collect_note_ranges(self.ls_export)
 
-    def _collect_note_ranges(self, exported_json: list[dict]) -> dict[str, set[int]]:
+    def _collect_note_ranges(
+        self, exported_json: list[dict]
+    ) -> tuple[dict[str, set[int]], set[int]]:
         # Detect note ranges if they were not defined in the project config
         # (i.e. default to the full set of annotated notes)
         note_ranges = {k: set(v) for k, v in self.config.note_ranges.items()}
         for annotator, annotator_mentions in self.annotations.mentions.items():
             if annotator not in note_ranges:
                 note_ranges[annotator] = set(annotator_mentions.keys())
 
+        all_ls_notes = {int(entry["id"]) for entry in exported_json if "id" in entry}
+
         # Parse ignored IDs (might be note IDs, might be external IDs)
         ignored_notes: set[int] = set()
         for ignore_id in self.config.ignore:
@@ -60,15 +64,15 @@ def _collect_note_ranges(self, exported_json: list[dict]) -> dict[str, set[int]]
                 else:
                     # Must just be over-zealous excluding (like automatically from SQL)
                     continue
-            ignored_notes.add(ls_id)
+            if ls_id in all_ls_notes:
+                ignored_notes.add(ls_id)
 
         # Remove any invalid (ignored, non-existent) notes from the range sets
-        all_ls_notes = {int(entry["id"]) for entry in exported_json if "id" in entry}
         for note_ids in note_ranges.values():
             note_ids.difference_update(ignored_notes)
             note_ids.intersection_update(all_ls_notes)
 
-        return note_ranges
+        return note_ranges, ignored_notes
 
     @property
     def class_labels(self):

diff --git a/chart_review/commands/accuracy.py b/chart_review/commands/accuracy.py
@@ -1,11 +1,12 @@
 """Methods for high-level accuracy calculations."""
 
+import argparse
 import os
 
 import rich
 import rich.table
 
-from chart_review import agree, cohort, common, console_utils
+from chart_review import agree, cli_utils, cohort, common, config, console_utils
 
 
 def accuracy(reader: cohort.CohortReader, truth: str, annotator: str, save: bool = False) -> None:
@@ -61,3 +62,19 @@ def accuracy(reader: cohort.CohortReader, truth: str, annotator: str, save: bool
         for label in sorted(reader.class_labels):
             rich_table.add_row(*agree.csv_row_score(table[label]), label)
         rich.get_console().print(rich_table)
+
+
+def make_subparser(parser: argparse.ArgumentParser) -> None:
+    """
+    Fills out the given parser with the options of the accuracy command.
+
+    Adds the shared project arguments, the `--save` flag, and the two positional annotator
+    names, then stores `run_accuracy` as the parser's `func` default.
+
+    :param parser: the parser to configure for the accuracy command
+    """
+    cli_utils.add_project_args(parser)
+    parser.add_argument(
+        "--save", action="store_true", default=False, help="Write stats to CSV & JSON files"
+    )
+    parser.add_argument("truth_annotator")
+    parser.add_argument("annotator")
+    parser.set_defaults(func=run_accuracy)
+
+
+def run_accuracy(args: argparse.Namespace) -> None:
+    """
+    Runs the accuracy command from parsed command line arguments.
+
+    Builds a project config from `args.project_dir` and `args.config`, loads a cohort reader
+    from it, and passes the reader to `accuracy()` along with the two annotator names and
+    the `--save` flag.
+
+    :param args: parsed arguments, as set up by `make_subparser()`
+    """
+    proj_config = config.ProjectConfig(args.project_dir, config_path=args.config)
+    reader = cohort.CohortReader(proj_config)
+    accuracy(reader, args.truth_annotator, args.annotator, save=args.save)
diff --git a/chart_review/commands/info.py b/chart_review/commands/info.py
@@ -1,13 +1,18 @@
 """Methods for showing config & calculated setup info."""
 
+import argparse
+import csv
+import sys
+
 import rich
 import rich.box
 import rich.table
+import rich.tree
 
-from chart_review import cohort, console_utils
+from chart_review import cli_utils, cohort, config, console_utils
 
 
-def info(reader: cohort.CohortReader) -> None:
+def print_info(reader: cohort.CohortReader) -> None:
     """
     Show project information on the console.
 
@@ -34,11 +39,77 @@ def info(reader: cohort.CohortReader) -> None:
             console_utils.pretty_note_range(notes),
         )
     console.print(chart_table)
-    console.print()
+
+    # Ignored charts
+    if reader.ignored_notes:
+        ignored_count = len(reader.ignored_notes)
+        chart_word = "chart" if ignored_count == 1 else "charts"
+        pretty_ranges = console_utils.pretty_note_range(reader.ignored_notes)
+        console.print(
+            f" Ignoring {ignored_count} {chart_word} ({pretty_ranges})",
+            highlight=False,
+            style="italic",
+        )
 
     # Labels
+    console.print()
     console.print("Labels:", style="bold")
     if reader.class_labels:
         console.print(", ".join(sorted(reader.class_labels, key=str.casefold)))
     else:
         console.print("None", style="italic", highlight=False)
+
+
+def print_ids(reader: cohort.CohortReader) -> None:
+    """
+    Prints a mapping of all project IDs.
+
+    Currently, this writes a CSV file to stdout. In the future, this could get fancier.
+    At the time of writing, it wasn't clear how to present the information in a way that
+    was sensible to a casual console user - so I went with the more technically-oriented
+    CSV file.
+
+    Each row holds a chart ID, an original FHIR ID, and an anonymized FHIR ID.
+    Every chart in `reader.ls_export` gets at least one row.
+
+    :param reader: the cohort reader whose `ls_export` entries are printed
+    """
+    writer = csv.writer(sys.stdout)
+    writer.writerow(["chart_id", "original_fhir_id", "anonymized_fhir_id"])
+
+    # One or more rows per chart in the Label Studio export
+    for chart in reader.ls_export:
+        # NOTE(review): this raises KeyError for an export entry without an "id" key, while
+        # the cohort loader skips such entries when collecting note IDs - confirm whether
+        # entries without an ID can actually occur.
+        chart_id = str(chart["id"])
+        chart_data = chart.get("data", {})
+        printed = False
+
+        # Grab encounters first
+        # (either side may be missing, in which case its column is left as an empty string)
+        orig_id = f"Encounter/{chart_data['enc_id']}" if "enc_id" in chart_data else ""
+        anon_id = f"Encounter/{chart_data['anon_id']}" if "anon_id" in chart_data else ""
+        if orig_id or anon_id:
+            writer.writerow([chart_id, orig_id, anon_id])
+            printed = True
+
+        # Now each DocRef ID
+        # (keys are written as the original ID, values as the anonymized ID)
+        for orig_id, anon_id in chart_data.get("docref_mappings", {}).items():
+            writer.writerow(
+                [chart_id, f"DocumentReference/{orig_id}", f"DocumentReference/{anon_id}"]
+            )
+            printed = True
+
+        if not printed:
+            # Guarantee that every Chart ID shows up at least once - so it's clearer that the
+            # chart ID is included in the Label Studio export but that it does not have any
+            # IDs mapped to it.
+            # (The csv module writes None as an empty field.)
+            writer.writerow([chart_id, None, None])
+
+
+def make_subparser(parser: argparse.ArgumentParser) -> None:
+    """
+    Fills out the given parser with the options of the info command.
+
+    Adds the shared project arguments and the `--ids` flag, then stores `run_info` as the
+    parser's `func` default.
+
+    :param parser: the parser to configure for the info command
+    """
+    cli_utils.add_project_args(parser)
+    parser.add_argument(
+        "--ids", action="store_true", help="Prints a CSV of ID mappings (chart & FHIR IDs)"
+    )
+    parser.set_defaults(func=run_info)
+
+
+def run_info(args: argparse.Namespace) -> None:
+    """
+    Runs the info command from parsed command line arguments.
+
+    Builds a project config from `args.project_dir` and `args.config` and loads a cohort
+    reader from it. With `--ids`, prints the ID mapping CSV; otherwise prints the project
+    information.
+
+    :param args: parsed arguments, as set up by `make_subparser()`
+    """
+    proj_config = config.ProjectConfig(args.project_dir, config_path=args.config)
+    reader = cohort.CohortReader(proj_config)
+    if args.ids:
+        print_ids(reader)
+    else:
+        print_info(reader)
diff --git a/docs/info.md b/docs/info.md
@@ -31,6 +31,34 @@ Cough, Fatigue, Headache
 
 ## Options
 
+### `--ids`
+
+Prints a mapping of chart & FHIR IDs to the console, in CSV format.
+Redirect the output to a file to save it to disk.
+
+This is helpful when you are juggling anonymous IDs from Cumulus's Athena database
+as well as original IDs from your EHR, on top of the Label Studio chart IDs.
+
+{: .note }
+FHIR IDs could be considered PHI depending on how the EHR generates them.
+Exercise appropriate caution when sharing the output of this command.
+
+#### Examples
+
+```shell
+$ chart-review info --ids > ids.csv
+```
+
+```shell
+$ chart-review info --ids
+chart_id,original_fhir_id,anonymized_fhir_id
+1,Encounter/E123,Encounter/170a37476339af6f31ed7b1b0bbb4f11d5daacd79bf9f490d49f93742acfd2bd
+1,DocumentReference/D123,DocumentReference/331ab320fe6264535a408aa1a7ecf1465fc0631580af5f3010bfecf71c99d141
+2,Encounter/E898,Encounter/8b0bd207147989492801b7c14eebc015564ab73a07bdabdf9aefc3425eeba982
+2,DocumentReference/D898,DocumentReference/b5e329b752067eca1584f9cd132f40c637d8a9ebd6f2a599794f9436fb83c2eb
+2,DocumentReference/D899,DocumentReference/605338cd18c2617864db23fd5fd956f3e806af2021ffa6d11c34cac998eb3b6d
+```
+
 ### `--config=PATH`
 
 Use this to point to a secondary (non-default) config file.

diff --git a/tests/test_cli.py → tests/test_accuracy.py b/tests/test_cli.py → tests/test_accuracy.py
@@ -1,7 +1,5 @@
-"""Tests for cli.py"""
+"""Tests for commands/accuracy.py"""
 
-import contextlib
-import io
 import os
 import shutil
 import tempfile
@@ -12,8 +10,8 @@
 DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 
 
-class TestCommandLine(unittest.TestCase):
-    """Test case for the top-level CLI code"""
+class TestAccuracy(unittest.TestCase):
+    """Test case for the top-level accuracy code"""
 
     def setUp(self):
         super().setUp()
@@ -88,27 +86,6 @@ def test_accuracy(self):
                 accuracy_csv,
             )
 
-    def test_info(self):
-        stdout = io.StringIO()
-        with contextlib.redirect_stdout(stdout):
-            cli.main_cli(["info", "--project-dir", f"{DATA_DIR}/cold"])
-
-        self.assertEqual(
-            """Annotations:                         
-╭──────────┬─────────────┬──────────╮
-│Annotator │ Chart Count │ Chart IDs│
-├──────────┼─────────────┼──────────┤
-│jane      │ 3           │ 1, 3–4   │
-│jill      │ 4           │ 1–4      │
-│john      │ 3           │ 1–2, 4   │
-╰──────────┴─────────────┴──────────╯
-
-Labels:
-Cough, Fatigue, Headache
-""",  # noqa: W291
-            stdout.getvalue(),
-        )
-
     def test_custom_config(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             shutil.copy(f"{DATA_DIR}/cold/labelstudio-export.json", tmpdir)