Skip to content

Commit

Permalink
Merge pull request #37 from smart-on-fhir/mikix/info-ids
Browse files Browse the repository at this point in the history
feat(info): add --ids command to print ID mappings
  • Loading branch information
mikix authored Jun 7, 2024
2 parents f31f098 + 66a47fb commit 2138b29
Show file tree
Hide file tree
Showing 8 changed files with 326 additions and 113 deletions.
78 changes: 3 additions & 75 deletions chart_review/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,92 +3,20 @@
import argparse
import sys

from chart_review import cohort, config
from chart_review.commands.accuracy import accuracy
from chart_review.commands.info import info


###############################################################################
#
# CLI Helpers
#
###############################################################################


def add_project_args(parser: argparse.ArgumentParser) -> None:
    """Register the shared project-location options on `parser`.

    Adds `--project-dir` (defaults to ".") and `--config` / `-c` (no default).
    """
    project_dir_help = (
        "Directory holding project files, like labelstudio-export.json (default: current dir)"
    )
    config_help = "Config file (default: [project-dir]/config.yaml)"

    parser.add_argument("--project-dir", metavar="DIR", default=".", help=project_dir_help)
    parser.add_argument("--config", "-c", metavar="PATH", help=config_help)
from chart_review.commands import accuracy, info


def define_parser() -> argparse.ArgumentParser:
"""Fills out an argument parser with all the CLI options."""
parser = argparse.ArgumentParser()
# required=True: invoking the CLI without a subcommand is a usage error.
subparsers = parser.add_subparsers(required=True)

# NOTE(review): "accuracy" and "info" are each registered twice below - once via the
# add_*_subparser helpers and once via the command modules' make_subparser().
# This looks like the before/after lines of one change shown together; confirm that
# only one pair is meant to remain.
add_accuracy_subparser(subparsers)
add_info_subparser(subparsers)
accuracy.make_subparser(subparsers.add_parser("accuracy"))
info.make_subparser(subparsers.add_parser("info"))

return parser


###############################################################################
#
# Accuracy
#
###############################################################################


def add_accuracy_subparser(subparsers) -> None:
    """Create the "accuracy" subcommand on `subparsers` and route it to run_accuracy."""
    accuracy_parser = subparsers.add_parser("accuracy")
    add_project_args(accuracy_parser)
    accuracy_parser.add_argument("--save", action="store_true", default=False)
    # Two positionals, in this order: the ground-truth annotator, then the one being scored.
    for positional in ("truth_annotator", "annotator"):
        accuracy_parser.add_argument(positional)
    accuracy_parser.set_defaults(func=run_accuracy)


def run_accuracy(args: argparse.Namespace) -> None:
    """Build a cohort reader from the parsed CLI options and run the accuracy report."""
    reader = cohort.CohortReader(
        config.ProjectConfig(args.project_dir, config_path=args.config)
    )
    accuracy(reader, args.truth_annotator, args.annotator, save=args.save)


###############################################################################
#
# Info
#
###############################################################################


def add_info_subparser(subparsers) -> None:
    """Create the "info" subcommand on `subparsers` and route it to run_info."""
    info_parser = subparsers.add_parser("info")
    add_project_args(info_parser)
    info_parser.set_defaults(func=run_info)


def run_info(args: argparse.Namespace) -> None:
    """Build a cohort reader from the parsed CLI options and show the project info."""
    reader = cohort.CohortReader(
        config.ProjectConfig(args.project_dir, config_path=args.config)
    )
    info(reader)


###############################################################################
#
# Main CLI entrypoints
#
###############################################################################


def main_cli(argv: list[str] = None) -> None:
"""Main entrypoint that wraps all the core program logic"""
try:
Expand Down
19 changes: 19 additions & 0 deletions chart_review/cli_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Helper methods for CLI parsing."""

import argparse


def add_project_args(parser: argparse.ArgumentParser) -> None:
    """Add a "configuration" argument group with the project-location options.

    The group holds `--project-dir` (defaults to ".") and `--config` / `-c` (no default).
    """
    project_dir_help = (
        "Directory holding project files, like labelstudio-export.json (default: current dir)"
    )
    config_help = "Config file (default: [project-dir]/config.yaml)"

    options = parser.add_argument_group("configuration")
    options.add_argument("--project-dir", metavar="DIR", default=".", help=project_dir_help)
    options.add_argument("--config", "-c", metavar="PATH", help=config_help)
20 changes: 12 additions & 8 deletions chart_review/cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def __init__(self, proj_config: config.ProjectConfig):
self.project_dir = self.config.project_dir

# Load exported annotations
saved = common.read_json(self.config.path("labelstudio-export.json"))
self.annotations = simplify.simplify_export(saved, self.config)
self.ls_export = common.read_json(self.config.path("labelstudio-export.json"))
self.annotations = simplify.simplify_export(self.ls_export, self.config)

# Load external annotations (i.e. from NLP tags or ICD10 codes)
for name, value in self.config.external_annotations.items():
external.merge_external(self.annotations, saved, self.project_dir, name, value)
external.merge_external(self.annotations, self.ls_export, self.project_dir, name, value)

# Consolidate/expand mentions based on config
simplify.simplify_mentions(
Expand All @@ -40,16 +40,20 @@ def __init__(self, proj_config: config.ProjectConfig):
)

# Calculate the final set of note ranges for each annotator
self.note_range = self._collect_note_ranges(saved)
self.note_range, self.ignored_notes = self._collect_note_ranges(self.ls_export)

def _collect_note_ranges(self, exported_json: list[dict]) -> dict[str, set[int]]:
def _collect_note_ranges(
self, exported_json: list[dict]
) -> tuple[dict[str, set[int]], set[int]]:
# Detect note ranges if they were not defined in the project config
# (i.e. default to the full set of annotated notes)
note_ranges = {k: set(v) for k, v in self.config.note_ranges.items()}
for annotator, annotator_mentions in self.annotations.mentions.items():
if annotator not in note_ranges:
note_ranges[annotator] = set(annotator_mentions.keys())

all_ls_notes = {int(entry["id"]) for entry in exported_json if "id" in entry}

# Parse ignored IDs (might be note IDs, might be external IDs)
ignored_notes: set[int] = set()
for ignore_id in self.config.ignore:
Expand All @@ -60,15 +64,15 @@ def _collect_note_ranges(self, exported_json: list[dict]) -> dict[str, set[int]]
else:
# Must just be over-zealous excluding (like automatically from SQL)
continue
ignored_notes.add(ls_id)
if ls_id in all_ls_notes:
ignored_notes.add(ls_id)

# Remove any invalid (ignored, non-existent) notes from the range sets
all_ls_notes = {int(entry["id"]) for entry in exported_json if "id" in entry}
for note_ids in note_ranges.values():
note_ids.difference_update(ignored_notes)
note_ids.intersection_update(all_ls_notes)

return note_ranges
return note_ranges, ignored_notes

@property
def class_labels(self):
Expand Down
19 changes: 18 additions & 1 deletion chart_review/commands/accuracy.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""Methods for high-level accuracy calculations."""

import argparse
import os

import rich
import rich.table

from chart_review import agree, cohort, common, console_utils
from chart_review import agree, cli_utils, cohort, common, config, console_utils


def accuracy(reader: cohort.CohortReader, truth: str, annotator: str, save: bool = False) -> None:
Expand Down Expand Up @@ -61,3 +62,19 @@ def accuracy(reader: cohort.CohortReader, truth: str, annotator: str, save: bool
for label in sorted(reader.class_labels):
rich_table.add_row(*agree.csv_row_score(table[label]), label)
rich.get_console().print(rich_table)


def make_subparser(parser: argparse.ArgumentParser) -> None:
    """Populate `parser` with the accuracy command's options and route it to run_accuracy."""
    cli_utils.add_project_args(parser)

    save_help = "Write stats to CSV & JSON files"
    parser.add_argument("--save", action="store_true", default=False, help=save_help)

    # Two positionals, in this order: the ground-truth annotator, then the one being scored.
    for positional in ("truth_annotator", "annotator"):
        parser.add_argument(positional)

    parser.set_defaults(func=run_accuracy)


def run_accuracy(args: argparse.Namespace) -> None:
    """Load the project named by the CLI options and run the accuracy report on it."""
    reader = cohort.CohortReader(
        config.ProjectConfig(args.project_dir, config_path=args.config)
    )
    accuracy(reader, args.truth_annotator, args.annotator, save=args.save)
77 changes: 74 additions & 3 deletions chart_review/commands/info.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
"""Methods for showing config & calculated setup info."""

import argparse
import csv
import sys

import rich
import rich.box
import rich.table
import rich.tree

from chart_review import cohort, console_utils
from chart_review import cli_utils, cohort, config, console_utils


def info(reader: cohort.CohortReader) -> None:
def print_info(reader: cohort.CohortReader) -> None:
"""
Show project information on the console.
Expand All @@ -34,11 +39,77 @@ def info(reader: cohort.CohortReader) -> None:
console_utils.pretty_note_range(notes),
)
console.print(chart_table)
console.print()

# Ignored charts
if reader.ignored_notes:
ignored_count = len(reader.ignored_notes)
chart_word = "chart" if ignored_count == 1 else "charts"
pretty_ranges = console_utils.pretty_note_range(reader.ignored_notes)
console.print(
f" Ignoring {ignored_count} {chart_word} ({pretty_ranges})",
highlight=False,
style="italic",
)

# Labels
console.print()
console.print("Labels:", style="bold")
if reader.class_labels:
console.print(", ".join(sorted(reader.class_labels, key=str.casefold)))
else:
console.print("None", style="italic", highlight=False)


def print_ids(reader: cohort.CohortReader) -> None:
    """
    Prints a mapping of all project IDs.

    Currently, this writes a CSV file to stdout. In the future, this could get fancier.
    At the time of writing, it wasn't clear how to present the information in a way that
    is sensible to a casual console user - so I went with the more technically-oriented
    CSV file.

    Columns are chart_id, original_fhir_id, anonymized_fhir_id. Each chart yields one row
    for its encounter (when "enc_id" and/or "anon_id" is present in the chart's data) and
    one row per "docref_mappings" entry. A chart with neither still gets one row with
    empty FHIR columns.

    :param reader: cohort reader whose `ls_export` holds the Label Studio export entries
    """
    writer = csv.writer(sys.stdout)
    writer.writerow(["chart_id", "original_fhir_id", "anonymized_fhir_id"])

    for chart in reader.ls_export:
        if "id" not in chart:
            # An entry without a chart ID has nothing to map. CohortReader also tolerates
            # such entries when collecting note IDs, so skip instead of raising KeyError.
            continue

        chart_id = str(chart["id"])
        # `or {}` also covers an explicit JSON null, not just a missing key.
        chart_data = chart.get("data") or {}
        printed = False

        # Grab encounters first
        orig_id = f"Encounter/{chart_data['enc_id']}" if "enc_id" in chart_data else ""
        anon_id = f"Encounter/{chart_data['anon_id']}" if "anon_id" in chart_data else ""
        if orig_id or anon_id:
            writer.writerow([chart_id, orig_id, anon_id])
            printed = True

        # Now each DocRef ID
        docref_mappings = chart_data.get("docref_mappings") or {}
        for docref_orig, docref_anon in docref_mappings.items():
            writer.writerow(
                [
                    chart_id,
                    f"DocumentReference/{docref_orig}",
                    f"DocumentReference/{docref_anon}",
                ]
            )
            printed = True

        if not printed:
            # Guarantee that every Chart ID shows up at least once - so it's clearer that the
            # chart ID is included in the Label Studio export but that it does not have any
            # IDs mapped to it.
            writer.writerow([chart_id, None, None])


def make_subparser(parser: argparse.ArgumentParser) -> None:
    """Populate `parser` with the info command's options and route it to run_info."""
    cli_utils.add_project_args(parser)

    ids_help = "Prints a CSV of ID mappings (chart & FHIR IDs)"
    parser.add_argument("--ids", action="store_true", help=ids_help)

    parser.set_defaults(func=run_info)


def run_info(args: argparse.Namespace) -> None:
    """Load the project named by the CLI options and print either ID mappings or info."""
    reader = cohort.CohortReader(
        config.ProjectConfig(args.project_dir, config_path=args.config)
    )
    # --ids switches the output from the human-readable summary to the CSV ID mapping.
    printer = print_ids if args.ids else print_info
    printer(reader)
28 changes: 28 additions & 0 deletions docs/info.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,34 @@ Cough, Fatigue, Headache

## Options

### `--ids`

Prints a mapping of chart & FHIR IDs to the console, in CSV format.
Redirect the output to a file to save it to disk.

This is helpful when you are juggling anonymous IDs from Cumulus's Athena database
as well as original IDs from your EHR, on top of the Label Studio chart IDs.

{: .note }
FHIR IDs could be considered PHI depending on how the EHR generates them.
Exercise appropriate caution when sharing the output of this command.

#### Examples

```shell
$ chart-review info --ids > ids.csv
```

```shell
$ chart-review info --ids
chart_id,original_fhir_id,anonymized_fhir_id
1,Encounter/E123,Encounter/170a37476339af6f31ed7b1b0bbb4f11d5daacd79bf9f490d49f93742acfd2bd
1,DocumentReference/D123,DocumentReference/331ab320fe6264535a408aa1a7ecf1465fc0631580af5f3010bfecf71c99d141
2,Encounter/E898,Encounter/8b0bd207147989492801b7c14eebc015564ab73a07bdabdf9aefc3425eeba982
2,DocumentReference/D898,DocumentReference/b5e329b752067eca1584f9cd132f40c637d8a9ebd6f2a599794f9436fb83c2eb
2,DocumentReference/D899,DocumentReference/605338cd18c2617864db23fd5fd956f3e806af2021ffa6d11c34cac998eb3b6d
```

### `--config=PATH`

Use this to point to a secondary (non-default) config file.
Expand Down
29 changes: 3 additions & 26 deletions tests/test_cli.py → tests/test_accuracy.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Tests for cli.py"""
"""Tests for commands/accuracy.py"""

import contextlib
import io
import os
import shutil
import tempfile
Expand All @@ -12,8 +10,8 @@
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")


class TestCommandLine(unittest.TestCase):
"""Test case for the top-level CLI code"""
class TestAccuracy(unittest.TestCase):
"""Test case for the top-level accuracy code"""

def setUp(self):
super().setUp()
Expand Down Expand Up @@ -88,27 +86,6 @@ def test_accuracy(self):
accuracy_csv,
)

def test_info(self):
stdout = io.StringIO()
with contextlib.redirect_stdout(stdout):
cli.main_cli(["info", "--project-dir", f"{DATA_DIR}/cold"])

self.assertEqual(
"""Annotations:
╭──────────┬─────────────┬──────────╮
│Annotator │ Chart Count │ Chart IDs│
├──────────┼─────────────┼──────────┤
│jane │ 3 │ 1, 3–4 │
│jill │ 4 │ 1–4 │
│john │ 3 │ 1–2, 4 │
╰──────────┴─────────────┴──────────╯
Labels:
Cough, Fatigue, Headache
""", # noqa: W291
stdout.getvalue(),
)

def test_custom_config(self):
with tempfile.TemporaryDirectory() as tmpdir:
shutil.copy(f"{DATA_DIR}/cold/labelstudio-export.json", tmpdir)
Expand Down
Loading

0 comments on commit 2138b29

Please sign in to comment.