DOC: Add drug NER evaluation example to cookbook

medkit-lib · Jul 12, 2024 · 1a6b49d · 1a6b49d
1 parent c58e37a
commit 1a6b49d
Show file tree

Hide file tree

Showing 8 changed files with 301 additions and 0 deletions.
diff --git a/docs/cookbook/drug_ner_eval/data.tar.gz b/docs/cookbook/drug_ner_eval/data.tar.gz
diff --git a/docs/cookbook/drug_ner_eval/index.md b/docs/cookbook/drug_ner_eval/index.md
@@ -0,0 +1,2 @@
+```{include} ./pipeline.md
+```
diff --git a/docs/cookbook/drug_ner_eval/ner2_6.png b/docs/cookbook/drug_ner_eval/ner2_6.png
diff --git a/docs/cookbook/drug_ner_eval/ner2_7.png b/docs/cookbook/drug_ner_eval/ner2_7.png
diff --git a/docs/cookbook/drug_ner_eval/pipeline.md b/docs/cookbook/drug_ner_eval/pipeline.md
@@ -0,0 +1,296 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# Comparative evaluation of drug recognition methods
+
+This pipeline compares two tools (NER1 and NER2) for recognizing drug names in clinical texts,
+compares their performance, and outputs two texts annotated with the tool evaluated as the best performer.
+
++++
+
+## Overview of the pipeline
+
++++
+
+![pipeline](./pipeline.png)
+
++++
+
+## Data preparation
+
++++
+
+Download two clinical texts, with drug entites manually annotated.
+
+```{code-cell} ipython3
+import os
+import tarfile
+import tempfile
+from pathlib import Path
+
+# path to local data
+extract_to = Path(tempfile.mkdtemp())
+data_tarfile = Path.cwd() / "data.tar.gz"
+
+# download and extract
+tarfile.open(name=data_tarfile, mode="r|gz").extractall(extract_to)
+data_dir = extract_to / "data"
+
+print(f"Data dir: {data_dir}")
+```
+
+Read text documents with medkit
+
+```{code-cell} ipython3
+from medkit.core.text import TextDocument
+
+doc_dir = data_dir / "mtsamplesen" / "annotated_doc"
+docs = TextDocument.from_dir(path=doc_dir, pattern='[A-Z0-9].txt', encoding='utf-8')
+
+print(docs[0].text)
+```
+
+## Pipeline definition
+
++++
+
+Create and run a three-step doc pipeline that: 
+1) Split sentences in texts
+2) Run PII detection for deidentification
+3) Recognize drug entities with NER1: a dictionnary-based approach named UMLSMatcher 
+4) Recognize drug entities with NER2: a Transformer-based approach, see https://huggingface.co/samrawal/bert-large-uncased_med-ner
+
++++
+
+### Sentence tokenizer
+
+```{code-cell} ipython3
+from medkit.text.segmentation import SentenceTokenizer
+
+# By default, SentenceTokenizer will use a list of punctuation chars to detect sentences.
+sentence_tokenizer = SentenceTokenizer(
+    # Label of the segments created and returned by the operation
+    output_label="sentence",
+    # Keep the punctuation character inside the sentence segments
+    keep_punct=True,
+    # Also split on newline chars, not just punctuation characters
+    split_on_newlines=True,
+)
+```
+
+### PII detector
+
+```{code-cell} ipython3
+from medkit.text.deid import PIIDetector
+
+pii_detector = PIIDetector(name="deid")
+```
+
+### Dictionnary-based drug recognizer
+
+```{code-cell} ipython3
+import shutil
+from medkit.text.ner import UMLSMatcher
+
+umls_data_dir = data_dir / "UMLS" / "2023AB" / "META"
+umls_cache_dir = Path.cwd() / ".umls_cache"
+shutil.rmtree(umls_cache_dir, ignore_errors=True)
+
+umls_matcher = UMLSMatcher(
+    # Directory containing the UMLS files with terms and concepts
+    umls_dir=umls_data_dir,
+    # Language to use (English)
+    language="ENG",
+    # Where to store the temp term database of the matcher
+    cache_dir=umls_cache_dir,
+    # Semantic groups to consider
+    semgroups=["CHEM"],
+    # Don't be case-sensitive
+    lowercase=True,
+    # Convert special chars to ASCII before matching
+    normalize_unicode=True,
+    name="NER1"
+)
+```
+
+### Transformer-based drug recognizer
+
+```{code-cell} ipython3
+from medkit.text.ner.hf_entity_matcher import HFEntityMatcher
+
+# an alternate model: "Clinical-AI-Apollo/Medical-NER"
+bert_matcher = HFEntityMatcher(
+    model="samrawal/bert-large-uncased_med-ner", name="NER2"
+)
+```
+
+### Pipeline assembly
+
+```{code-cell} ipython3
+from medkit.core import DocPipeline, Pipeline, PipelineStep
+
+pipeline = Pipeline(
+    steps=[
+        PipelineStep(sentence_tokenizer, input_keys=["full_text"], output_keys=["sentence"]),
+        PipelineStep(pii_detector, input_keys=["sentence"], output_keys=["sentence_"]),
+        PipelineStep(umls_matcher, input_keys=["sentence_"], output_keys=["ner1_drug"]),
+        PipelineStep(bert_matcher, input_keys=["sentence_"], output_keys=["ner2_drug"]),
+    ],
+    input_keys=["full_text"],
+    output_keys=["sentence_", "ner1_drug", "ner2_drug"],
+)
+
+doc_pipeline = DocPipeline(pipeline=pipeline)
+doc_pipeline.run(docs)
+```
+
+### Performance evaluation
+
+```{code-cell} ipython3
+from medkit.io.brat import BratInputConverter
+
+# Load text with annotations in medkit (our ground truth)
+brat_converter = BratInputConverter()
+ref_docs = brat_converter.load(doc_dir)
+
+# Display selected drug annotations
+for ann in ref_docs[0].anns.get(label="Drug"):
+    print(f"{ann.text} in {ann.spans}")
+```
+
+```{code-cell} ipython3
+## Compute some stats
+print(f"Number of documents: {len(docs)}")
+    
+for i, doc in enumerate(docs):
+    print(f"Document {doc.uid}:")
+
+    # On annotations made by NER1 and NER2
+    sentence_nb = len(doc.anns.get(label="sentence"))
+    print(f"\t{sentence_nb} sentences,")
+    ner1_drug_nb = len(doc.anns.get(label="chemical"))
+    print(f"\t{ner1_drug_nb} drugs found with NER1,")  
+    ner2_drug_nb = len(doc.anns.get(label="m"))
+    print(f"\t{ner2_drug_nb} drugs found with NER2,")
+
+    # On the manual annotation (our ground truth)
+    gt_nb = len(ref_docs[i].anns.get(label="Drug"))
+    print(f"\t{gt_nb} drugs manually annotated.")
+```
+
+```{code-cell} ipython3
+## Evaluate performance metrics of the NER1 and NER2 tools
+from medkit.text.metrics.ner import SeqEvalEvaluator
+import pandas as pd
+
+def results_to_df(_results, _title):
+    results_list = list(_results.items())
+    arranged_results = {"Entities": ['P', 'R', 'F1']}
+    accuracy = round(results_list[4][1], 2)
+
+    for i in range(5, len(results_list), 4):
+        key = results_list[i][0][:-10]
+        arranged_results[key] = [round(results_list[n][1], 2) for n in [i, i + 1, i + 2]]
+
+    df = pd.DataFrame(arranged_results, index=[f"{_title} (acc={accuracy})", '', '']).T
+    return df
+
+predicted_entities1=[]
+predicted_entities2=[]
+dfs = []
+
+for doc in docs:
+    predicted_entities1.append(doc.anns.get(label="chemical"))
+    predicted_entities2.append(doc.anns.get(label="m"))
+
+# Annotations of NER1 are labelled as 'chemical', NER2 as 'm', but as 'Drug' in the ground truth
+# The following dic enables remappings various labels of the same type of entites
+remapping= {"chemical": "Drug", "m": "Drug"}
+evaluator = SeqEvalEvaluator(return_metrics_by_label=True, average='weighted', labels_remapping=remapping) 
+# eval of NER2
+results1 = evaluator.compute(ref_docs, predicted_entities1)
+dfs.append(results_to_df(_results=results1, _title="NER1"))
+#print(results_to_df(_results=results1, _title="umls_matcher"))
+# eval of NER2
+results2 = evaluator.compute(ref_docs, predicted_entities2)
+dfs.append(results_to_df(_results=results2, _title="NER2"))
+
+print(pd.concat(dfs, axis=1))
+```
+
+```{code-cell} ipython3
+## Read new unannotated documents
+## Write annotations of tool NER2 in the brat format
+from medkit.io.brat import BratOutputConverter
+
+in_path = data_dir / "mtsamplesen" / "unannotated_doc"
+# reload raw documents
+final_docs = TextDocument.from_dir(
+    path=Path(in_path),
+    pattern='[A-Z0-9].txt',
+    encoding='utf-8',
+)
+# simplified pipeline, with only the best NER tool (NER2)
+pipeline2 = Pipeline(
+    steps=[
+        PipelineStep(
+            sentence_tokenizer,
+            input_keys=["full_text"],
+            output_keys=["sentence"],
+        ),
+        PipelineStep(
+            pii_detector,
+            input_keys=["sentence"],
+            output_keys=["sentence_"],
+        ),
+        PipelineStep(
+            bert_matcher,
+            input_keys=["sentence_"],
+            output_keys=["ner2_drug"],
+        ),
+    ],
+    input_keys=["full_text"],
+    output_keys=["ner2_drug"],
+)
+
+doc_pipeline2 = DocPipeline(pipeline=pipeline2)
+doc_pipeline2.run(final_docs)
+
+# filter annotations to keep only drug annotations
+# sensitive information can also be removed here
+output_docs = [
+    TextDocument(text=doc.text, anns=doc.anns.get(label="m"))
+    for doc in final_docs
+]
+
+# Define Output Converter with default params,
+# transfer all annotations and attributes
+brat_output_converter = BratOutputConverter()
+out_path = data_dir / "mtsamplesen" / "ner2_out"
+
+# save the annotation with the best tool (considering F1 only) in `out_path`
+brat_output_converter.save(
+    output_docs, 
+    dir_path=out_path,
+    doc_names=["ner2_6", "ner2_7"],
+)
+```
+
+Annotations of the discharge summary 6.txt, displayed with Brat
+
+![](./ner2_6.png)
+
++++
+
+Annotations of the discharge summary 7.txt (partial view), displayed with Brat
+
+![](./ner2_7.png)
diff --git a/docs/cookbook/drug_ner_eval/pipeline.png b/docs/cookbook/drug_ner_eval/pipeline.png
diff --git a/docs/index.md b/docs/index.md
@@ -95,6 +95,7 @@ cookbook/audio_transcription
 cookbook/audio_dataset_metrics
 cookbook/ontotox
 cookbook/ner_benchmark/index
+cookbook/drug_ner_eval/index
 ```
 
 ```{toctree}

diff --git a/pyproject.toml b/pyproject.toml
@@ -212,8 +212,10 @@ cov = [
 
 [tool.hatch.envs.docs]
 features = [
+  "deid",
   "docs",
   "hf-entity-matcher",
+  "metrics-ner",
   "spacy",
 ]
 python = "3.12"