From 6110d51733e9dfe6edf2f724ea22957361de7ba8 Mon Sep 17 00:00:00 2001
From: nikhilranjan7 <nikhilranjan7@gmail.com>
Date: Wed, 19 Oct 2022 14:00:57 +0400
Subject: [PATCH] start example

---
 .../sample_data/notes.txt                     |  2 +
 .../medical_text_understanding/scispacy.py    | 91 +++++++++++++++++++
 fortex/health/processors/test.py              | 18 ++++
 3 files changed, 111 insertions(+)
 create mode 100644 examples/medical_text_understanding/sample_data/notes.txt
 create mode 100644 examples/medical_text_understanding/scispacy.py
 create mode 100644 fortex/health/processors/test.py

diff --git a/examples/medical_text_understanding/sample_data/notes.txt b/examples/medical_text_understanding/sample_data/notes.txt
new file mode 100644
index 00000000..96ff5feb
--- /dev/null
+++ b/examples/medical_text_understanding/sample_data/notes.txt
@@ -0,0 +1,2 @@
+Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease caused by the expansion of a polyglutamine tract within the androgen receptor (AR). SBMA can be caused by this easily.
+Keystone plant species such as fig trees are good for the soil.
\ No newline at end of file
diff --git a/examples/medical_text_understanding/scispacy.py b/examples/medical_text_understanding/scispacy.py
new file mode 100644
index 00000000..f503b60e
--- /dev/null
+++ b/examples/medical_text_understanding/scispacy.py
@@ -0,0 +1,91 @@
+import sys
+from termcolor import colored
+
+from forte.data.data_pack import DataPack
+from forte.data.readers import PlainTextReader
+from forte.pipeline import Pipeline
+from forte.processors.writers import PackIdJsonPackWriter
+
+from ft.onto.base_ontology import (
+    Token,
+)
+from fortex.spacy import SpacyProcessor
+
+from ftx.medical.clinical_ontology import Hyponym, Abbreviation, Phrase
+from fortex.health.processors.scispacy_processor import (
+    ScispaCyProcessor,
+)
+
+
+def main(
+    input_path: str,  # Path to mimic3 data if use_mimic3_reader=True else path to notes directory
+    output_path: str,  # Path to output directory
+    max_packs: int = -1,  # Max number of notes to read from mimic3 dataset. Set to -1 to read all.
+    use_mimic3_reader: bool = True,  # Read from mimic3 dataset or plain text
+):
+    """Build the notes pipeline, run it over input_path, and show every resulting pack."""
+    # Imported locally: the module header does not bring these two names into scope.
+    from fortex.health.processors.icd_coding_processor import ICDCodingProcessor
+    from fortex.health.readers.mimic3_note_reader import Mimic3DischargeNoteReader
+
+    pl = Pipeline[DataPack]()
+    if use_mimic3_reader is False:
+        pl.set_reader(PlainTextReader())
+    else:
+        pl.set_reader(Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs})
+    pl.add(
+        SpacyProcessor(),
+        {"processors": ["sentence"], "lang": "en_ner_bionlp13cg_md"},
+    )
+    pl.add(
+        ICDCodingProcessor(),
+        {
+            "entry_type": "ft.onto.base_ontology.Document",
+            "attribute_name": "classification",
+            "multi_class": True,
+            "model_name": "AkshatSurolia/ICD-10-Code-Prediction",  # You can use other ICD predictors here.
+            "cuda_devices": -1,
+        },
+    )
+    pl.add(
+        PackIdJsonPackWriter(),
+        {
+            "output_dir": output_path,
+            "indent": 2,
+            "overwrite": True,
+            "drop_record": True,
+            "zip_pack": True,
+        },
+    )
+    pl.initialize()
+    for pack in pl.process_dataset(input_path):
+        show_data(pack)
+
+
+def show_data(pack: DataPack):
+    """Print each article in pack with its ICD code, pausing for ENTER after each.
+
+    The ICD processor stores its prediction on the article as icd_code, with
+    the coding version in icd_version.
+    """
+    # MedicalArticle is not imported at module level, so bring it in here.
+    from ftx.medical.clinical_ontology import MedicalArticle
+
+    for article in pack.get(MedicalArticle):
+        # get the ICD code and its coding version
+        icd_code = article.icd_code
+        icd_version = article.icd_version
+
+        print(colored("Article:", "red"), article.text, "\n")
+        print(colored(f"ICD-{icd_version} Code:", "cyan"), icd_code, "\n")
+        input(colored("Press ENTER to continue...\n", "green"))
+
+
+# Examples (all four arguments are required):
+# Read from MIMIC3:
+#   python scispacy.py /path/to/mimiciii/1.4/NOTEEVENTS.csv.gz /path_to_sample_output 1000 True
+#
+# Read from sample_data:
+#   python scispacy.py sample_data/ /path_to_sample_output 1000 False
+if __name__ == "__main__":
+    main(sys.argv[1], sys.argv[2], int(sys.argv[3]), sys.argv[4].lower() == "true")
diff --git a/fortex/health/processors/test.py b/fortex/health/processors/test.py
new file mode 100644
index 00000000..d9fd9f86
--- /dev/null
+++ b/fortex/health/processors/test.py
@@ -0,0 +1,18 @@
+import spacy
+from timexy import Timexy
+
+# Settings passed to the timexy component (defaults noted per key).
+config = {
+    "kb_id_type": "timex3",  # possible values: 'timex3'(default), 'timestamp'
+    "label": "timexy",  # default: 'timexy'
+    "overwrite": False,  # default: False
+}
+
+# Load the English model and insert timexy ahead of its "ner" component.
+nlp = spacy.load("en_core_web_sm")
+nlp.add_pipe("timexy", config=config, before="ner")
+
+# Print text, label and kb id of every entity found, tab-separated.
+doc = nlp("Today is the 10.10.2010. I was in Paris for six years. 2 pm 3 days ago")
+for ent in doc.ents:
+    print(ent.text, ent.label_, ent.kb_id_, sep="\t")