From 6110d51733e9dfe6edf2f724ea22957361de7ba8 Mon Sep 17 00:00:00 2001
From: nikhilranjan7 <nikhilranjan7@gmail.com>
Date: Wed, 19 Oct 2022 14:00:57 +0400
Subject: [PATCH] start example

---
Reviewer note: the file contents below were revised during review. The
abbreviated blob ids on the `index` lines come from the original commit and
were not regenerated.

 .../sample_data/notes.txt                     |   2 +
 .../medical_text_understanding/scispacy.py    | 138 ++++++++++++++++++
 fortex/health/processors/test.py              |  31 ++++
 3 files changed, 171 insertions(+)
 create mode 100644 examples/medical_text_understanding/sample_data/notes.txt
 create mode 100644 examples/medical_text_understanding/scispacy.py
 create mode 100644 fortex/health/processors/test.py

diff --git a/examples/medical_text_understanding/sample_data/notes.txt b/examples/medical_text_understanding/sample_data/notes.txt
new file mode 100644
index 00000000..96ff5feb
--- /dev/null
+++ b/examples/medical_text_understanding/sample_data/notes.txt
@@ -0,0 +1,2 @@
+Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease caused by the expansion of a polyglutamine tract within the androgen receptor (AR). SBMA can be caused by this easily.
+Keystone plant species such as fig trees are good for the soil.
\ No newline at end of file
diff --git a/examples/medical_text_understanding/scispacy.py b/examples/medical_text_understanding/scispacy.py
new file mode 100644
index 00000000..f503b60e
--- /dev/null
+++ b/examples/medical_text_understanding/scispacy.py
@@ -0,0 +1,138 @@
+"""Medical text understanding example (work in progress).
+
+Builds a Forte pipeline that reads clinical notes, splits them into sentences
+with spaCy, predicts an ICD code for each article and writes the processed
+packs to the output directory.
+
+NOTE(review): when run as a script, this file's directory is put first on
+``sys.path``, so the name ``scispacy.py`` may shadow the installed ``scispacy``
+package -- consider renaming the file before building on it.
+"""
+import sys
+
+from termcolor import colored
+
+from forte.data.data_pack import DataPack
+from forte.data.readers import PlainTextReader
+from forte.pipeline import Pipeline
+from forte.processors.writers import PackIdJsonPackWriter
+
+from ft.onto.base_ontology import (
+    Token,
+)
+from fortex.spacy import SpacyProcessor
+
+from ftx.medical.clinical_ontology import (
+    Abbreviation,
+    Hyponym,
+    MedicalArticle,
+    Phrase,
+)
+from fortex.health.processors.icd_coding_processor import ICDCodingProcessor
+from fortex.health.processors.scispacy_processor import (
+    ScispaCyProcessor,
+)
+from fortex.health.readers.mimic3_note_reader import Mimic3DischargeNoteReader
+
+
+def main(
+    input_path: str,
+    output_path: str,
+    max_packs: int = -1,
+    use_mimic3_reader: bool = True,
+):
+    """Run the pipeline over ``input_path`` and display every resulting pack.
+
+    Args:
+        input_path: Path to the MIMIC-III data if ``use_mimic3_reader`` is
+            true, otherwise path to the notes directory.
+        output_path: Path to the output directory.
+        max_packs: Max number of notes to read from the MIMIC-III dataset.
+            Set to -1 to read all.
+        use_mimic3_reader: Read from the MIMIC-III dataset if true, from
+            plain text files otherwise.
+    """
+    pl = Pipeline[DataPack]()
+
+    if use_mimic3_reader:
+        pl.set_reader(
+            Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs}
+        )
+    else:
+        pl.set_reader(PlainTextReader())
+
+    pl.add(
+        SpacyProcessor(),
+        {"processors": ["sentence"], "lang": "en_ner_bionlp13cg_md"},
+    )
+    pl.add(
+        ICDCodingProcessor(),
+        {
+            "entry_type": "ft.onto.base_ontology.Document",
+            "attribute_name": "classification",
+            "multi_class": True,
+            # You can use other ICD predictors here.
+            "model_name": "AkshatSurolia/ICD-10-Code-Prediction",
+            "cuda_devices": -1,
+        },
+    )
+    pl.add(
+        PackIdJsonPackWriter(),
+        {
+            "output_dir": output_path,
+            "indent": 2,
+            "overwrite": True,
+            "drop_record": True,
+            "zip_pack": True,
+        },
+    )
+
+    pl.initialize()
+
+    for pack in pl.process_dataset(input_path):
+        show_data(pack)
+
+
+def show_data(pack: DataPack):
+    """Print every article in ``pack`` together with its predicted ICD code.
+
+    Blocks on ENTER after each article so the output can be read one at a time.
+    """
+    # The ICD processor predicts ICD code for each article.
+    # The result is stored as article.icd_code.
+    # The articles are packed into DataPack.
+    # Therefore, we first extract articles from DataPack and then get their
+    # ICD codes.
+    for article in pack.get(MedicalArticle):
+        article_text = article.text
+
+        # get the ICD code and its coding version
+        icd_code = article.icd_code
+        icd_version = article.icd_version
+
+        print(colored("Article:", "red"), article_text, "\n")
+        print(colored(f"ICD-{icd_version} Code:", "cyan"), icd_code, "\n")
+
+        input(colored("Press ENTER to continue...\n", "green"))
+
+
+# Examples:
+#
+# Read from MIMIC3:
+# python scispacy.py /path/to/mimiciii/1.4/NOTEEVENTS.csv.gz /path_to_sample_output 1000 True
+#
+# Read from sample_data:
+# python scispacy.py sample_data/ /path_to_sample_output 1000 False
+if __name__ == "__main__":
+    # Fail with a usage message instead of an IndexError on missing arguments.
+    if len(sys.argv) < 5:
+        sys.exit(
+            "Usage: python scispacy.py INPUT_PATH OUTPUT_PATH "
+            "MAX_PACKS USE_MIMIC3_READER(True|False)"
+        )
+    main(
+        sys.argv[1],
+        sys.argv[2],
+        int(sys.argv[3]),
+        sys.argv[4].lower() == "true",
+    )
diff --git a/fortex/health/processors/test.py b/fortex/health/processors/test.py
new file mode 100644
index 00000000..d9fd9f86
--- /dev/null
+++ b/fortex/health/processors/test.py
@@ -0,0 +1,31 @@
+"""Scratch demo: print the entities spaCy + timexy find in a sample text."""
+import spacy
+
+# Not referenced below; presumably imported so that the "timexy" pipe name
+# passed to add_pipe() is registered with spaCy -- TODO confirm.
+from timexy import Timexy  # noqa: F401
+
+SAMPLE_TEXT = (
+    "Today is the 10.10.2010. I was in Paris for six years. 2 pm 3 days ago"
+)
+
+
+def main():
+    """Load the spaCy model, add the timexy pipe and print each entity."""
+    nlp = spacy.load("en_core_web_sm")
+
+    # Optionally add config if varying from default values
+    config = {
+        # possible values: 'timex3'(default), 'timestamp'
+        "kb_id_type": "timex3",
+        "label": "timexy",  # default: 'timexy'
+        "overwrite": False,  # default: False
+    }
+    nlp.add_pipe("timexy", config=config, before="ner")
+
+    for e in nlp(SAMPLE_TEXT).ents:
+        print(f"{e.text}\t{e.label_}\t{e.kb_id_}")
+
+
+if __name__ == "__main__":
+    main()