Skip to content

Commit

Permalink
start example
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilranjan7 committed Oct 19, 2022
1 parent b7fceb6 commit 6110d51
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/medical_text_understanding/sample_data/notes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease caused by the expansion of a polyglutamine tract within the androgen receptor (AR). SBMA can be caused by this easily.
Keystone plant species such as fig trees are good for the soil.
91 changes: 91 additions & 0 deletions examples/medical_text_understanding/scispacy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import sys
from termcolor import colored

from forte.data.data_pack import DataPack
from forte.data.readers import PlainTextReader
from forte.pipeline import Pipeline
from forte.processors.writers import PackIdJsonPackWriter

from ft.onto.base_ontology import (
Token,
)
from fortex.spacy import SpacyProcessor

from ftx.medical.clinical_ontology import Hyponym, Abbreviation, Phrase
from fortex.health.processors.scispacy_processor import (
ScispaCyProcessor,
)


def main(
    input_path: str,  # Path to mimic3 data if use_mimic3_reader=True else path to notes directory
    output_path: str,  # Path to output directory
    max_packs: int = -1,  # Max number of notes to read from mimic3 dataset. Set to -1 to read all.
    use_mimic3_reader: bool = True,  # Read from mimic3 dataset or plain text
) -> None:
    """Build the processing pipeline, run it over ``input_path`` and show each pack.

    The pipeline consists of a reader (MIMIC-III discharge notes or plain
    text), a spaCy sentence step, an ICD coding step, and a JSON pack writer
    that stores its output under ``output_path``. Every processed pack is
    handed to ``show_data``.

    Args:
        input_path: Path to the MIMIC-III data when ``use_mimic3_reader`` is
            true, otherwise path to a directory of plain-text notes.
        output_path: Directory the serialized packs are written to.
        max_packs: Passed to the MIMIC-III reader as ``max_num_notes``;
            ``-1`` reads all notes. Not used with the plain-text reader.
        use_mimic3_reader: Selects the MIMIC-III reader when true and the
            plain-text reader otherwise.
    """
    # These two classes are used below but do not appear in this file's
    # module-level imports, so they are imported here to avoid a NameError.
    # NOTE(review): module paths follow the fortex.health package layout --
    # confirm against the installed version.
    from fortex.health.processors.icd_coding_processor import (
        ICDCodingProcessor,
    )
    from fortex.health.readers.mimic3_note_reader import (
        Mimic3DischargeNoteReader,
    )

    pl = Pipeline[DataPack]()

    if use_mimic3_reader:
        pl.set_reader(
            Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs}
        )
    else:
        pl.set_reader(PlainTextReader())

    pl.add(
        SpacyProcessor(),
        {"processors": ["sentence"], "lang": "en_ner_bionlp13cg_md"},
    )
    pl.add(
        ICDCodingProcessor(),
        {
            "entry_type": "ft.onto.base_ontology.Document",
            "attribute_name": "classification",
            "multi_class": True,
            "model_name": "AkshatSurolia/ICD-10-Code-Prediction",  # You can use other ICD predictors here.
            "cuda_devices": -1,
        },
    )
    pl.add(
        PackIdJsonPackWriter(),
        {
            "output_dir": output_path,
            "indent": 2,
            "overwrite": True,
            "drop_record": True,
            "zip_pack": True,
        },
    )

    pl.initialize()

    # Display each pack as it is produced by the pipeline.
    for pack in pl.process_dataset(input_path):
        show_data(pack)


def show_data(pack: DataPack) -> None:
    """Print every article in ``pack`` together with its ICD code.

    The ICD processor predicts an ICD code for each article and stores it as
    ``article.icd_code``; the articles themselves live inside the DataPack.
    This function therefore iterates over the articles in the pack, prints
    each article's text and code, and waits for ENTER before moving on.

    Args:
        pack: The processed DataPack whose articles should be displayed.
    """
    # MedicalArticle is not among this file's module-level imports, so it is
    # imported here (from the ontology module the file already uses) to avoid
    # a NameError at the pack.get() call.
    from ftx.medical.clinical_ontology import MedicalArticle

    for article in pack.get(MedicalArticle):
        article_text = article.text

        # Get the ICD code and its coding version.
        icd_code = article.icd_code
        icd_version = article.icd_version

        print(colored("Article:", "red"), article_text, "\n")
        print(colored(f"ICD-{icd_version} Code:", "cyan"), icd_code, "\n")

        # Pause so the reader can inspect one article at a time.
        input(colored("Press ENTER to continue...\n", "green"))


# Examples:
#
# Read from MIMIC3:
# python scispacy.py /path/to/mimiciii/1.4/NOTEEVENTS.csv.gz /path_to_sample_output 1000 True
#
# Read from sample_data:
# python scispacy.py sample_data/ /path_to_sample_output 1000 False
if __name__ == "__main__":
    # Guarded so that importing this module does not start the pipeline.
    if len(sys.argv) < 5:
        # Fail with a readable message instead of an IndexError.
        sys.exit(
            f"usage: python {sys.argv[0]} INPUT_PATH OUTPUT_PATH "
            "MAX_PACKS USE_MIMIC3_READER(True|False)"
        )
    main(
        sys.argv[1],
        sys.argv[2],
        int(sys.argv[3]),
        sys.argv[4].lower() == "true",
    )
18 changes: 18 additions & 0 deletions fortex/health/processors/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import spacy
from timexy import Timexy

# Load the spaCy pipeline that the timexy component will be added to.
nlp = spacy.load("en_core_web_sm")

# Optional timexy settings; only needed when deviating from the defaults.
config = dict(
    kb_id_type="timex3",  # possible values: 'timex3'(default), 'timestamp'
    label="timexy",  # default: 'timexy'
    overwrite=False,  # default: False
)
# Register timexy ahead of the "ner" component.
nlp.add_pipe("timexy", before="ner", config=config)

sample_text = (
    "Today is the 10.10.2010. I was in Paris for six years. 2 pm 3 days ago"
)
doc = nlp(sample_text)

# One tab-separated line per entity: text, label, knowledge-base id.
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_, sep="\t")

0 comments on commit 6110d51

Please sign in to comment.