From 80cfe19926c0596edd13985581e8ca01a7be86ad Mon Sep 17 00:00:00 2001 From: feipenghe Date: Wed, 9 Feb 2022 13:14:04 -0800 Subject: [PATCH] Biological NER predictor pack() missing context parameter (#85) --- .github/workflows/main.yml | 5 +- .../fortex/huggingface/bio_ner_predictor.py | 15 +-- src/huggingface/setup.py | 1 + .../bio_ner_predictor/D_ICD_DIAGNOSES.csv | 10 ++ .../bio_ner_predictor/bio_ner_config.yml | 7 ++ .../bio_ner_predictor/demo/.generated | 0 .../bio_ner_predictor/demo/__init__.py | 1 + .../bio_ner_predictor/demo/clinical.py | 49 ++++++++++ .../bio_ner_predictor/mimic3_note_reader.py | 80 ++++++++++++++++ tests/wrappers/bio_ner_predictor_test.py | 93 +++++++++++++++++++ 10 files changed, 253 insertions(+), 8 deletions(-) create mode 100644 tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv create mode 100644 tests/wrappers/bio_ner_predictor/bio_ner_config.yml create mode 100644 tests/wrappers/bio_ner_predictor/demo/.generated create mode 100644 tests/wrappers/bio_ner_predictor/demo/__init__.py create mode 100644 tests/wrappers/bio_ner_predictor/demo/clinical.py create mode 100644 tests/wrappers/bio_ner_predictor/mimic3_note_reader.py create mode 100644 tests/wrappers/bio_ner_predictor_test.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ba21d7d..f926bf7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -90,6 +90,7 @@ jobs: - { dep: elastic, testfile: tests/wrappers/elastic_indexers_test.py } - { dep: faiss, testfile: tests/wrappers/faiss_indexers_test.py } - { dep: "huggingface nltk", extra: "'tensorflow>=2.5.0,<2.8.0'", testfile: tests/wrappers/huggingface } + - { dep: "huggingface elastic nltk", testfile: tests/wrappers/bio_ner_predictor_test.py} steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -127,7 +128,9 @@ jobs: cd forte pip install --progress-bar off . - name: Start elastic server if test elastic search - if: ${{ matrix.test-details.dep == 'elastic' }} + if: ${{ matrix.test-details.dep == 'elastic' || + contains(matrix.test-details.dep, 'elastic') + }} run: | wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.4.2-linux-x86_64.tar.gz tar -zxf elasticsearch-7.4.2-linux-x86_64.tar.gz diff --git a/src/huggingface/fortex/huggingface/bio_ner_predictor.py b/src/huggingface/fortex/huggingface/bio_ner_predictor.py index c60294f..16020be 100644 --- a/src/huggingface/fortex/huggingface/bio_ner_predictor.py +++ b/src/huggingface/fortex/huggingface/bio_ner_predictor.py @@ -1,11 +1,13 @@ # pylint: disable=logging-fstring-interpolation from typing import Dict, List, Optional, Tuple, Any, Set - +import logging import numpy as np import torch + from forte.common.configuration import Config from forte.common.resources import Resources from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation from forte.processors.base.batch_processor import RequestPackingProcessor from ft.onto.base_ontology import EntityMention, Subword from transformers import ( @@ -191,15 +193,16 @@ def pack( self, data_pack: DataPack, output_dict: Optional[Dict[str, Dict[str, List[Any]]]] = None, + context: Optional[Annotation] = None, ): """ - Write the prediction results back to datapack. by writing the predicted - ner to the original subwords and convert predictions to something that - makes sense in a word-by-word segmentation + Write the prediction results back to datapack by aggregating subwords + into named entity mentions. 
""" - if output_dict is None: return + if context is not None: + logging.warning("context parameter is not used in pack() method.") for i in range(len(output_dict["Subword"]["tid"])): tids = output_dict["Subword"]["tid"][i] @@ -211,7 +214,6 @@ def pack( for idx, (label, tid) in enumerate(zip(labels, tids)) if label not in self.ft_configs.ignore_labels ] - entity_groups = self._compose_entities(entities, data_pack, tids) # Add NER tags and create EntityMention ontologies. for first_idx, last_idx in entity_groups: @@ -220,7 +222,6 @@ def pack( last_token: Subword = data_pack.get_entry(tids[last_idx]) end = last_token.span.end - entity = EntityMention(data_pack, begin, end) entity.ner_type = self.ft_configs.ner_type diff --git a/src/huggingface/setup.py b/src/huggingface/setup.py index 4922f8d..b699411 100644 --- a/src/huggingface/setup.py +++ b/src/huggingface/setup.py @@ -31,6 +31,7 @@ "forte==0.1.2", "more-itertools>=8.0.0", "transformers == 4.2.2", + "numpy == 1.19.5", ], classifiers=[ "Intended Audience :: Developers", diff --git a/tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv b/tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv new file mode 100644 index 0000000..017326e --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv @@ -0,0 +1,10 @@ +row_id,icd9_code,short_title,long_title +1,01716,Erythem nod tb-oth test,"Erythema nodosum with hypersensitivity reaction in tuberculosis, tubercle bacilli not found by bacteriological or histological examination, but tuberculosis confirmed by other methods [inoculation of animals]" +378,0879,Relapsing fever NOS,"Relapsing fever, unspecified" +379,0880,Bartonellosis,Bartonellosis +380,08881,Lyme disease,Lyme Disease +392,0905,Late congen syph symptom,"Other late congenital syphilis, symptomatic" +420,09324,Syphil pulmonary valve,Syphilitic endocarditis of pulmonary valve +434,09486,Syphil acoustic neuritis,Syphilitic acoustic neuritis +463,09830,Chr gc upper gu NOS,"Chronic gonococcal infection of upper genitourinary tract, site unspecified" +523,04521,Nonparalyt polio-type 1,"Acute nonparalytic poliomyelitis, poliovirus type I" \ No newline at end of file diff --git a/tests/wrappers/bio_ner_predictor/bio_ner_config.yml b/tests/wrappers/bio_ner_predictor/bio_ner_config.yml new file mode 100644 index 0000000..4672a70 --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/bio_ner_config.yml @@ -0,0 +1,7 @@ +BERTTokenizer: + model_path: "resources/NCBI-disease" + +BioBERTNERPredictor: + model_path: "resources/NCBI-disease" + ner_type: "DISEASE" + ignore_labels: ["O"] \ No newline at end of file diff --git a/tests/wrappers/bio_ner_predictor/demo/.generated b/tests/wrappers/bio_ner_predictor/demo/.generated new file mode 100644 index 0000000..e69de29 diff --git a/tests/wrappers/bio_ner_predictor/demo/__init__.py b/tests/wrappers/bio_ner_predictor/demo/__init__.py new file mode 100644 index 0000000..49ecbbf --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/demo/__init__.py @@ -0,0 +1 @@ +# ***automatically_generated*** diff --git a/tests/wrappers/bio_ner_predictor/demo/clinical.py b/tests/wrappers/bio_ner_predictor/demo/clinical.py new file mode 100644 index 0000000..68541b4 --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/demo/clinical.py @@ -0,0 +1,49 @@ +# ***automatically_generated*** +# ***source json:examples/clinical_pipeline/clinical_onto.json*** +# flake8: noqa +# mypy: ignore-errors +# pylint: skip-file +""" +Automatically generated ontology clinical. Do not change manually. 
+""" + +from dataclasses import dataclass +from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation +from ft.onto.base_ontology import EntityMention + +__all__ = [ + "ClinicalEntityMention", + "Description", + "Body", +] + + +@dataclass +class ClinicalEntityMention(EntityMention): + """ + A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Description(Annotation): + """ + A span based annotation `Description`, used to represent the description in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Body(Annotation): + """ + A span based annotation `Body`, used to represent the actual content in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) diff --git a/tests/wrappers/bio_ner_predictor/mimic3_note_reader.py b/tests/wrappers/bio_ner_predictor/mimic3_note_reader.py new file mode 100644 index 0000000..1f43f5c --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/mimic3_note_reader.py @@ -0,0 +1,80 @@ +# Copyright 2021 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import logging +from pathlib import Path +from typing import Any, Iterator, Union, List + +from smart_open import open + +from bio_ner_predictor.demo.clinical import Description, Body +from forte.data.data_pack import DataPack +from forte.data.base_reader import PackReader + + +class Mimic3DischargeNoteReader(PackReader): + """This class is designed to read the discharge notes from MIMIC3 dataset + as plain text packs. + + For more information for the dataset, visit: + https://mimic.physionet.org/ + """ + + def __init__(self): + super().__init__() + self.headers: List[str] = [] + self.text_col = -1 # Default to be last column. + self.description_col = 0 # Default to be first column. + self.__note_count = 0 # Count number of notes processed. 
+ + def _collect( # type: ignore + self, mimic3_path: Union[Path, str] + ) -> Iterator[Any]: + with open(mimic3_path) as f: + for r in csv.reader(f): + if 0 < self.configs.max_num_notes <= self.__note_count: + break + yield r + + def _parse_pack(self, row: List[str]) -> Iterator[DataPack]: + if len(self.headers) == 0: + self.headers.extend(row) + for i, h in enumerate(self.headers): + if h == "TEXT": + self.text_col = i + logging.info("Text Column is %d", i) + if h == "DESCRIPTION": + self.description_col = i + logging.info("Description Column is %d", i) + else: + pack: DataPack = DataPack() + description: str = row[self.description_col] + text: str = row[self.text_col] + delimiter = "\n-----------------\n" + full_text = description + delimiter + text + pack.set_text(full_text) + + Description(pack, 0, len(description)) + Body(pack, len(description) + len(delimiter), len(full_text)) + self.__note_count += 1 + yield pack + + @classmethod + def default_configs(cls): + config = super().default_configs() + # If this is set (>0), the reader will only read up to + # the number specified. + config["max_num_notes"] = -1 + return config diff --git a/tests/wrappers/bio_ner_predictor_test.py b/tests/wrappers/bio_ner_predictor_test.py new file mode 100644 index 0000000..e548e2c --- /dev/null +++ b/tests/wrappers/bio_ner_predictor_test.py @@ -0,0 +1,93 @@ +import sys +import time +import os +import yaml +from bio_ner_predictor.mimic3_note_reader import Mimic3DischargeNoteReader + +from fortex.elastic import ElasticSearchPackIndexProcessor +from fortex.huggingface.bio_ner_predictor import BioBERTNERPredictor +from fortex.huggingface.transformers_processor import BERTTokenizer + +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.pipeline import Pipeline +from forte.processors.writers import PackIdJsonPackWriter +from fortex.nltk import NLTKSentenceSegmenter +import unittest +from ddt import ddt, data, unpack +from forte.data.data_utils import maybe_download +from ft.onto.base_ontology import EntityMention + +@ddt +class TestBioNerPredictor(unittest.TestCase): + r"""Tests Elastic Indexer.""" + + def setUp(self): + self.pl = Pipeline[DataPack]() + + script_dir_path = os.path.dirname(os.path.abspath(__file__)) + data_folder = "bio_ner_predictor" + self.output_path = os.path.join(script_dir_path,data_folder, "test_case_output/") + config_path = os.path.join(script_dir_path,data_folder,"bio_ner_config.yml") + self.input_path = os.path.join(script_dir_path,data_folder, "D_ICD_DIAGNOSES.csv") + self.num_packs = 5 + + # download resources + urls = [ + "https://drive.google.com/file/d/15RSfFkW9syQKtx-_fQ9KshN3BJ27Jf8t/" + "view?usp=sharing", + "https://drive.google.com/file/d/1Nh7D6Xam5JefdoSXRoL7S0DZK1d4i2UK/" + "view?usp=sharing", + "https://drive.google.com/file/d/1YWcI60lGKtTFH01Ai1HnwOKBsrFf2r29/" + "view?usp=sharing", + "https://drive.google.com/file/d/1ElHUEMPQIuWmV0GimroqFphbCvFKskYj/" + "view?usp=sharing", + "https://drive.google.com/file/d/1EhMXlieoEg-bGUbbQ2vN-iyNJvC4Dajl/" + "view?usp=sharing", + ] + + filenames = [ + "config.json", + "pytorch_model.bin", + "special_tokens_map.json", + "tokenizer_config.json", + "vocab.txt", + ] + model_path = os.path.abspath("resources/NCBI-disease") + config = yaml.safe_load(open(config_path, "r")) + config = Config(config, default_hparams=None) + config.BERTTokenizer.model_path = model_path + config.BioBERTNERPredictor.model_path = model_path + maybe_download(urls=urls, path=model_path, filenames=filenames) + 
        self.assertTrue(os.path.exists(os.path.join(model_path, "pytorch_model.bin")))
        self.pl.set_reader(
            Mimic3DischargeNoteReader(), config={"max_num_notes": self.num_packs}
        )
        self.pl.add(NLTKSentenceSegmenter())

        self.pl.add(BERTTokenizer(), config=config.BERTTokenizer)
        self.pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
        self.pl.add(ElasticSearchPackIndexProcessor())
        self.pl.add(
            PackIdJsonPackWriter(),
            {
                "output_dir": self.output_path,
                "indent": 2,
                "overwrite": True,
                "drop_record": True,
                "zip_pack": True,
            },
        )
        self.pl.initialize()

    def test_predict(self):
        for idx, data_pack in enumerate(self.pl.process_dataset(self.input_path)):
            ems = list(data_pack.get_data(EntityMention))
            self.assertTrue(len(ems) > 0)

        self.assertEqual(len(os.listdir(self.output_path)), self.num_packs)
        for f_name in os.listdir(self.output_path):
            os.remove(os.path.join(self.output_path, f_name))
        os.removedirs(self.output_path)
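
Usage note: the test above doubles as a reference pipeline. Below is a minimal standalone sketch built from the same components, not part of the patch itself. It assumes the NCBI-disease model files have already been downloaded to resources/NCBI-disease (for example via maybe_download() as in setUp), that D_ICD_DIAGNOSES.csv is available in the working directory, and that the script is run from tests/wrappers so the bio_ner_predictor package is importable; the ElasticSearch indexer and pack writer are dropped to keep it self-contained, and the inline configs mirror bio_ner_config.yml.

    from forte.data.data_pack import DataPack
    from forte.pipeline import Pipeline
    from fortex.huggingface.bio_ner_predictor import BioBERTNERPredictor
    from fortex.huggingface.transformers_processor import BERTTokenizer
    from fortex.nltk import NLTKSentenceSegmenter
    from ft.onto.base_ontology import EntityMention

    from bio_ner_predictor.mimic3_note_reader import Mimic3DischargeNoteReader

    MODEL_PATH = "resources/NCBI-disease"  # assumed location of the downloaded model

    pl = Pipeline[DataPack]()
    # Read at most 5 discharge notes from the MIMIC3-style CSV.
    pl.set_reader(Mimic3DischargeNoteReader(), config={"max_num_notes": 5})
    pl.add(NLTKSentenceSegmenter())
    # Same settings as bio_ner_config.yml, inlined for a standalone script.
    pl.add(BERTTokenizer(), config={"model_path": MODEL_PATH})
    pl.add(
        BioBERTNERPredictor(),
        config={
            "model_path": MODEL_PATH,
            "ner_type": "DISEASE",
            "ignore_labels": ["O"],
        },
    )
    pl.initialize()

    for pack in pl.process_dataset("D_ICD_DIAGNOSES.csv"):
        for em in pack.get(EntityMention):
            print(em.text, em.ner_type)

Note also that BioBERTNERPredictor.pack() now accepts an optional context argument so that its signature matches the batch-processor interface; the predictor does not use it yet and only logs a warning when one is supplied.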