Biological NER predictor pack() missing context parameter (#85)
hepengfe authored Feb 9, 2022
1 parent 9ea5c44 commit 80cfe19
Showing 10 changed files with 253 additions and 8 deletions.
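In short, the commit adds the `context` parameter that was missing from `BioBERTNERPredictor.pack()` (the value is currently unused beyond a warning), registers a new CI matrix entry for the test, and adds an end-to-end test driven by a small D_ICD_DIAGNOSES.csv sample. A condensed view of the updated method, excerpted from the diff below:

def pack(
    self,
    data_pack: DataPack,
    output_dict: Optional[Dict[str, Dict[str, List[Any]]]] = None,
    context: Optional[Annotation] = None,  # newly added parameter
):
    if output_dict is None:
        return
    if context is not None:
        logging.warning("context parameter is not used in pack() method.")
    ...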
5 changes: 4 additions & 1 deletion .github/workflows/main.yml
@@ -90,6 +90,7 @@ jobs:
- { dep: elastic, testfile: tests/wrappers/elastic_indexers_test.py }
- { dep: faiss, testfile: tests/wrappers/faiss_indexers_test.py }
- { dep: "huggingface nltk", extra: "'tensorflow>=2.5.0,<2.8.0'", testfile: tests/wrappers/huggingface }
- { dep: "huggingface elastic nltk", testfile: tests/wrappers/bio_ner_predictor_test.py}
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@@ -127,7 +128,9 @@ jobs:
cd forte
pip install --progress-bar off .
- name: Start elastic server if test elastic search
if: ${{ matrix.test-details.dep == 'elastic' }}
if: ${{ matrix.test-details.dep == 'elastic' ||
contains(matrix.test-details.dep, 'elastic')
}}
run: |
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.4.2-linux-x86_64.tar.gz
tar -zxf elasticsearch-7.4.2-linux-x86_64.tar.gz
15 changes: 8 additions & 7 deletions src/huggingface/fortex/huggingface/bio_ner_predictor.py
@@ -1,11 +1,13 @@
# pylint: disable=logging-fstring-interpolation
from typing import Dict, List, Optional, Tuple, Any, Set

import logging
import numpy as np
import torch

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.data.ontology.top import Annotation
from forte.processors.base.batch_processor import RequestPackingProcessor
from ft.onto.base_ontology import EntityMention, Subword
from transformers import (
@@ -191,15 +193,16 @@ def pack(
self,
data_pack: DataPack,
output_dict: Optional[Dict[str, Dict[str, List[Any]]]] = None,
context: Optional[Annotation] = None,
):
"""
Write the prediction results back to datapack. by writing the predicted
ner to the original subwords and convert predictions to something that
makes sense in a word-by-word segmentation
Write the prediction results back to datapack by aggregating subwords
into named entity mentions.
"""

if output_dict is None:
return
if context is not None:
logging.warning("context parameter is not used in pack() method.")

for i in range(len(output_dict["Subword"]["tid"])):
tids = output_dict["Subword"]["tid"][i]
@@ -211,7 +214,6 @@
for idx, (label, tid) in enumerate(zip(labels, tids))
if label not in self.ft_configs.ignore_labels
]

entity_groups = self._compose_entities(entities, data_pack, tids)
# Add NER tags and create EntityMention ontologies.
for first_idx, last_idx in entity_groups:
@@ -220,7 +222,6 @@

last_token: Subword = data_pack.get_entry(tids[last_idx])
end = last_token.span.end

entity = EntityMention(data_pack, begin, end)
entity.ner_type = self.ft_configs.ner_type

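For background on why the missing parameter mattered: `pack()` overrides a method of `RequestPackingProcessor`, and the framework presumably supplies a context annotation when writing predictions back, so the old two-argument signature would fail at that call. A hypothetical sketch of the failure mode; the actual call site inside the base class is not shown in this diff, and the argument name and error text are assumptions:

# Hypothetical illustration only; `sentence` stands in for whatever
# Annotation the batch processor uses as the packing context.
predictor.pack(data_pack, output_dict, context=sentence)
# TypeError: pack() got an unexpected keyword argument 'context'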
1 change: 1 addition & 0 deletions src/huggingface/setup.py
@@ -31,6 +31,7 @@
"forte==0.1.2",
"more-itertools>=8.0.0",
"transformers == 4.2.2",
"numpy == 1.19.5",
],
classifiers=[
"Intended Audience :: Developers",
10 changes: 10 additions & 0 deletions tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv
@@ -0,0 +1,10 @@
row_id,icd9_code,short_title,long_title
1,01716,Erythem nod tb-oth test,"Erythema nodosum with hypersensitivity reaction in tuberculosis, tubercle bacilli not found by bacteriological or histological examination, but tuberculosis confirmed by other methods [inoculation of animals]"
378,0879,Relapsing fever NOS,"Relapsing fever, unspecified"
379,0880,Bartonellosis,Bartonellosis
380,08881,Lyme disease,Lyme Disease
392,0905,Late congen syph symptom,"Other late congenital syphilis, symptomatic"
420,09324,Syphil pulmonary valve,Syphilitic endocarditis of pulmonary valve
434,09486,Syphil acoustic neuritis,Syphilitic acoustic neuritis
463,09830,Chr gc upper gu NOS,"Chronic gonococcal infection of upper genitourinary tract, site unspecified"
523,04521,Nonparalyt polio-type 1,"Acute nonparalytic poliomyelitis, poliovirus type I"
7 changes: 7 additions & 0 deletions tests/wrappers/bio_ner_predictor/bio_ner_config.yml
@@ -0,0 +1,7 @@
BERTTokenizer:
model_path: "resources/NCBI-disease"

BioBERTNERPredictor:
model_path: "resources/NCBI-disease"
ner_type: "DISEASE"
ignore_labels: ["O"]
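For reference, the test added below consumes this file by loading it with `yaml.safe_load`, wrapping it in a Forte `Config`, and pointing both `model_path` entries at the locally downloaded NCBI-disease checkpoint, roughly as follows (condensed from the test):

config = Config(yaml.safe_load(open(config_path, "r")), default_hparams=None)
config.BERTTokenizer.model_path = model_path
config.BioBERTNERPredictor.model_path = model_path
pl.add(BERTTokenizer(), config=config.BERTTokenizer)
pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)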
Empty file.
1 change: 1 addition & 0 deletions tests/wrappers/bio_ner_predictor/demo/__init__.py
@@ -0,0 +1 @@
# ***automatically_generated***
49 changes: 49 additions & 0 deletions tests/wrappers/bio_ner_predictor/demo/clinical.py
@@ -0,0 +1,49 @@
# ***automatically_generated***
# ***source json:examples/clinical_pipeline/clinical_onto.json***
# flake8: noqa
# mypy: ignore-errors
# pylint: skip-file
"""
Automatically generated ontology clinical. Do not change manually.
"""

from dataclasses import dataclass
from forte.data.data_pack import DataPack
from forte.data.ontology.top import Annotation
from ft.onto.base_ontology import EntityMention

__all__ = [
"ClinicalEntityMention",
"Description",
"Body",
]


@dataclass
class ClinicalEntityMention(EntityMention):
"""
A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text.
"""

def __init__(self, pack: DataPack, begin: int, end: int):
super().__init__(pack, begin, end)


@dataclass
class Description(Annotation):
"""
A span based annotation `Description`, used to represent the description in a piece of clinical note.
"""

def __init__(self, pack: DataPack, begin: int, end: int):
super().__init__(pack, begin, end)


@dataclass
class Body(Annotation):
"""
A span based annotation `Body`, used to represent the actual content in a piece of clinical note.
"""

def __init__(self, pack: DataPack, begin: int, end: int):
super().__init__(pack, begin, end)
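A brief usage sketch for these generated types, mirroring how the MIMIC-III reader below marks each note; `description`, `delimiter`, and `text` are assumed to be the strings that make up one note:

pack = DataPack()
full_text = description + delimiter + text
pack.set_text(full_text)
Description(pack, 0, len(description))
Body(pack, len(description) + len(delimiter), len(full_text))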
80 changes: 80 additions & 0 deletions tests/wrappers/bio_ner_predictor/mimic3_note_reader.py
@@ -0,0 +1,80 @@
# Copyright 2021 The Forte Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import logging
from pathlib import Path
from typing import Any, Iterator, Union, List

from smart_open import open

from bio_ner_predictor.demo.clinical import Description, Body
from forte.data.data_pack import DataPack
from forte.data.base_reader import PackReader


class Mimic3DischargeNoteReader(PackReader):
"""This class is designed to read the discharge notes from MIMIC3 dataset
as plain text packs.
For more information for the dataset, visit:
https://mimic.physionet.org/
"""

def __init__(self):
super().__init__()
self.headers: List[str] = []
self.text_col = -1 # Default to be last column.
self.description_col = 0 # Default to be first column.
self.__note_count = 0 # Count number of notes processed.

def _collect( # type: ignore
self, mimic3_path: Union[Path, str]
) -> Iterator[Any]:
with open(mimic3_path) as f:
for r in csv.reader(f):
if 0 < self.configs.max_num_notes <= self.__note_count:
break
yield r

def _parse_pack(self, row: List[str]) -> Iterator[DataPack]:
if len(self.headers) == 0:
self.headers.extend(row)
for i, h in enumerate(self.headers):
if h == "TEXT":
self.text_col = i
logging.info("Text Column is %d", i)
if h == "DESCRIPTION":
self.description_col = i
logging.info("Description Column is %d", i)
else:
pack: DataPack = DataPack()
description: str = row[self.description_col]
text: str = row[self.text_col]
delimiter = "\n-----------------\n"
full_text = description + delimiter + text
pack.set_text(full_text)

Description(pack, 0, len(description))
Body(pack, len(description) + len(delimiter), len(full_text))
self.__note_count += 1
yield pack

@classmethod
def default_configs(cls):
config = super().default_configs()
# If this is set (>0), the reader will only read up to
# the number specified.
config["max_num_notes"] = -1
return config
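In a pipeline (as the test below does), the reader is configured with `max_num_notes` to cap how many CSV rows are turned into packs; a minimal sketch, assuming the D_ICD_DIAGNOSES.csv sample from this commit:

from forte.data.data_pack import DataPack
from forte.pipeline import Pipeline

pl = Pipeline[DataPack]()
pl.set_reader(Mimic3DischargeNoteReader(), config={"max_num_notes": 5})
pl.initialize()
for pack in pl.process_dataset("tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv"):
    print(pack.text[:80])  # first part of each reconstructed note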
93 changes: 93 additions & 0 deletions tests/wrappers/bio_ner_predictor_test.py
@@ -0,0 +1,93 @@
import sys
import time
import os
import yaml
from bio_ner_predictor.mimic3_note_reader import Mimic3DischargeNoteReader

from fortex.elastic import ElasticSearchPackIndexProcessor
from fortex.huggingface.bio_ner_predictor import BioBERTNERPredictor
from fortex.huggingface.transformers_processor import BERTTokenizer

from forte.common.configuration import Config
from forte.data.data_pack import DataPack
from forte.pipeline import Pipeline
from forte.processors.writers import PackIdJsonPackWriter
from fortex.nltk import NLTKSentenceSegmenter
import unittest
from ddt import ddt, data, unpack
from forte.data.data_utils import maybe_download
from ft.onto.base_ontology import EntityMention

@ddt
class TestBioNerPredictor(unittest.TestCase):
r"""Tests Elastic Indexer."""

def setUp(self):
self.pl = Pipeline[DataPack]()

script_dir_path = os.path.dirname(os.path.abspath(__file__))
data_folder = "bio_ner_predictor"
self.output_path = os.path.join(script_dir_path, data_folder, "test_case_output/")
config_path = os.path.join(script_dir_path, data_folder, "bio_ner_config.yml")
self.input_path = os.path.join(script_dir_path, data_folder, "D_ICD_DIAGNOSES.csv")
self.num_packs = 5

# download resources
urls = [
"https://drive.google.com/file/d/15RSfFkW9syQKtx-_fQ9KshN3BJ27Jf8t/"
"view?usp=sharing",
"https://drive.google.com/file/d/1Nh7D6Xam5JefdoSXRoL7S0DZK1d4i2UK/"
"view?usp=sharing",
"https://drive.google.com/file/d/1YWcI60lGKtTFH01Ai1HnwOKBsrFf2r29/"
"view?usp=sharing",
"https://drive.google.com/file/d/1ElHUEMPQIuWmV0GimroqFphbCvFKskYj/"
"view?usp=sharing",
"https://drive.google.com/file/d/1EhMXlieoEg-bGUbbQ2vN-iyNJvC4Dajl/"
"view?usp=sharing",
]

filenames = [
"config.json",
"pytorch_model.bin",
"special_tokens_map.json",
"tokenizer_config.json",
"vocab.txt",
]
model_path = os.path.abspath("resources/NCBI-disease")
config = yaml.safe_load(open(config_path, "r"))
config = Config(config, default_hparams=None)
config.BERTTokenizer.model_path = model_path
config.BioBERTNERPredictor.model_path = model_path
maybe_download(urls=urls, path=model_path, filenames=filenames)
self.assertTrue(os.path.exists(os.path.join(model_path, "pytorch_model.bin")))
self.pl.set_reader(
Mimic3DischargeNoteReader(), config={"max_num_notes": self.num_packs}
)
self.pl.add(NLTKSentenceSegmenter())
self.pl.add(BERTTokenizer(), config=config.BERTTokenizer)
self.pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
self.pl.add(ElasticSearchPackIndexProcessor())
self.pl.add(
PackIdJsonPackWriter(),
{
"output_dir": self.output_path,
"indent": 2,
"overwrite": True,
"drop_record": True,
"zip_pack": True,
},
)
self.pl.initialize()

def test_predict(self):
for idx, data_pack in enumerate(self.pl.process_dataset(self.input_path)):
ems = list(data_pack.get_data(EntityMention))
self.assertTrue(len(ems) > 0)

self.assertEqual(len(os.listdir(self.output_path)), self.num_packs)
for f_name in os.listdir(self.output_path):
os.remove(os.path.join(self.output_path, f_name))
os.removedirs(self.output_path)
