From 80cfe19926c0596edd13985581e8ca01a7be86ad Mon Sep 17 00:00:00 2001 From: feipenghe Date: Wed, 9 Feb 2022 13:14:04 -0800 Subject: [PATCH] Biological NER predictor pack() missing context parameter (#85) --- .github/workflows/main.yml | 5 +- .../fortex/huggingface/bio_ner_predictor.py | 15 +-- src/huggingface/setup.py | 1 + .../bio_ner_predictor/D_ICD_DIAGNOSES.csv | 10 ++ .../bio_ner_predictor/bio_ner_config.yml | 7 ++ .../bio_ner_predictor/demo/.generated | 0 .../bio_ner_predictor/demo/__init__.py | 1 + .../bio_ner_predictor/demo/clinical.py | 49 ++++++++++ .../bio_ner_predictor/mimic3_note_reader.py | 80 ++++++++++++++++ tests/wrappers/bio_ner_predictor_test.py | 93 +++++++++++++++++++ 10 files changed, 253 insertions(+), 8 deletions(-) create mode 100644 tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv create mode 100644 tests/wrappers/bio_ner_predictor/bio_ner_config.yml create mode 100644 tests/wrappers/bio_ner_predictor/demo/.generated create mode 100644 tests/wrappers/bio_ner_predictor/demo/__init__.py create mode 100644 tests/wrappers/bio_ner_predictor/demo/clinical.py create mode 100644 tests/wrappers/bio_ner_predictor/mimic3_note_reader.py create mode 100644 tests/wrappers/bio_ner_predictor_test.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ba21d7d..f926bf7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -90,6 +90,7 @@ jobs: - { dep: elastic, testfile: tests/wrappers/elastic_indexers_test.py } - { dep: faiss, testfile: tests/wrappers/faiss_indexers_test.py } - { dep: "huggingface nltk", extra: "'tensorflow>=2.5.0,<2.8.0'", testfile: tests/wrappers/huggingface } + - { dep: "huggingface elastic nltk", testfile: tests/wrappers/bio_ner_predictor_test.py} steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} @@ -127,7 +128,9 @@ jobs: cd forte pip install --progress-bar off . - name: Start elastic server if test elastic search - if: ${{ matrix.test-details.dep == 'elastic' }} + if: ${{ matrix.test-details.dep == 'elastic' || + contains(matrix.test-details.dep, 'elastic') + }} run: | wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.4.2-linux-x86_64.tar.gz tar -zxf elasticsearch-7.4.2-linux-x86_64.tar.gz diff --git a/src/huggingface/fortex/huggingface/bio_ner_predictor.py b/src/huggingface/fortex/huggingface/bio_ner_predictor.py index c60294f..16020be 100644 --- a/src/huggingface/fortex/huggingface/bio_ner_predictor.py +++ b/src/huggingface/fortex/huggingface/bio_ner_predictor.py @@ -1,11 +1,13 @@ # pylint: disable=logging-fstring-interpolation from typing import Dict, List, Optional, Tuple, Any, Set - +import logging import numpy as np import torch + from forte.common.configuration import Config from forte.common.resources import Resources from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation from forte.processors.base.batch_processor import RequestPackingProcessor from ft.onto.base_ontology import EntityMention, Subword from transformers import ( @@ -191,15 +193,16 @@ def pack( self, data_pack: DataPack, output_dict: Optional[Dict[str, Dict[str, List[Any]]]] = None, + context: Optional[Annotation] = None, ): """ - Write the prediction results back to datapack. by writing the predicted - ner to the original subwords and convert predictions to something that - makes sense in a word-by-word segmentation + Write the prediction results back to datapack by aggregating subwords + into named entity mentions. 
""" - if output_dict is None: return + if context is not None: + logging.warning("context parameter is not used in pack() method.") for i in range(len(output_dict["Subword"]["tid"])): tids = output_dict["Subword"]["tid"][i] @@ -211,7 +214,6 @@ def pack( for idx, (label, tid) in enumerate(zip(labels, tids)) if label not in self.ft_configs.ignore_labels ] - entity_groups = self._compose_entities(entities, data_pack, tids) # Add NER tags and create EntityMention ontologies. for first_idx, last_idx in entity_groups: @@ -220,7 +222,6 @@ def pack( last_token: Subword = data_pack.get_entry(tids[last_idx]) end = last_token.span.end - entity = EntityMention(data_pack, begin, end) entity.ner_type = self.ft_configs.ner_type diff --git a/src/huggingface/setup.py b/src/huggingface/setup.py index 4922f8d..b699411 100644 --- a/src/huggingface/setup.py +++ b/src/huggingface/setup.py @@ -31,6 +31,7 @@ "forte==0.1.2", "more-itertools>=8.0.0", "transformers == 4.2.2", + "numpy == 1.19.5", ], classifiers=[ "Intended Audience :: Developers", diff --git a/tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv b/tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv new file mode 100644 index 0000000..017326e --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/D_ICD_DIAGNOSES.csv @@ -0,0 +1,10 @@ +row_id,icd9_code,short_title,long_title +1,01716,Erythem nod tb-oth test,"Erythema nodosum with hypersensitivity reaction in tuberculosis, tubercle bacilli not found by bacteriological or histological examination, but tuberculosis confirmed by other methods [inoculation of animals]" +378,0879,Relapsing fever NOS,"Relapsing fever, unspecified" +379,0880,Bartonellosis,Bartonellosis +380,08881,Lyme disease,Lyme Disease +392,0905,Late congen syph symptom,"Other late congenital syphilis, symptomatic" +420,09324,Syphil pulmonary valve,Syphilitic endocarditis of pulmonary valve +434,09486,Syphil acoustic neuritis,Syphilitic acoustic neuritis +463,09830,Chr gc upper gu NOS,"Chronic gonococcal infection of upper genitourinary tract, site unspecified" +523,04521,Nonparalyt polio-type 1,"Acute nonparalytic poliomyelitis, poliovirus type I" \ No newline at end of file diff --git a/tests/wrappers/bio_ner_predictor/bio_ner_config.yml b/tests/wrappers/bio_ner_predictor/bio_ner_config.yml new file mode 100644 index 0000000..4672a70 --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/bio_ner_config.yml @@ -0,0 +1,7 @@ +BERTTokenizer: + model_path: "resources/NCBI-disease" + +BioBERTNERPredictor: + model_path: "resources/NCBI-disease" + ner_type: "DISEASE" + ignore_labels: ["O"] \ No newline at end of file diff --git a/tests/wrappers/bio_ner_predictor/demo/.generated b/tests/wrappers/bio_ner_predictor/demo/.generated new file mode 100644 index 0000000..e69de29 diff --git a/tests/wrappers/bio_ner_predictor/demo/__init__.py b/tests/wrappers/bio_ner_predictor/demo/__init__.py new file mode 100644 index 0000000..49ecbbf --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/demo/__init__.py @@ -0,0 +1 @@ +# ***automatically_generated*** diff --git a/tests/wrappers/bio_ner_predictor/demo/clinical.py b/tests/wrappers/bio_ner_predictor/demo/clinical.py new file mode 100644 index 0000000..68541b4 --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/demo/clinical.py @@ -0,0 +1,49 @@ +# ***automatically_generated*** +# ***source json:examples/clinical_pipeline/clinical_onto.json*** +# flake8: noqa +# mypy: ignore-errors +# pylint: skip-file +""" +Automatically generated ontology clinical. Do not change manually. 
+""" + +from dataclasses import dataclass +from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation +from ft.onto.base_ontology import EntityMention + +__all__ = [ + "ClinicalEntityMention", + "Description", + "Body", +] + + +@dataclass +class ClinicalEntityMention(EntityMention): + """ + A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Description(Annotation): + """ + A span based annotation `Description`, used to represent the description in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Body(Annotation): + """ + A span based annotation `Body`, used to represent the actual content in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) diff --git a/tests/wrappers/bio_ner_predictor/mimic3_note_reader.py b/tests/wrappers/bio_ner_predictor/mimic3_note_reader.py new file mode 100644 index 0000000..1f43f5c --- /dev/null +++ b/tests/wrappers/bio_ner_predictor/mimic3_note_reader.py @@ -0,0 +1,80 @@ +# Copyright 2021 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import logging +from pathlib import Path +from typing import Any, Iterator, Union, List + +from smart_open import open + +from bio_ner_predictor.demo.clinical import Description, Body +from forte.data.data_pack import DataPack +from forte.data.base_reader import PackReader + + +class Mimic3DischargeNoteReader(PackReader): + """This class is designed to read the discharge notes from MIMIC3 dataset + as plain text packs. + + For more information for the dataset, visit: + https://mimic.physionet.org/ + """ + + def __init__(self): + super().__init__() + self.headers: List[str] = [] + self.text_col = -1 # Default to be last column. + self.description_col = 0 # Default to be first column. + self.__note_count = 0 # Count number of notes processed. 
+ + def _collect( # type: ignore + self, mimic3_path: Union[Path, str] + ) -> Iterator[Any]: + with open(mimic3_path) as f: + for r in csv.reader(f): + if 0 < self.configs.max_num_notes <= self.__note_count: + break + yield r + + def _parse_pack(self, row: List[str]) -> Iterator[DataPack]: + if len(self.headers) == 0: + self.headers.extend(row) + for i, h in enumerate(self.headers): + if h == "TEXT": + self.text_col = i + logging.info("Text Column is %d", i) + if h == "DESCRIPTION": + self.description_col = i + logging.info("Description Column is %d", i) + else: + pack: DataPack = DataPack() + description: str = row[self.description_col] + text: str = row[self.text_col] + delimiter = "\n-----------------\n" + full_text = description + delimiter + text + pack.set_text(full_text) + + Description(pack, 0, len(description)) + Body(pack, len(description) + len(delimiter), len(full_text)) + self.__note_count += 1 + yield pack + + @classmethod + def default_configs(cls): + config = super().default_configs() + # If this is set (>0), the reader will only read up to + # the number specified. + config["max_num_notes"] = -1 + return config diff --git a/tests/wrappers/bio_ner_predictor_test.py b/tests/wrappers/bio_ner_predictor_test.py new file mode 100644 index 0000000..e548e2c --- /dev/null +++ b/tests/wrappers/bio_ner_predictor_test.py @@ -0,0 +1,93 @@ +import sys +import time +import os +import yaml +from bio_ner_predictor.mimic3_note_reader import Mimic3DischargeNoteReader + +from fortex.elastic import ElasticSearchPackIndexProcessor +from fortex.huggingface.bio_ner_predictor import BioBERTNERPredictor +from fortex.huggingface.transformers_processor import BERTTokenizer + +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.pipeline import Pipeline +from forte.processors.writers import PackIdJsonPackWriter +from fortex.nltk import NLTKSentenceSegmenter +import unittest +from ddt import ddt, data, unpack +from forte.data.data_utils import maybe_download +from ft.onto.base_ontology import EntityMention + +@ddt +class TestBioNerPredictor(unittest.TestCase): + r"""Tests Elastic Indexer.""" + + def setUp(self): + self.pl = Pipeline[DataPack]() + + script_dir_path = os.path.dirname(os.path.abspath(__file__)) + data_folder = "bio_ner_predictor" + self.output_path = os.path.join(script_dir_path,data_folder, "test_case_output/") + config_path = os.path.join(script_dir_path,data_folder,"bio_ner_config.yml") + self.input_path = os.path.join(script_dir_path,data_folder, "D_ICD_DIAGNOSES.csv") + self.num_packs = 5 + + # download resources + urls = [ + "https://drive.google.com/file/d/15RSfFkW9syQKtx-_fQ9KshN3BJ27Jf8t/" + "view?usp=sharing", + "https://drive.google.com/file/d/1Nh7D6Xam5JefdoSXRoL7S0DZK1d4i2UK/" + "view?usp=sharing", + "https://drive.google.com/file/d/1YWcI60lGKtTFH01Ai1HnwOKBsrFf2r29/" + "view?usp=sharing", + "https://drive.google.com/file/d/1ElHUEMPQIuWmV0GimroqFphbCvFKskYj/" + "view?usp=sharing", + "https://drive.google.com/file/d/1EhMXlieoEg-bGUbbQ2vN-iyNJvC4Dajl/" + "view?usp=sharing", + ] + + filenames = [ + "config.json", + "pytorch_model.bin", + "special_tokens_map.json", + "tokenizer_config.json", + "vocab.txt", + ] + model_path = os.path.abspath("resources/NCBI-disease") + config = yaml.safe_load(open(config_path, "r")) + config = Config(config, default_hparams=None) + config.BERTTokenizer.model_path = model_path + config.BioBERTNERPredictor.model_path = model_path + maybe_download(urls=urls, path=model_path, filenames=filenames) + 
        self.assertTrue(os.path.exists(os.path.join(model_path, "pytorch_model.bin")))
        self.pl.set_reader(
            Mimic3DischargeNoteReader(), config={"max_num_notes": self.num_packs}
        )
        self.pl.add(NLTKSentenceSegmenter())

        self.pl.add(BERTTokenizer(), config=config.BERTTokenizer)
        self.pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor)
        self.pl.add(ElasticSearchPackIndexProcessor())
        self.pl.add(
            PackIdJsonPackWriter(),
            {
                "output_dir": self.output_path,
                "indent": 2,
                "overwrite": True,
                "drop_record": True,
                "zip_pack": True,
            },
        )
        self.pl.initialize()

    def test_predict(self):
        for idx, data_pack in enumerate(self.pl.process_dataset(self.input_path)):
            ems = list(data_pack.get_data(EntityMention))
            self.assertTrue(len(ems) > 0)

        self.assertEqual(len(os.listdir(self.output_path)), self.num_packs)
        for f_name in os.listdir(self.output_path):
            os.remove(os.path.join(self.output_path, f_name))
        os.removedirs(self.output_path)
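
Usage note: the test above doubles as a reference pipeline. Below is a minimal standalone sketch built from the same components, not part of the patch itself. It assumes the NCBI-disease model files have already been downloaded to resources/NCBI-disease (for example via maybe_download() as in setUp), that D_ICD_DIAGNOSES.csv is available in the working directory, and that the script is run from tests/wrappers so the bio_ner_predictor package is importable; the ElasticSearch indexer and pack writer are dropped to keep it self-contained, and the inline configs mirror bio_ner_config.yml.

    from forte.data.data_pack import DataPack
    from forte.pipeline import Pipeline
    from fortex.huggingface.bio_ner_predictor import BioBERTNERPredictor
    from fortex.huggingface.transformers_processor import BERTTokenizer
    from fortex.nltk import NLTKSentenceSegmenter
    from ft.onto.base_ontology import EntityMention

    from bio_ner_predictor.mimic3_note_reader import Mimic3DischargeNoteReader

    MODEL_PATH = "resources/NCBI-disease"  # assumed location of the downloaded model

    pl = Pipeline[DataPack]()
    # Read at most 5 discharge notes from the MIMIC3-style CSV.
    pl.set_reader(Mimic3DischargeNoteReader(), config={"max_num_notes": 5})
    pl.add(NLTKSentenceSegmenter())
    # Same settings as bio_ner_config.yml, inlined for a standalone script.
    pl.add(BERTTokenizer(), config={"model_path": MODEL_PATH})
    pl.add(
        BioBERTNERPredictor(),
        config={
            "model_path": MODEL_PATH,
            "ner_type": "DISEASE",
            "ignore_labels": ["O"],
        },
    )
    pl.initialize()

    for pack in pl.process_dataset("D_ICD_DIAGNOSES.csv"):
        for em in pack.get(EntityMention):
            print(em.text, em.ner_type)

Note also that BioBERTNERPredictor.pack() now accepts an optional context argument so that its signature matches the batch-processor interface; the predictor does not use it yet and only logs a warning when one is supplied.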