Remove Spacy hard dependency on forte medical ontology (#96)

* phase 1 spacy changes * more changes * spacy now works with config onto * final updates * updated default medical onto types * corrected default type * resolved PR comments * cleanuip * added umls_link case * cleanup * formatting * formatting * more formatting * more formatting * updated elasticsearch version in setup * updated elasticsearch version in setup * handle , in tokenize test * corrected typo * resolved test case issue * test issues * more test issues * checking with umls test case commented * checking with umls test commented * umls_link test case * more on test case * test mesh db in entity linking * add testing config parameter * cleanup * added param details * formatting * cleanup
asyml · Mar 11, 2022 · 1af6a3d · 1af6a3d
1 parent 2c1b268
commit 1af6a3d
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 27 deletions.
diff --git a/src/elastic/setup.py b/src/elastic/setup.py
@@ -29,8 +29,8 @@
     platforms="any",
     install_requires=[
         "forte==0.1.2",
-        "elasticsearch>=7.5.1; python_version<'3.9.0'",
-        "elasticsearch>=7.14.0; python_version>='3.9.0'",
+        "elasticsearch>=7.5.1, <8.0.0; python_version<'3.9.0'",
+        "elasticsearch>=7.14.0, <8.0.0; python_version>='3.9.0'",
     ],
     classifiers=[
         "Intended Audience :: Developers",

diff --git a/src/spacy/fortex/spacy/spacy_processors.py b/src/spacy/fortex/spacy/spacy_processors.py
@@ -25,10 +25,10 @@
 from forte.data.base_pack import PackType
 from forte.data.batchers import ProcessingBatcher, FixedSizeDataPackBatcher
 from forte.data.data_pack import DataPack
-from forte.data.ontology import Annotation
+from forte.data.ontology import Annotation, Generics
 from forte.processors.base import PackProcessor, FixedSizeBatchProcessor
+from forte.utils import get_class
 from ft.onto.base_ontology import EntityMention, Sentence, Token, Dependency
-from ftx.medical import MedicalEntityMention, UMLSConceptLink
 
 __all__ = [
     "SpacyProcessor",
@@ -95,6 +95,32 @@ def validate_spacy_configs(configs: Config):
                 "'sentence' is necessary in configs.processors for 'tokenize'."
             )
 
+    if "umls_link" in configs.processors:
+        if not (configs.medical_onto_type and configs.umls_onto_type):
+            raise ProcessorConfigError(
+                "Please specify medical and umls link ontology types!"
+            )
+
+        entry_type = get_class(configs.medical_onto_type)
+        if not isinstance(entry_type, Annotation) and not issubclass(
+            entry_type, Annotation
+        ):
+            raise ProcessorConfigError(
+                "Config parameter {} must be an Annotation type.".format(
+                    configs.medical_onto_type
+                )
+            )
+
+        entry_type = get_class(configs.umls_onto_type)
+        if not isinstance(entry_type, Generics) and not issubclass(
+            entry_type, Generics
+        ):
+            raise ProcessorConfigError(
+                "Config parameter {} must be a Generic type.".format(
+                    configs.umls_onto_type
+                )
+            )
+
 
 def set_up_pipe(nlp: Language, configs: Config):
     config2component = (
@@ -121,7 +147,8 @@ def set_up_pipe(nlp: Language, configs: Config):
             # pylint: disable=import-outside-toplevel
             from scispacy.linking import EntityLinker
 
-            linker = EntityLinker(resolve_abbreviations=True, name="umls")
+            name = "mesh" if configs.testing is True else "umls"
+            linker = EntityLinker(resolve_abbreviations=True, name=name)
             nlp.add_pipe(linker)
 
     # Remove some components to save some time.
@@ -252,7 +279,13 @@ def pack(
             # Record medical entity linking results.
             if "umls_link" in self.configs.processors:
                 linker = self.nlp.get_pipe("EntityLinker")  # type: ignore
-                process_umls_entity_linking(linker, result, pack)
+                process_umls_entity_linking(
+                    linker,
+                    result,
+                    self.configs.medical_onto_type,
+                    self.configs.umls_onto_type,
+                    pack,
+                )
 
     def record(self, record_meta: Dict[str, Set[str]]):
         r"""Method to add output type record of current processor
@@ -271,6 +304,11 @@ def default_configs(cls) -> Dict[str, Any]:
         Specify additional parameters for SpaCy processor.
 
         The available parameters are:
+        - `medical_onto_type`: defines which entry type in the input pack
+            that the medical entity mentions should be saved as output.
+
+        - `umls_onto_type`: defines which entry type in the input pack
+            that the UMLS concept links should be saved as part of output.
 
         - `batcher.batch_size`: max size of the batch (in terms of number of
            data packs).
@@ -308,8 +346,14 @@ def default_configs(cls) -> Dict[str, Any]:
         - `num_processes`: number of processes to run when using `spacy.pipe`.
           Default is 1. This will be passed directly to the `n_process` option.
 
+        - `testing`: states whether or not the processor is being used in a
+         test case.
+
         """
         return {
+            "medical_onto_type": "ftx.medical.clinical_ontology."
+            + "MedicalEntityMention",
+            "umls_onto_type": "ftx.medical.clinical_ontology.UMLSConceptLink",
             "batcher": {
                 "batch_size": 1000,
             },
@@ -319,6 +363,7 @@ def default_configs(cls) -> Dict[str, Any]:
             "prefer_gpu": False,
             "gpu_id": 0,
             "num_processes": 1,
+            "testing": False,
         }
 
 
@@ -394,6 +439,12 @@ def default_configs(cls):
           Additional values for this list further includes:
           `ner` for named entity and `dep` for dependency parsing.
 
+        - `medical_onto_type`: defines which entry type in the input pack
+            that the medical entity mentions should be saved as output.
+
+        - `umls_onto_type`: defines which entry type in the input pack
+            that the UMLS concept links should be saved as part of output.
+
         - `lang`: language model, default is spaCy `en_core_web_sm` model.
           The pipeline support spaCy and ScispaCy models.
           A list of available spaCy models could be found at
@@ -414,14 +465,21 @@ def default_configs(cls):
         - `gpu_id`: the GPU device index to use when GPU is enabled. Default
           is 0.
 
+        - `testing`: states whether or not the processor is being used in a
+         test case.
+
         Returns: A dictionary with the default config for this processor.
         """
         return {
             "processors": ["sentence", "tokenize", "pos", "lemma"],
+            "medical_onto_type": "ftx.medical.clinical_ontology"
+            + ".MedicalEntityMention",
+            "umls_onto_type": "ftx.medical.clinical_ontology.UMLSConceptLink",
             "lang": "en_core_web_sm",
             "require_gpu": False,
             "prefer_gpu": False,
             "gpu_id": 0,
+            "testing": False,
         }
 
     def _process(self, input_pack: DataPack):
@@ -452,7 +510,13 @@ def _process(self, input_pack: DataPack):
         # Record medical entity linking results.
         if "umls_link" in self.configs.processors:
             linker = self.nlp.get_pipe("EntityLinker")
-            process_umls_entity_linking(linker, result, input_pack)
+            process_umls_entity_linking(
+                linker,
+                result,
+                self.configs.medical_onto_type,
+                self.configs.umls_onto_type,
+                input_pack,
+            )
 
     def record(self, record_meta: Dict[str, Set[str]]):
         r"""Method to add output type record of current processor
@@ -480,8 +544,8 @@ def set_records(record_meta: Dict[str, Set[str]], configs: Config):
     if "dep" in configs.processors:
         record_meta["ft.onto.base_ontology.Dependency"] = {"dep_label"}
     if "umls_link" in configs.processors:
-        record_meta["onto.medical.MedicalEntityMention"] = {"ner_type"}
-        record_meta["onto.medical.UMLSConceptLink"] = {
+        record_meta[configs.medical_onto_type] = {"ner_type", "umls_entities"}
+        record_meta[configs.umls_onto_type] = {
             "cui",
             "score",
             "name",
@@ -558,7 +622,9 @@ def process_ner(result, input_pack: DataPack):
         entity.ner_type = item.label_
 
 
-def process_umls_entity_linking(linker, result, input_pack: DataPack):
+def process_umls_entity_linking(
+    linker, result, medical_onto_type, umls_onto_type, input_pack: DataPack
+):
     """
     Perform UMLS medical entity linking with EntityLinker, and store medical
     entity mentions and UMLS concepts.
@@ -575,23 +641,30 @@ def process_umls_entity_linking(linker, result, input_pack: DataPack):
 
     # get medical entity mentions and UMLS concepts
     for item in medical_entities:
-        entity = MedicalEntityMention(
-            input_pack, item.start_char, item.end_char
+        medical_entity_name = get_class(medical_onto_type)
+        medical_entity = medical_entity_name(
+            pack=input_pack,
+            begin=item.start_char,
+            end=item.end_char,
         )
-        entity.ner_type = item.label_
-
-        for umls_ent in item._.kb_ents:
-            cui = umls_ent[0]
-            score = str(umls_ent[1])
-
-            cui_entity = linker.kb.cui_to_entity[cui]
 
-            umls = UMLSConceptLink(input_pack)
-            umls.cui = cui
-            umls.score = score
-            umls.name = cui_entity.canonical_name
-            umls.definition = cui_entity.definition
-            umls.tuis = cui_entity.types
-            umls.aliases = cui_entity.aliases
+        setattr(medical_entity, "ner_type", item.label_)
+        umls_entity_name = get_class(umls_onto_type)
 
-            entity.umls_entities.append(umls)
+        for umls_ent in item._.kb_ents:
+            cui_entity = linker.kb.cui_to_entity[umls_ent[0]]
+            umls = {}
+            umls["cui"] = umls_ent[0]
+            umls["score"] = str(umls_ent[1])
+            umls["name"] = cui_entity.canonical_name
+            umls["definition"] = cui_entity.definition
+            umls["tuis"] = cui_entity.types
+            umls["aliases"] = cui_entity.aliases
+
+            umls_entity = umls_entity_name(pack=input_pack)
+
+            for attribute, _ in vars(umls_entity).items():
+                if attribute in umls.keys():
+                    setattr(umls_entity, attribute, umls[attribute])
+
+            getattr(medical_entity, "umls_entities").append(umls_entity)
diff --git a/tests/wrappers/spacy_processors_test.py b/tests/wrappers/spacy_processors_test.py
@@ -25,6 +25,7 @@
 from forte.data.data_pack import DataPack
 from forte.data.readers import StringReader
 from forte.pipeline import Pipeline
+from forte.utils import get_class
 from ft.onto.base_ontology import Token, EntityMention, Dependency
 
 from fortex.spacy import SpacyProcessor, SpacyBatchedProcessor
@@ -142,9 +143,12 @@ def test_spacy_batch_pipeline(self, value):
         pipeline.set_reader(StringReader())
         config = {
             "processors": value,
+            "medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
+            "umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
             "lang": "en_core_web_sm",
             # Language code for the language to build the Pipeline
             "batcher": {"batch_size": 2},
+            "testing": True,
         }
         pipeline.add(SpacyBatchedProcessor(), config)
         pipeline.initialize()
@@ -178,8 +182,11 @@ def test_spacy_variation_pipeline(self, value):
 
         config = {
             "processors": value,
+            "medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
+            "umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
             "lang": "en_core_web_sm",
             # Language code for the language to build the Pipeline
+            "testing": True,
         }
         pipeline.add(SpacyProcessor(), config=config)
         pipeline.initialize()
@@ -188,6 +195,7 @@ def test_spacy_variation_pipeline(self, value):
             "This tool is called Forte.",
             "The goal of this project to help you build NLP pipelines.",
             "NLP has never been made this easy before.",
+            "Head CT revealed no lesions.",
         ]
         document = " ".join(sentences)
         pack: DataPack = pipeline.process(document)
@@ -209,14 +217,38 @@ def test_spacy_processor_with_invalid_config(self, processor):
 
         config = {
             "processors": processor,
+            "medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
+            "umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
             "lang": "en_core_web_sm",
             # Language code for the language to build the Pipeline
+            "testing": True,
         }
         pipeline.add(SpacyProcessor(), config=config)
 
         with self.assertRaises(ProcessorConfigError):
             pipeline.initialize()
 
+    @data(
+        ["umls_link"],
+    )
+    def test_spacy_processor_for_umls_link(self, processor):
+        pipeline = Pipeline[DataPack]()
+        pipeline.set_reader(StringReader())
+
+        config = {
+            "processors": processor,
+            "medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
+            "umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
+            "lang": "en_core_web_sm",
+            # Language code for the language to build the Pipeline
+            "testing": True,
+        }
+        pipeline.add(SpacyProcessor(), config=config)
+
+        try:
+            pipeline.initialize()
+        except ProcessorConfigError:
+            self.fail("umls_link processor failing in Spacy, check config")
 
 if __name__ == "__main__":
     unittest.main()