Skip to content

Commit

Permalink
Remove Spacy hard dependency on forte medical ontology (#96)
Browse files Browse the repository at this point in the history
* phase 1 spacy changes

* more changes

* spacy now works with config onto

* final updates

* updated default medical onto types

* corrected default type

* resolved PR comments

* cleanuip

* added umls_link case

* cleanup

* formatting

* formatting

* more formatting

* more formatting

* updated elasticsearch version in setup

* updated elasticsearch version in setup

* handle , in tokenize test

* corrected typo

* resolved test case issue

* test issues

* more test issues

* checking with umls test case commented

* checking with umls test commented

* umls_link test case

* more on test case

* test mesh db in entity linking

* add testing config parameter

* cleanup

* added param details

* formatting

* cleanup
  • Loading branch information
Piyush13y authored Mar 11, 2022
1 parent 2c1b268 commit 1af6a3d
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 27 deletions.
4 changes: 2 additions & 2 deletions src/elastic/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
platforms="any",
install_requires=[
"forte==0.1.2",
"elasticsearch>=7.5.1; python_version<'3.9.0'",
"elasticsearch>=7.14.0; python_version>='3.9.0'",
"elasticsearch>=7.5.1, <8.0.0; python_version<'3.9.0'",
"elasticsearch>=7.14.0, <8.0.0; python_version>='3.9.0'",
],
classifiers=[
"Intended Audience :: Developers",
Expand Down
123 changes: 98 additions & 25 deletions src/spacy/fortex/spacy/spacy_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
from forte.data.base_pack import PackType
from forte.data.batchers import ProcessingBatcher, FixedSizeDataPackBatcher
from forte.data.data_pack import DataPack
from forte.data.ontology import Annotation
from forte.data.ontology import Annotation, Generics
from forte.processors.base import PackProcessor, FixedSizeBatchProcessor
from forte.utils import get_class
from ft.onto.base_ontology import EntityMention, Sentence, Token, Dependency
from ftx.medical import MedicalEntityMention, UMLSConceptLink

__all__ = [
"SpacyProcessor",
Expand Down Expand Up @@ -95,6 +95,32 @@ def validate_spacy_configs(configs: Config):
"'sentence' is necessary in configs.processors for 'tokenize'."
)

if "umls_link" in configs.processors:
if not (configs.medical_onto_type and configs.umls_onto_type):
raise ProcessorConfigError(
"Please specify medical and umls link ontology types!"
)

entry_type = get_class(configs.medical_onto_type)
if not isinstance(entry_type, Annotation) and not issubclass(
entry_type, Annotation
):
raise ProcessorConfigError(
"Config parameter {} must be an Annotation type.".format(
configs.medical_onto_type
)
)

entry_type = get_class(configs.umls_onto_type)
if not isinstance(entry_type, Generics) and not issubclass(
entry_type, Generics
):
raise ProcessorConfigError(
"Config parameter {} must be a Generic type.".format(
configs.umls_onto_type
)
)


def set_up_pipe(nlp: Language, configs: Config):
config2component = (
Expand All @@ -121,7 +147,8 @@ def set_up_pipe(nlp: Language, configs: Config):
# pylint: disable=import-outside-toplevel
from scispacy.linking import EntityLinker

linker = EntityLinker(resolve_abbreviations=True, name="umls")
name = "mesh" if configs.testing is True else "umls"
linker = EntityLinker(resolve_abbreviations=True, name=name)
nlp.add_pipe(linker)

# Remove some components to save some time.
Expand Down Expand Up @@ -252,7 +279,13 @@ def pack(
# Record medical entity linking results.
if "umls_link" in self.configs.processors:
linker = self.nlp.get_pipe("EntityLinker") # type: ignore
process_umls_entity_linking(linker, result, pack)
process_umls_entity_linking(
linker,
result,
self.configs.medical_onto_type,
self.configs.umls_onto_type,
pack,
)

def record(self, record_meta: Dict[str, Set[str]]):
r"""Method to add output type record of current processor
Expand All @@ -271,6 +304,11 @@ def default_configs(cls) -> Dict[str, Any]:
Specify additional parameters for SpaCy processor.
The available parameters are:
- `medical_onto_type`: defines which entry type in the input pack
that the medical entity mentions should be saved as output.
- `umls_onto_type`: defines which entry type in the input pack
that the UMLS concept links should be saved as part of output.
- `batcher.batch_size`: max size of the batch (in terms of number of
data packs).
Expand Down Expand Up @@ -308,8 +346,14 @@ def default_configs(cls) -> Dict[str, Any]:
- `num_processes`: number of processes to run when using `spacy.pipe`.
Default is 1. This will be passed directly to the `n_process` option.
- `testing`: states whether or not the processor is being used in a
test case.
"""
return {
"medical_onto_type": "ftx.medical.clinical_ontology."
+ "MedicalEntityMention",
"umls_onto_type": "ftx.medical.clinical_ontology.UMLSConceptLink",
"batcher": {
"batch_size": 1000,
},
Expand All @@ -319,6 +363,7 @@ def default_configs(cls) -> Dict[str, Any]:
"prefer_gpu": False,
"gpu_id": 0,
"num_processes": 1,
"testing": False,
}


Expand Down Expand Up @@ -394,6 +439,12 @@ def default_configs(cls):
Additional values for this list further includes:
`ner` for named entity and `dep` for dependency parsing.
- `medical_onto_type`: defines which entry type in the input pack
that the medical entity mentions should be saved as output.
- `umls_onto_type`: defines which entry type in the input pack
that the UMLS concept links should be saved as part of output.
- `lang`: language model, default is spaCy `en_core_web_sm` model.
The pipeline support spaCy and ScispaCy models.
A list of available spaCy models could be found at
Expand All @@ -414,14 +465,21 @@ def default_configs(cls):
- `gpu_id`: the GPU device index to use when GPU is enabled. Default
is 0.
- `testing`: states whether or not the processor is being used in a
test case.
Returns: A dictionary with the default config for this processor.
"""
return {
"processors": ["sentence", "tokenize", "pos", "lemma"],
"medical_onto_type": "ftx.medical.clinical_ontology"
+ ".MedicalEntityMention",
"umls_onto_type": "ftx.medical.clinical_ontology.UMLSConceptLink",
"lang": "en_core_web_sm",
"require_gpu": False,
"prefer_gpu": False,
"gpu_id": 0,
"testing": False,
}

def _process(self, input_pack: DataPack):
Expand Down Expand Up @@ -452,7 +510,13 @@ def _process(self, input_pack: DataPack):
# Record medical entity linking results.
if "umls_link" in self.configs.processors:
linker = self.nlp.get_pipe("EntityLinker")
process_umls_entity_linking(linker, result, input_pack)
process_umls_entity_linking(
linker,
result,
self.configs.medical_onto_type,
self.configs.umls_onto_type,
input_pack,
)

def record(self, record_meta: Dict[str, Set[str]]):
r"""Method to add output type record of current processor
Expand Down Expand Up @@ -480,8 +544,8 @@ def set_records(record_meta: Dict[str, Set[str]], configs: Config):
if "dep" in configs.processors:
record_meta["ft.onto.base_ontology.Dependency"] = {"dep_label"}
if "umls_link" in configs.processors:
record_meta["onto.medical.MedicalEntityMention"] = {"ner_type"}
record_meta["onto.medical.UMLSConceptLink"] = {
record_meta[configs.medical_onto_type] = {"ner_type", "umls_entities"}
record_meta[configs.umls_onto_type] = {
"cui",
"score",
"name",
Expand Down Expand Up @@ -558,7 +622,9 @@ def process_ner(result, input_pack: DataPack):
entity.ner_type = item.label_


def process_umls_entity_linking(linker, result, input_pack: DataPack):
def process_umls_entity_linking(
linker, result, medical_onto_type, umls_onto_type, input_pack: DataPack
):
"""
Perform UMLS medical entity linking with EntityLinker, and store medical
entity mentions and UMLS concepts.
Expand All @@ -575,23 +641,30 @@ def process_umls_entity_linking(linker, result, input_pack: DataPack):

# get medical entity mentions and UMLS concepts
for item in medical_entities:
entity = MedicalEntityMention(
input_pack, item.start_char, item.end_char
medical_entity_name = get_class(medical_onto_type)
medical_entity = medical_entity_name(
pack=input_pack,
begin=item.start_char,
end=item.end_char,
)
entity.ner_type = item.label_

for umls_ent in item._.kb_ents:
cui = umls_ent[0]
score = str(umls_ent[1])

cui_entity = linker.kb.cui_to_entity[cui]

umls = UMLSConceptLink(input_pack)
umls.cui = cui
umls.score = score
umls.name = cui_entity.canonical_name
umls.definition = cui_entity.definition
umls.tuis = cui_entity.types
umls.aliases = cui_entity.aliases
setattr(medical_entity, "ner_type", item.label_)
umls_entity_name = get_class(umls_onto_type)

entity.umls_entities.append(umls)
for umls_ent in item._.kb_ents:
cui_entity = linker.kb.cui_to_entity[umls_ent[0]]
umls = {}
umls["cui"] = umls_ent[0]
umls["score"] = str(umls_ent[1])
umls["name"] = cui_entity.canonical_name
umls["definition"] = cui_entity.definition
umls["tuis"] = cui_entity.types
umls["aliases"] = cui_entity.aliases

umls_entity = umls_entity_name(pack=input_pack)

for attribute, _ in vars(umls_entity).items():
if attribute in umls.keys():
setattr(umls_entity, attribute, umls[attribute])

getattr(medical_entity, "umls_entities").append(umls_entity)
32 changes: 32 additions & 0 deletions tests/wrappers/spacy_processors_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from forte.data.data_pack import DataPack
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte.utils import get_class
from ft.onto.base_ontology import Token, EntityMention, Dependency

from fortex.spacy import SpacyProcessor, SpacyBatchedProcessor
Expand Down Expand Up @@ -142,9 +143,12 @@ def test_spacy_batch_pipeline(self, value):
pipeline.set_reader(StringReader())
config = {
"processors": value,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"batcher": {"batch_size": 2},
"testing": True,
}
pipeline.add(SpacyBatchedProcessor(), config)
pipeline.initialize()
Expand Down Expand Up @@ -178,8 +182,11 @@ def test_spacy_variation_pipeline(self, value):

config = {
"processors": value,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"testing": True,
}
pipeline.add(SpacyProcessor(), config=config)
pipeline.initialize()
Expand All @@ -188,6 +195,7 @@ def test_spacy_variation_pipeline(self, value):
"This tool is called Forte.",
"The goal of this project to help you build NLP pipelines.",
"NLP has never been made this easy before.",
"Head CT revealed no lesions.",
]
document = " ".join(sentences)
pack: DataPack = pipeline.process(document)
Expand All @@ -209,14 +217,38 @@ def test_spacy_processor_with_invalid_config(self, processor):

config = {
"processors": processor,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"testing": True,
}
pipeline.add(SpacyProcessor(), config=config)

with self.assertRaises(ProcessorConfigError):
pipeline.initialize()

@data(
["umls_link"],
)
def test_spacy_processor_for_umls_link(self, processor):
pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())

config = {
"processors": processor,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"testing": True,
}
pipeline.add(SpacyProcessor(), config=config)

try:
pipeline.initialize()
except ProcessorConfigError:
self.fail("umls_link processor failing in Spacy, check config")

if __name__ == "__main__":
unittest.main()

0 comments on commit 1af6a3d

Please sign in to comment.