Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Spacy hard dependency on forte medical ontology #96

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
ae982fd
phase 1 spacy changes
Piyush13y Mar 3, 2022
70e27d2
more changes
Piyush13y Mar 3, 2022
e31c1e3
spacy now works with config onto
Piyush13y Mar 3, 2022
3edb1cf
final updates
Piyush13y Mar 3, 2022
ee0d994
updated default medical onto types
Piyush13y Mar 3, 2022
bb47629
corrected default type
Piyush13y Mar 3, 2022
12db429
resolved PR comments
Piyush13y Mar 4, 2022
bcdcebf
cleanuip
Piyush13y Mar 4, 2022
598e991
added umls_link case
Piyush13y Mar 4, 2022
9d66120
cleanup
Piyush13y Mar 4, 2022
81066b6
formatting
Piyush13y Mar 4, 2022
23ae79d
formatting
Piyush13y Mar 4, 2022
656e077
more formatting
Piyush13y Mar 4, 2022
afbbef3
more formatting
Piyush13y Mar 4, 2022
ad0f87f
Merge branch 'asyml:main' into 96_remove_spacy_dependency_on_forte_me…
Piyush13y Mar 4, 2022
4ebc702
updated elasticsearch version in setup
Piyush13y Mar 4, 2022
5fd4701
Merge branch '96_remove_spacy_dependency_on_forte_med_onto' of https:…
Piyush13y Mar 4, 2022
eca36f5
updated elasticsearch version in setup
Piyush13y Mar 4, 2022
e4c17c3
handle , in tokenize test
Piyush13y Mar 4, 2022
acfc16f
corrected typo
Piyush13y Mar 5, 2022
c955a6b
resolved test case issue
Piyush13y Mar 10, 2022
3298ea7
Merge branch '96_remove_spacy_dependency_on_forte_med_onto' of https:…
Piyush13y Mar 10, 2022
cb118f7
test issues
Piyush13y Mar 10, 2022
fe4fa57
more test issues
Piyush13y Mar 10, 2022
827d942
checking with umls test case commented
Piyush13y Mar 10, 2022
47c588a
checking with umls test commented
Piyush13y Mar 10, 2022
973ab99
umls_link test case
Piyush13y Mar 11, 2022
a3df873
more on test case
Piyush13y Mar 11, 2022
172bc8c
merge conflicts
Piyush13y Mar 11, 2022
3ad89f4
test mesh db in entity linking
Piyush13y Mar 11, 2022
8f12d70
add testing config parameter
Piyush13y Mar 11, 2022
c3e8bca
cleanup
Piyush13y Mar 11, 2022
1a4b249
added param details
Piyush13y Mar 11, 2022
a848db2
formatting
Piyush13y Mar 11, 2022
2a30438
cleanup
Piyush13y Mar 11, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/elastic/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
platforms="any",
install_requires=[
"forte==0.1.2",
"elasticsearch>=7.5.1; python_version<'3.9.0'",
"elasticsearch>=7.14.0; python_version>='3.9.0'",
"elasticsearch>=7.5.1, <8.0.0; python_version<'3.9.0'",
"elasticsearch>=7.14.0, <8.0.0; python_version>='3.9.0'",
],
classifiers=[
"Intended Audience :: Developers",
Expand Down
123 changes: 98 additions & 25 deletions src/spacy/fortex/spacy/spacy_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
from forte.data.base_pack import PackType
from forte.data.batchers import ProcessingBatcher, FixedSizeDataPackBatcher
from forte.data.data_pack import DataPack
from forte.data.ontology import Annotation
from forte.data.ontology import Annotation, Generics
from forte.processors.base import PackProcessor, FixedSizeBatchProcessor
from forte.utils import get_class
from ft.onto.base_ontology import EntityMention, Sentence, Token, Dependency
from ftx.medical import MedicalEntityMention, UMLSConceptLink

__all__ = [
"SpacyProcessor",
Expand Down Expand Up @@ -95,6 +95,32 @@ def validate_spacy_configs(configs: Config):
"'sentence' is necessary in configs.processors for 'tokenize'."
)

if "umls_link" in configs.processors:
if not (configs.medical_onto_type and configs.umls_onto_type):
raise ProcessorConfigError(
"Please specify medical and umls link ontology types!"
)

entry_type = get_class(configs.medical_onto_type)
if not isinstance(entry_type, Annotation) and not issubclass(
entry_type, Annotation
):
raise ProcessorConfigError(
"Config parameter {} must be an Annotation type.".format(
configs.medical_onto_type
)
)

entry_type = get_class(configs.umls_onto_type)
if not isinstance(entry_type, Generics) and not issubclass(
entry_type, Generics
):
raise ProcessorConfigError(
"Config parameter {} must be a Generic type.".format(
configs.umls_onto_type
)
)


def set_up_pipe(nlp: Language, configs: Config):
config2component = (
Expand All @@ -121,7 +147,8 @@ def set_up_pipe(nlp: Language, configs: Config):
# pylint: disable=import-outside-toplevel
from scispacy.linking import EntityLinker

linker = EntityLinker(resolve_abbreviations=True, name="umls")
name = "mesh" if configs.testing is True else "umls"
linker = EntityLinker(resolve_abbreviations=True, name=name)
nlp.add_pipe(linker)

# Remove some components to save some time.
Expand Down Expand Up @@ -252,7 +279,13 @@ def pack(
# Record medical entity linking results.
if "umls_link" in self.configs.processors:
linker = self.nlp.get_pipe("EntityLinker") # type: ignore
process_umls_entity_linking(linker, result, pack)
process_umls_entity_linking(
linker,
result,
self.configs.medical_onto_type,
self.configs.umls_onto_type,
pack,
)

def record(self, record_meta: Dict[str, Set[str]]):
r"""Method to add output type record of current processor
Expand All @@ -271,6 +304,11 @@ def default_configs(cls) -> Dict[str, Any]:
Specify additional parameters for SpaCy processor.

The available parameters are:
- `medical_onto_type`: fully qualified name of the entry type used to
store the medical entity mentions in the output. Must be an
`Annotation` type.

- `umls_onto_type`: fully qualified name of the entry type used to
store the UMLS concept links in the output. Must be a `Generics` type.

- `batcher.batch_size`: max size of the batch (in terms of number of
data packs).
Expand Down Expand Up @@ -308,8 +346,14 @@ def default_configs(cls) -> Dict[str, Any]:
- `num_processes`: number of processes to run when using `spacy.pipe`.
Default is 1. This will be passed directly to the `n_process` option.

- `testing`: whether the processor is being used in a test case. When
True, the `umls_link` entity linker is created with the name "mesh"
instead of "umls". Default is False.

"""
return {
"medical_onto_type": "ftx.medical.clinical_ontology."
+ "MedicalEntityMention",
"umls_onto_type": "ftx.medical.clinical_ontology.UMLSConceptLink",
"batcher": {
"batch_size": 1000,
},
Expand All @@ -319,6 +363,7 @@ def default_configs(cls) -> Dict[str, Any]:
"prefer_gpu": False,
"gpu_id": 0,
"num_processes": 1,
"testing": False,
}


Expand Down Expand Up @@ -394,6 +439,12 @@ def default_configs(cls):
Additional values for this list further includes:
`ner` for named entity and `dep` for dependency parsing.

- `medical_onto_type`: fully qualified name of the entry type used to
store the medical entity mentions in the output. Must be an
`Annotation` type.

- `umls_onto_type`: fully qualified name of the entry type used to
store the UMLS concept links in the output. Must be a `Generics` type.

- `lang`: language model, default is spaCy `en_core_web_sm` model.
The pipeline support spaCy and ScispaCy models.
A list of available spaCy models could be found at
Expand All @@ -414,14 +465,21 @@ def default_configs(cls):
- `gpu_id`: the GPU device index to use when GPU is enabled. Default
is 0.

- `testing`: whether the processor is being used in a test case. When
True, the `umls_link` entity linker is created with the name "mesh"
instead of "umls". Default is False.

Returns: A dictionary with the default config for this processor.
"""
return {
"processors": ["sentence", "tokenize", "pos", "lemma"],
"medical_onto_type": "ftx.medical.clinical_ontology"
+ ".MedicalEntityMention",
"umls_onto_type": "ftx.medical.clinical_ontology.UMLSConceptLink",
"lang": "en_core_web_sm",
"require_gpu": False,
"prefer_gpu": False,
"gpu_id": 0,
"testing": False,
}

def _process(self, input_pack: DataPack):
Expand Down Expand Up @@ -452,7 +510,13 @@ def _process(self, input_pack: DataPack):
# Record medical entity linking results.
if "umls_link" in self.configs.processors:
linker = self.nlp.get_pipe("EntityLinker")
process_umls_entity_linking(linker, result, input_pack)
process_umls_entity_linking(
linker,
result,
self.configs.medical_onto_type,
self.configs.umls_onto_type,
input_pack,
)

def record(self, record_meta: Dict[str, Set[str]]):
r"""Method to add output type record of current processor
Expand Down Expand Up @@ -480,8 +544,8 @@ def set_records(record_meta: Dict[str, Set[str]], configs: Config):
if "dep" in configs.processors:
record_meta["ft.onto.base_ontology.Dependency"] = {"dep_label"}
if "umls_link" in configs.processors:
record_meta["onto.medical.MedicalEntityMention"] = {"ner_type"}
record_meta["onto.medical.UMLSConceptLink"] = {
record_meta[configs.medical_onto_type] = {"ner_type", "umls_entities"}
record_meta[configs.umls_onto_type] = {
"cui",
"score",
"name",
Expand Down Expand Up @@ -558,7 +622,9 @@ def process_ner(result, input_pack: DataPack):
entity.ner_type = item.label_


def process_umls_entity_linking(linker, result, input_pack: DataPack):
def process_umls_entity_linking(
linker, result, medical_onto_type, umls_onto_type, input_pack: DataPack
):
"""
Perform UMLS medical entity linking with EntityLinker, and store medical
entity mentions and UMLS concepts.
Expand All @@ -575,23 +641,30 @@ def process_umls_entity_linking(linker, result, input_pack: DataPack):

# get medical entity mentions and UMLS concepts
for item in medical_entities:
entity = MedicalEntityMention(
input_pack, item.start_char, item.end_char
medical_entity_name = get_class(medical_onto_type)
medical_entity = medical_entity_name(
pack=input_pack,
begin=item.start_char,
end=item.end_char,
)
entity.ner_type = item.label_

for umls_ent in item._.kb_ents:
cui = umls_ent[0]
score = str(umls_ent[1])

cui_entity = linker.kb.cui_to_entity[cui]

umls = UMLSConceptLink(input_pack)
umls.cui = cui
umls.score = score
umls.name = cui_entity.canonical_name
umls.definition = cui_entity.definition
umls.tuis = cui_entity.types
umls.aliases = cui_entity.aliases
setattr(medical_entity, "ner_type", item.label_)
umls_entity_name = get_class(umls_onto_type)

entity.umls_entities.append(umls)
for umls_ent in item._.kb_ents:
cui_entity = linker.kb.cui_to_entity[umls_ent[0]]
umls = {}
umls["cui"] = umls_ent[0]
umls["score"] = str(umls_ent[1])
umls["name"] = cui_entity.canonical_name
umls["definition"] = cui_entity.definition
umls["tuis"] = cui_entity.types
umls["aliases"] = cui_entity.aliases

umls_entity = umls_entity_name(pack=input_pack)

for attribute, _ in vars(umls_entity).items():
if attribute in umls.keys():
setattr(umls_entity, attribute, umls[attribute])

getattr(medical_entity, "umls_entities").append(umls_entity)
32 changes: 32 additions & 0 deletions tests/wrappers/spacy_processors_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from forte.data.data_pack import DataPack
from forte.data.readers import StringReader
from forte.pipeline import Pipeline
from forte.utils import get_class
from ft.onto.base_ontology import Token, EntityMention, Dependency

from fortex.spacy import SpacyProcessor, SpacyBatchedProcessor
Expand Down Expand Up @@ -142,9 +143,12 @@ def test_spacy_batch_pipeline(self, value):
pipeline.set_reader(StringReader())
config = {
"processors": value,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"batcher": {"batch_size": 2},
"testing": True,
}
pipeline.add(SpacyBatchedProcessor(), config)
pipeline.initialize()
Expand Down Expand Up @@ -178,8 +182,11 @@ def test_spacy_variation_pipeline(self, value):

config = {
"processors": value,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"testing": True,
}
pipeline.add(SpacyProcessor(), config=config)
pipeline.initialize()
Expand All @@ -188,6 +195,7 @@ def test_spacy_variation_pipeline(self, value):
"This tool is called Forte.",
"The goal of this project to help you build NLP pipelines.",
"NLP has never been made this easy before.",
"Head CT revealed no lesions.",
]
document = " ".join(sentences)
pack: DataPack = pipeline.process(document)
Expand All @@ -209,14 +217,38 @@ def test_spacy_processor_with_invalid_config(self, processor):

config = {
"processors": processor,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"testing": True,
}
pipeline.add(SpacyProcessor(), config=config)

with self.assertRaises(ProcessorConfigError):
pipeline.initialize()

@data(
["umls_link"],
)
def test_spacy_processor_for_umls_link(self, processor):
pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())

config = {
"processors": processor,
"medical_onto_type": "ftx.onto.clinical.MedicalEntityMention",
"umls_onto_type": "ftx.onto.clinical.UMLSConceptLink",
"lang": "en_core_web_sm",
# Language code for the language to build the Pipeline
"testing": True,
}
pipeline.add(SpacyProcessor(), config=config)

try:
pipeline.initialize()
except ProcessorConfigError:
self.fail("umls_link processor failing in Spacy, check config")

if __name__ == "__main__":
unittest.main()