Skip to content

Commit

Permalink
Implementing search using translator endpoint. (#695)
Browse files Browse the repository at this point in the history
* Implementing search using translator endpoint.

Adding annotate-file option to cli

* lint
  • Loading branch information
cmungall authored Jan 26, 2024
1 parent d139e99 commit 6a37e68
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 10 deletions.
41 changes: 34 additions & 7 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,11 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str
"-L",
help="path to lexical index. This is recreated each time unless --no-recreate is passed",
)
@click.option(
"--match-column",
"-A",
help="name of column to match on (if the input is tsv/csv)",
)
@click.option(
"--model",
"-m",
Expand All @@ -1567,12 +1572,12 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str
@click.option(
"--rules-file",
"-R",
help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules-datamodel",
help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules",
)
@click.option(
"--configuration-file",
"-C",
help="path to config file. Conforms to https://w3id.org/oak/test-annotation",
help="path to config file. Conforms to https://w3id.org/oak/text-annotator",
)
@output_option
@output_type_option
Expand All @@ -1586,18 +1591,32 @@ def annotate(
rules_file: str,
configuration_file: str,
text_file: TextIO,
match_column: str,
model: str,
output_type: str,
):
"""
Annotate a piece of text using a Named Entity Recognition annotation
Annotate a piece of text using a Named Entity Recognition annotation.
Some endpoints such as BioPortal have built-in support for annotation;
in these cases the endpoint functionality is used:
Example:
runoak -i bioportal: annotate "enlarged nucleus in T-cells from peripheral blood"
See the ontorunner framework for plugins for SciSpacy and OGER - these will
later become plugins.
For other endpoints, the built-in OAK annotator is used. This currently uses a basic
algorithm based on lexical matching.
Example:
runoak -i sqlite:obo:cl annotate "enlarged nucleus in T-cells from peripheral blood"
Using the builtin annotator can be slow, as the lexical index is re-built every time.
To preserve this, use the ``--lexical-index-file`` (``-L``) option to specify a file to save.
On subsequent iterations the file is reused.
You can also use ``--text-file`` to pass in a text file to be parsed one line at a time
If gilda is installed as an extra, it can be used,
but ``--matches-whole-text`` (``-W``) must be specified,
Expand Down Expand Up @@ -1665,8 +1684,16 @@ def annotate(
if words and text_file:
raise ValueError("Specify EITHER text-file OR a list of words as arguments")
if text_file:
for ann in impl.annotate_file(text_file, configuration):
writer.emit(ann)
if match_column:
writer = _get_writer(output_type, impl, StreamingCsvWriter)
writer.output = output
for row in impl.annotate_tabular_file(
text_file, configuration=configuration, match_column=match_column
):
writer.emit(row)
else:
for ann in impl.annotate_file(text_file, configuration):
writer.emit(ann)
else:
logging.info(f"Annotating: {words}")
for ann in impl.annotate_text(" ".join(list(words)), configuration):
Expand Down
52 changes: 50 additions & 2 deletions src/oaklib/implementations/translator/translator_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,21 @@
"""
import logging
from dataclasses import dataclass
from typing import Iterable, Mapping, Optional, Union
from typing import Iterable, List, Mapping, Optional, Union

import requests
import sssom_schema.datamodel.sssom_schema as sssom

from oaklib.datamodels.vocabulary import SEMAPV, SKOS_CLOSE_MATCH, SKOS_EXACT_MATCH
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.vocabulary import (
HAS_RELATED_SYNONYM,
RDFS_LABEL,
SEMAPV,
SKOS_CLOSE_MATCH,
SKOS_EXACT_MATCH,
)
from oaklib.interfaces import SearchInterface
from oaklib.interfaces.basic_ontology_interface import ALIAS_MAP, LANGUAGE_TAG
from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface
from oaklib.types import CURIE

Expand All @@ -25,11 +34,13 @@
from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources

NODE_NORMALIZER_ENDPOINT = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"
NAME_NORMALIZER_ENDPOINT = "https://name-resolution-sri.renci.org"


@dataclass
class TranslatorImplementation(
MappingProviderInterface,
SearchInterface,
):
"""
Wraps Translator endpoints.
Expand Down Expand Up @@ -94,3 +105,40 @@ def sssom_mappings(

def inject_mapping_labels(self, mappings: Iterable[Mapping]) -> None:
    """No-op: this implementation does not inject labels into mappings."""
    return None

def basic_search(
    self, search_term: str, config: Optional[SearchConfiguration] = None
) -> Iterable[CURIE]:
    """Search for entities by name via the SRI Name Resolution lookup endpoint.

    :param search_term: string to search for (autocomplete is enabled)
    :param config: search configuration (currently not applied to the query)
    :yield: CURIE of each hit, in the order returned by the service
    """
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/lookup",
        params={"string": search_term, "autocomplete": "true"},
    )
    response.raise_for_status()
    for hit in response.json():
        hit_curie = hit["curie"]
        # Cache the label so a subsequent label() call avoids a round-trip.
        self.property_cache.add(hit_curie, RDFS_LABEL, hit["label"])
        yield hit_curie

def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
    """Return the preferred name for a CURIE via the reverse_lookup endpoint.

    :param curie: entity to look up
    :param lang: language tag; not supported, any truthy value raises
    :raises NotImplementedError: if a language tag is supplied
    :return: preferred name, or None if the service does not know the CURIE
    """
    if lang:
        raise NotImplementedError
    cache = self.property_cache
    if cache.contains(curie, RDFS_LABEL):
        return cache.get(curie, RDFS_LABEL)
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}
    )
    response.raise_for_status()
    payload = response.json()
    if curie not in payload:
        return None
    return payload[curie]["preferred_name"]

def entity_aliases(self, curie: CURIE) -> List[str]:
    """Return all known names for a CURIE via the reverse_lookup endpoint.

    :param curie: entity to look up
    :return: list of names; empty if the service does not know the CURIE
    """
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}
    )
    response.raise_for_status()
    payload = response.json()
    if curie not in payload:
        return []
    return payload[curie]["names"]

def entity_alias_map(self, curie: CURIE) -> ALIAS_MAP:
    """Return the entity's aliases grouped under the related-synonym predicate."""
    aliases = self.entity_aliases(curie)
    return {HAS_RELATED_SYNONYM: aliases}
49 changes: 48 additions & 1 deletion src/oaklib/interfaces/text_annotator_interface.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import csv
import logging
from abc import ABC
from io import TextIOWrapper
from pathlib import Path
from typing import Iterable, Iterator, Optional
from typing import Dict, Iterable, Iterator, Optional

from oaklib.datamodels.lexical_index import LexicalIndex
from oaklib.datamodels.mapping_rules_datamodel import MappingRuleCollection
Expand Down Expand Up @@ -190,3 +191,49 @@ def annotate_file(
line = line.strip()
annotation = self.annotate_text(line, configuration)
yield from annotation

def annotate_tabular_file(
    self,
    text_file: TextIOWrapper,
    delimiter: Optional[str] = None,
    configuration: TextAnnotationConfiguration = None,
    match_column: str = None,
    result_column: str = "matched_id",
    result_label_column: str = "matched_label",
    match_multiple=False,
    include_unmatched=True,
) -> Iterator[Dict[str, str]]:
    """Annotate one column of a delimited (csv/tsv) file, row by row.

    Each row is parsed with csv.DictReader, the text in ``match_column``
    is annotated, and the row is re-emitted with the match added under
    ``result_column`` / ``result_label_column``.

    :param text_file: open handle on the csv/tsv file to annotate
    :param delimiter: column delimiter; inferred from the file suffix
        (".tsv" => tab, ".csv" => comma) when not given
    :param configuration: text annotation configuration, defaults to None
        (a fresh default configuration is created)
    :param match_column: name of the column whose text is annotated (required)
    :param result_column: output column for the matched entity id
    :param result_label_column: output column for the matched entity label
    :param match_multiple: if True, yield one row per annotation; otherwise
        only the first annotation is used
    :param include_unmatched: if True, rows with no annotation are still
        yielded, with empty result columns
    :raises ValueError: if match_column is not supplied or missing from a
        row, or if the delimiter cannot be inferred
    :yield: one dict per (row, match) combination
    """
    if not configuration:
        configuration = TextAnnotationConfiguration()
    if not match_column:
        raise ValueError("Must provide a match column")
    if not delimiter:
        # Fall back to inferring the delimiter from the file extension.
        if text_file.name.endswith(".tsv"):
            delimiter = "\t"
        elif text_file.name.endswith(".csv"):
            delimiter = ","
        else:
            raise ValueError("Must provide a delimiter")
    reader = csv.DictReader(text_file, delimiter=delimiter)
    for row in reader:
        if match_column not in row:
            raise ValueError(f"Missing match column {match_column} in {row}")
        text = row[match_column]
        has_result = False
        for ann in self.annotate_text(text, configuration):
            has_result = True
            # Yield a fresh dict per match: re-yielding the same mutated
            # dict made every previously-yielded row alias the final
            # annotation when the consumer retained references
            # (e.g. list()-ing the generator with match_multiple=True).
            yield {
                **row,
                result_column: ann.object_id,
                result_label_column: ann.object_label,
            }
            if not match_multiple:
                break
        if not has_result and include_unmatched:
            yield {**row, result_column: "", result_label_column: ""}

0 comments on commit 6a37e68

Please sign in to comment.