Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing search using translator endpoint. #695

Merged
merged 2 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 34 additions & 7 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,11 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str
"-L",
help="path to lexical index. This is recreated each time unless --no-recreate is passed",
)
@click.option(
"--match-column",
"-A",
help="name of column to match on (if the input is tsv/csv)",
)
@click.option(
"--model",
"-m",
Expand All @@ -1567,12 +1572,12 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str
@click.option(
"--rules-file",
"-R",
help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules-datamodel",
help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules",
)
@click.option(
"--configuration-file",
"-C",
help="path to config file. Conforms to https://w3id.org/oak/test-annotation",
help="path to config file. Conforms to https://w3id.org/oak/text-annotator",
)
@output_option
@output_type_option
Expand All @@ -1586,18 +1591,32 @@ def annotate(
rules_file: str,
configuration_file: str,
text_file: TextIO,
match_column: str,
model: str,
output_type: str,
):
"""
Annotate a piece of text using a Named Entity Recognition annotation
Annotate a piece of text using a Named Entity Recognition annotation.

Some endpoints such as BioPortal have built-in support for annotation;
in these cases the endpoint functionality is used:

Example:

runoak -i bioportal: annotate "enlarged nucleus in T-cells from peripheral blood"

See the ontorunner framework for plugins for SciSpacy and OGER - these will
later become plugins.
For other endpoints, the built-in OAK annotator is used. This currently uses a basic
algorithm based on lexical matching.

Example:

runoak -i sqlite:obo:cl annotate "enlarged nucleus in T-cells from peripheral blood"

Using the builtin annotator can be slow, as the lexical index is re-built every time.
To preserve this, use the ``--lexical-index-file`` (``-L``) option to specify a file to save.
On subsequent iterations the file is reused.

You can also use ``--text-file`` to pass in a text file to be parsed one line at a time

If gilda is installed as an extra, it can be used,
but ``--matches-whole-text`` (``-W``) must be specified,
Expand Down Expand Up @@ -1665,8 +1684,16 @@ def annotate(
if words and text_file:
raise ValueError("Specify EITHER text-file OR a list of words as arguments")
if text_file:
for ann in impl.annotate_file(text_file, configuration):
writer.emit(ann)
if match_column:
writer = _get_writer(output_type, impl, StreamingCsvWriter)
writer.output = output
for row in impl.annotate_tabular_file(
text_file, configuration=configuration, match_column=match_column
):
writer.emit(row)
else:
for ann in impl.annotate_file(text_file, configuration):
writer.emit(ann)
else:
logging.info(f"Annotating: {words}")
for ann in impl.annotate_text(" ".join(list(words)), configuration):
Expand Down
52 changes: 50 additions & 2 deletions src/oaklib/implementations/translator/translator_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,21 @@
"""
import logging
from dataclasses import dataclass
from typing import Iterable, Mapping, Optional, Union
from typing import Iterable, List, Mapping, Optional, Union

import requests
import sssom_schema.datamodel.sssom_schema as sssom

from oaklib.datamodels.vocabulary import SEMAPV, SKOS_CLOSE_MATCH, SKOS_EXACT_MATCH
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.vocabulary import (
HAS_RELATED_SYNONYM,
RDFS_LABEL,
SEMAPV,
SKOS_CLOSE_MATCH,
SKOS_EXACT_MATCH,
)
from oaklib.interfaces import SearchInterface
from oaklib.interfaces.basic_ontology_interface import ALIAS_MAP, LANGUAGE_TAG
from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface
from oaklib.types import CURIE

Expand All @@ -25,11 +34,13 @@
from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources

NODE_NORMALIZER_ENDPOINT = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"
NAME_NORMALIZER_ENDPOINT = "https://name-resolution-sri.renci.org"


@dataclass
class TranslatorImplementation(
MappingProviderInterface,
SearchInterface,
):
"""
Wraps Translator endpoints.
Expand Down Expand Up @@ -94,3 +105,40 @@ def sssom_mappings(

def inject_mapping_labels(self, mappings: Iterable[Mapping]) -> None:
return

def basic_search(
    self, search_term: str, config: Optional[SearchConfiguration] = None
) -> Iterable[CURIE]:
    """Search for entities by name using the SRI Name Resolution lookup endpoint.

    Each hit is yielded as a CURIE; the hit's label is stored in the
    property cache so that a later ``label()`` call for the same CURIE
    avoids a second network round-trip.

    NOTE(review): ``config`` is currently ignored — confirm whether a
    result limit or exact-match setting should be forwarded to the service.
    """
    lookup_url = f"{NAME_NORMALIZER_ENDPOINT}/lookup"
    response = requests.get(
        lookup_url,
        params={"string": search_term, "autocomplete": "true"},
    )
    response.raise_for_status()
    for hit in response.json():
        hit_curie = hit["curie"]
        self.property_cache.add(hit_curie, RDFS_LABEL, hit["label"])
        yield hit_curie

def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
    """Return the preferred name for *curie* via the reverse-lookup endpoint.

    A label already present in the property cache (e.g. populated by
    ``basic_search``) is returned without any network call. Language
    tags are not supported by this backend.
    """
    if lang:
        raise NotImplementedError
    cache = self.property_cache
    if cache.contains(curie, RDFS_LABEL):
        return cache.get(curie, RDFS_LABEL)
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}
    )
    response.raise_for_status()
    payload = response.json()
    if curie not in payload:
        return None
    return payload[curie]["preferred_name"]

def entity_aliases(self, curie: CURIE) -> List[str]:
    """Return all known names for *curie* from the reverse-lookup endpoint.

    An empty list is returned when the service does not recognize the CURIE.
    """
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}
    )
    response.raise_for_status()
    payload = response.json()
    if curie in payload:
        return payload[curie]["names"]
    return []

def entity_alias_map(self, curie: CURIE) -> ALIAS_MAP:
    """Return aliases for *curie*, all grouped under the related-synonym predicate."""
    aliases = self.entity_aliases(curie)
    return {HAS_RELATED_SYNONYM: aliases}
49 changes: 48 additions & 1 deletion src/oaklib/interfaces/text_annotator_interface.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import csv
import logging
from abc import ABC
from io import TextIOWrapper
from pathlib import Path
from typing import Iterable, Iterator, Optional
from typing import Dict, Iterable, Iterator, Optional

from oaklib.datamodels.lexical_index import LexicalIndex
from oaklib.datamodels.mapping_rules_datamodel import MappingRuleCollection
Expand Down Expand Up @@ -190,3 +191,49 @@ def annotate_file(
line = line.strip()
annotation = self.annotate_text(line, configuration)
yield from annotation

def annotate_tabular_file(
    self,
    text_file: TextIOWrapper,
    delimiter: Optional[str] = None,
    configuration: Optional["TextAnnotationConfiguration"] = None,
    match_column: Optional[str] = None,
    result_column: str = "matched_id",
    result_label_column: str = "matched_label",
    match_multiple: bool = False,
    include_unmatched: bool = True,
) -> Iterator[Dict[str, str]]:
    """Annotate one column of a delimited (csv/tsv) file, row by row.

    :param text_file: Open delimited file; each row is read as a dict.
    :param delimiter: Column separator. If None, it is inferred from the
        file suffix (".tsv" -> tab, ".csv" -> comma); otherwise a
        ValueError is raised.
    :param configuration: Text annotation configuration, defaults to a
        fresh TextAnnotationConfiguration.
    :param match_column: Name of the column whose text is annotated (required).
    :param result_column: Output column receiving the matched entity id.
    :param result_label_column: Output column receiving the matched label.
    :param match_multiple: If True, yield one row per annotation;
        otherwise only the first annotation of each input row is used.
    :param include_unmatched: If True, rows with no annotation are still
        yielded, with empty result columns.
    :raises ValueError: If match_column is absent (as an argument or from
        a row), or the delimiter cannot be inferred.
    :yield: The input row augmented with the result columns.
    """
    if not configuration:
        configuration = TextAnnotationConfiguration()
    if not match_column:
        raise ValueError("Must provide a match column")
    if not delimiter:
        # Infer the separator from the filename suffix when not given explicitly.
        if text_file.name.endswith(".tsv"):
            delimiter = "\t"
        elif text_file.name.endswith(".csv"):
            delimiter = ","
        else:
            raise ValueError("Must provide a delimiter")
    reader = csv.DictReader(text_file, delimiter=delimiter)
    for row in reader:
        if match_column not in row:
            raise ValueError(f"Missing match column {match_column} in {row}")
        text = row[match_column]
        has_result = False
        for ann in self.annotate_text(text, configuration):
            has_result = True
            # Yield a fresh dict per annotation: mutating and re-yielding
            # the same row object would make every previously yielded row
            # alias the *last* match when the consumer collects results.
            yield {**row, result_column: ann.object_id, result_label_column: ann.object_label}
            if not match_multiple:
                break
        if not has_result and include_unmatched:
            yield {**row, result_column: "", result_label_column: ""}