diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 1a0b94305..4bf1c40f8 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -1551,6 +1551,11 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str "-L", help="path to lexical index. This is recreated each time unless --no-recreate is passed", ) +@click.option( + "--match-column", + "-A", + help="name of column to match on (if the input is tsv/csv)", +) @click.option( "--model", "-m", @@ -1567,12 +1572,12 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str @click.option( "--rules-file", "-R", - help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules-datamodel", + help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules", ) @click.option( "--configuration-file", "-C", - help="path to config file. Conforms to https://w3id.org/oak/test-annotation", + help="path to config file. Conforms to https://w3id.org/oak/text-annotator", ) @output_option @output_type_option @@ -1586,18 +1591,32 @@ def annotate( rules_file: str, configuration_file: str, text_file: TextIO, + match_column: str, model: str, output_type: str, ): """ - Annotate a piece of text using a Named Entity Recognition annotation + Annotate a piece of text using a Named Entity Recognition annotation. + + Some endpoints such as BioPortal have built-in support for annotation; + in these cases the endpoint functionality is used: Example: runoak -i bioportal: annotate "enlarged nucleus in T-cells from peripheral blood" - See the ontorunner framework for plugins for SciSpacy and OGER - these will - later become plugins. + For other endpoints, the built-in OAK annotator is used. This currently uses a basic + algorithm based on lexical matching. + + Example: + + runoak -i sqlite:obo:cl annotate "enlarged nucleus in T-cells from peripheral blood" + + Using the builtin annotator can be slow, as the lexical index is re-built every time. 
+ To avoid this, use the ``--lexical-index-file`` (``-L``) option to specify a file to save. + On subsequent iterations the file is reused. + + You can also use ``--text-file`` to pass in a text file to be parsed one line at a time. If gilda is installed as an extra, it can be used, but ``--matches-whole-text`` (``-W``) must be specified, @@ -1665,8 +1684,16 @@ def annotate( if words and text_file: raise ValueError("Specify EITHER text-file OR a list of words as arguments") if text_file: - for ann in impl.annotate_file(text_file, configuration): - writer.emit(ann) + if match_column: + writer = _get_writer(output_type, impl, StreamingCsvWriter) + writer.output = output + for row in impl.annotate_tabular_file( + text_file, configuration=configuration, match_column=match_column + ): + writer.emit(row) + else: + for ann in impl.annotate_file(text_file, configuration): + writer.emit(ann) else: logging.info(f"Annotating: {words}") for ann in impl.annotate_text(" ".join(list(words)), configuration): diff --git a/src/oaklib/implementations/translator/translator_implementation.py b/src/oaklib/implementations/translator/translator_implementation.py index 951882b6e..3869350d7 100644 --- a/src/oaklib/implementations/translator/translator_implementation.py +++ b/src/oaklib/implementations/translator/translator_implementation.py @@ -9,12 +9,21 @@ """ import logging from dataclasses import dataclass -from typing import Iterable, Mapping, Optional, Union +from typing import Iterable, List, Mapping, Optional, Union import requests import sssom_schema.datamodel.sssom_schema as sssom -from oaklib.datamodels.vocabulary import SEMAPV, SKOS_CLOSE_MATCH, SKOS_EXACT_MATCH +from oaklib.datamodels.search import SearchConfiguration +from oaklib.datamodels.vocabulary import ( + HAS_RELATED_SYNONYM, + RDFS_LABEL, + SEMAPV, + SKOS_CLOSE_MATCH, + SKOS_EXACT_MATCH, +) +from oaklib.interfaces import SearchInterface +from oaklib.interfaces.basic_ontology_interface import ALIAS_MAP, LANGUAGE_TAG 
from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface from oaklib.types import CURIE @@ -25,11 +34,13 @@ from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources NODE_NORMALIZER_ENDPOINT = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes" +NAME_NORMALIZER_ENDPOINT = "https://name-resolution-sri.renci.org" @dataclass class TranslatorImplementation( MappingProviderInterface, + SearchInterface, ): """ Wraps Translator endpoints. @@ -94,3 +105,40 @@ def sssom_mappings( def inject_mapping_labels(self, mappings: Iterable[Mapping]) -> None: return + + def basic_search( + self, search_term: str, config: Optional[SearchConfiguration] = None + ) -> Iterable[CURIE]: + r = requests.get( + f"{NAME_NORMALIZER_ENDPOINT}/lookup", + params={"string": search_term, "autocomplete": "true"}, + ) + r.raise_for_status() + results = r.json() + for result in results: + curie = result["curie"] + self.property_cache.add(curie, RDFS_LABEL, result["label"]) + yield curie + + def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]: + if lang: + raise NotImplementedError + if self.property_cache.contains(curie, RDFS_LABEL): + return self.property_cache.get(curie, RDFS_LABEL) + r = requests.get(f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}) + r.raise_for_status() + results = r.json() + if curie not in results: + return None + return results[curie]["preferred_name"] + + def entity_aliases(self, curie: CURIE) -> List[str]: + r = requests.get(f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}) + r.raise_for_status() + results = r.json() + if curie not in results: + return [] + return results[curie]["names"] + + def entity_alias_map(self, curie: CURIE) -> ALIAS_MAP: + return {HAS_RELATED_SYNONYM: self.entity_aliases(curie)} diff --git a/src/oaklib/interfaces/text_annotator_interface.py b/src/oaklib/interfaces/text_annotator_interface.py index 92f6387ee..e13ad73c2 100644 
--- a/src/oaklib/interfaces/text_annotator_interface.py +++ b/src/oaklib/interfaces/text_annotator_interface.py @@ -1,8 +1,9 @@ +import csv import logging from abc import ABC from io import TextIOWrapper from pathlib import Path -from typing import Iterable, Iterator, Optional +from typing import Dict, Iterable, Iterator, Optional from oaklib.datamodels.lexical_index import LexicalIndex from oaklib.datamodels.mapping_rules_datamodel import MappingRuleCollection @@ -190,3 +191,49 @@ def annotate_file( line = line.strip() annotation = self.annotate_text(line, configuration) yield from annotation + + def annotate_tabular_file( + self, + text_file: TextIOWrapper, + delimiter: Optional[str] = None, + configuration: TextAnnotationConfiguration = None, + match_column: str = None, + result_column: str = "matched_id", + result_label_column: str = "matched_label", + match_multiple=False, + include_unmatched=True, + ) -> Iterator[Dict[str, str]]: + """Annotate text in one column of a tabular (csv/tsv) file. + + :param text_file: Delimited file that is read row by row. + :param configuration: Text annotation configuration, defaults to None. + :yield: Each input row, augmented with match results. 
+ """ + if not configuration: + configuration = TextAnnotationConfiguration() + if not match_column: + raise ValueError("Must provide a match column") + if not delimiter: + if text_file.name.endswith(".tsv"): + delimiter = "\t" + elif text_file.name.endswith(".csv"): + delimiter = "," + else: + raise ValueError("Must provide a delimiter") + reader = csv.DictReader(text_file, delimiter=delimiter) + for row in reader: + if match_column not in row: + raise ValueError(f"Missing match column {match_column} in {row}") + text = row[match_column] + has_result = False + for ann in self.annotate_text(text, configuration): + row[result_column] = ann.object_id + row[result_label_column] = ann.object_label + has_result = True + yield row + if not match_multiple: + break + if not has_result and include_unmatched: + row[result_column] = "" + row[result_label_column] = "" + yield row