Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing search using translator endpoint. #695

Merged
merged 2 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 34 additions & 7 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,11 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str
"-L",
help="path to lexical index. This is recreated each time unless --no-recreate is passed",
)
@click.option(
"--match-column",
"-A",
help="name of column to match on (if the input is tsv/csv)",
)
@click.option(
"--model",
"-m",
Expand All @@ -1567,12 +1572,12 @@ def term_metadata(terms, predicates, additional_metadata: bool, output_type: str
@click.option(
"--rules-file",
"-R",
help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules-datamodel",
help="path to rules file. Conforms to https://w3id.org/oak/mapping-rules",
)
@click.option(
"--configuration-file",
"-C",
help="path to config file. Conforms to https://w3id.org/oak/test-annotation",
help="path to config file. Conforms to https://w3id.org/oak/text-annotator",
)
@output_option
@output_type_option
Expand All @@ -1586,18 +1591,32 @@ def annotate(
rules_file: str,
configuration_file: str,
text_file: TextIO,
match_column: str,
model: str,
output_type: str,
):
"""
Annotate a piece of text using a Named Entity Recognition annotation
Annotate a piece of text using a Named Entity Recognition annotation.

Some endpoints such as BioPortal have built-in support for annotation;
in these cases the endpoint functionality is used:

Example:

runoak -i bioportal: annotate "enlarged nucleus in T-cells from peripheral blood"

See the ontorunner framework for plugins for SciSpacy and OGER - these will
later become plugins.
For other endpoints, the built-in OAK annotator is used. This currently uses a basic
algorithm based on lexical matching.

Example:

runoak -i sqlite:obo:cl annotate "enlarged nucleus in T-cells from peripheral blood"

Using the builtin annotator can be slow, as the lexical index is re-built every time.
To preserve this, use the ``--lexical-index-file`` (``-L``) option to specify a file to save.
On subsequent iterations the file is reused.

You can also use ``--text-file`` to pass in a text file to be parsed one line at a time

If gilda is installed as an extra, it can be used,
but ``--matches-whole-text`` (``-W``) must be specified,
Expand Down Expand Up @@ -1665,8 +1684,16 @@ def annotate(
if words and text_file:
raise ValueError("Specify EITHER text-file OR a list of words as arguments")
if text_file:
for ann in impl.annotate_file(text_file, configuration):
writer.emit(ann)
if match_column:
writer = _get_writer(output_type, impl, StreamingCsvWriter)
writer.output = output
for row in impl.annotate_tabular_file(
text_file, configuration=configuration, match_column=match_column
):
writer.emit(row)
else:
for ann in impl.annotate_file(text_file, configuration):
writer.emit(ann)
else:
logging.info(f"Annotating: {words}")
for ann in impl.annotate_text(" ".join(list(words)), configuration):
Expand Down
52 changes: 50 additions & 2 deletions src/oaklib/implementations/translator/translator_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,21 @@
"""
import logging
from dataclasses import dataclass
from typing import Iterable, Mapping, Optional, Union
from typing import Iterable, List, Mapping, Optional, Union

import requests
import sssom_schema.datamodel.sssom_schema as sssom

from oaklib.datamodels.vocabulary import SEMAPV, SKOS_CLOSE_MATCH, SKOS_EXACT_MATCH
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.vocabulary import (
HAS_RELATED_SYNONYM,
RDFS_LABEL,
SEMAPV,
SKOS_CLOSE_MATCH,
SKOS_EXACT_MATCH,
)
from oaklib.interfaces import SearchInterface
from oaklib.interfaces.basic_ontology_interface import ALIAS_MAP, LANGUAGE_TAG
from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface
from oaklib.types import CURIE

Expand All @@ -25,11 +34,13 @@
from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources

NODE_NORMALIZER_ENDPOINT = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"
NAME_NORMALIZER_ENDPOINT = "https://name-resolution-sri.renci.org"


@dataclass
class TranslatorImplementation(
MappingProviderInterface,
SearchInterface,
):
"""
Wraps Translator endpoints.
Expand Down Expand Up @@ -94,3 +105,40 @@ def sssom_mappings(

def inject_mapping_labels(self, mappings: Iterable[Mapping]) -> None:
return

def basic_search(
    self, search_term: str, config: Optional[SearchConfiguration] = None
) -> Iterable[CURIE]:
    """Search for entities by name using the SRI Name Resolution lookup endpoint.

    Each hit is yielded as a CURIE; the hit's label is stored in the
    property cache so that a later ``label()`` call for the same CURIE
    avoids a second network round-trip.

    NOTE(review): ``config`` is currently ignored — confirm whether a
    result limit or exact-match setting should be forwarded to the service.
    """
    lookup_url = f"{NAME_NORMALIZER_ENDPOINT}/lookup"
    response = requests.get(
        lookup_url,
        params={"string": search_term, "autocomplete": "true"},
    )
    response.raise_for_status()
    for hit in response.json():
        hit_curie = hit["curie"]
        self.property_cache.add(hit_curie, RDFS_LABEL, hit["label"])
        yield hit_curie

def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
    """Return the preferred name for *curie* via the reverse-lookup endpoint.

    A label already present in the property cache (e.g. populated by
    ``basic_search``) is returned without any network call. Language
    tags are not supported by this backend.
    """
    if lang:
        raise NotImplementedError
    cache = self.property_cache
    if cache.contains(curie, RDFS_LABEL):
        return cache.get(curie, RDFS_LABEL)
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}
    )
    response.raise_for_status()
    payload = response.json()
    if curie not in payload:
        return None
    return payload[curie]["preferred_name"]

def entity_aliases(self, curie: CURIE) -> List[str]:
    """Return all known names for *curie* from the reverse-lookup endpoint.

    An empty list is returned when the service does not recognize the CURIE.
    """
    response = requests.get(
        f"{NAME_NORMALIZER_ENDPOINT}/reverse_lookup", params={"curies": curie}
    )
    response.raise_for_status()
    payload = response.json()
    if curie in payload:
        return payload[curie]["names"]
    return []

def entity_alias_map(self, curie: CURIE) -> ALIAS_MAP:
    """Return aliases for *curie*, all grouped under the related-synonym predicate."""
    aliases = self.entity_aliases(curie)
    return {HAS_RELATED_SYNONYM: aliases}
49 changes: 48 additions & 1 deletion src/oaklib/interfaces/text_annotator_interface.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import csv
import logging
from abc import ABC
from io import TextIOWrapper
from pathlib import Path
from typing import Iterable, Iterator, Optional
from typing import Dict, Iterable, Iterator, Optional

from oaklib.datamodels.lexical_index import LexicalIndex
from oaklib.datamodels.mapping_rules_datamodel import MappingRuleCollection
Expand Down Expand Up @@ -190,3 +191,49 @@ def annotate_file(
line = line.strip()
annotation = self.annotate_text(line, configuration)
yield from annotation

def annotate_tabular_file(
    self,
    text_file: TextIOWrapper,
    delimiter: Optional[str] = None,
    configuration: Optional["TextAnnotationConfiguration"] = None,
    match_column: Optional[str] = None,
    result_column: str = "matched_id",
    result_label_column: str = "matched_label",
    match_multiple: bool = False,
    include_unmatched: bool = True,
) -> Iterator[Dict[str, str]]:
    """Annotate one column of a delimited (csv/tsv) file, row by row.

    :param text_file: Open delimited file; each row is read as a dict.
    :param delimiter: Column separator. If None, it is inferred from the
        file suffix (".tsv" -> tab, ".csv" -> comma); otherwise a
        ValueError is raised.
    :param configuration: Text annotation configuration, defaults to a
        fresh TextAnnotationConfiguration.
    :param match_column: Name of the column whose text is annotated (required).
    :param result_column: Output column receiving the matched entity id.
    :param result_label_column: Output column receiving the matched label.
    :param match_multiple: If True, yield one row per annotation;
        otherwise only the first annotation of each input row is used.
    :param include_unmatched: If True, rows with no annotation are still
        yielded, with empty result columns.
    :raises ValueError: If match_column is absent (as an argument or from
        a row), or the delimiter cannot be inferred.
    :yield: The input row augmented with the result columns.
    """
    if not configuration:
        configuration = TextAnnotationConfiguration()
    if not match_column:
        raise ValueError("Must provide a match column")
    if not delimiter:
        # Infer the separator from the filename suffix when not given explicitly.
        if text_file.name.endswith(".tsv"):
            delimiter = "\t"
        elif text_file.name.endswith(".csv"):
            delimiter = ","
        else:
            raise ValueError("Must provide a delimiter")
    reader = csv.DictReader(text_file, delimiter=delimiter)
    for row in reader:
        if match_column not in row:
            raise ValueError(f"Missing match column {match_column} in {row}")
        text = row[match_column]
        has_result = False
        for ann in self.annotate_text(text, configuration):
            has_result = True
            # Yield a fresh dict per annotation: mutating and re-yielding
            # the same row object would make every previously yielded row
            # alias the *last* match when the consumer collects results.
            yield {**row, result_column: ann.object_id, result_label_column: ann.object_label}
            if not match_multiple:
                break
        if not has_result and include_unmatched:
            yield {**row, result_column: "", result_label_column: ""}