Skip to content

Commit

Permalink
Adding llm as an extra, adding llm implementation for annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall committed Sep 22, 2023
1 parent 454360c commit 19c60e3
Show file tree
Hide file tree
Showing 6 changed files with 1,235 additions and 480 deletions.
1,508 changes: 1,033 additions & 475 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ kgcl-schema = "0.6.0"
funowl = ">=0.2.0"
gilda = {version = ">=1.0.0", optional = true}
kgcl-rdflib = "0.5.0"
llm = {version = "*", optional = true}
pystow = ">=0.5.0"
class-resolver = ">=0.4.2"
ontoportal-client = ">=0.0.3"
Expand Down Expand Up @@ -68,6 +69,7 @@ boomerang = "oaklib.utilities.mapping.boomer_utils:main"
[tool.poetry.extras]
docs = ["Sphinx", "sphinx-rtd-theme", "sphinxcontrib-mermaid", "sphinx-copybutton"]
gilda = ["scipy", "gilda", "urllib3"]
llm = ["llm"]
seaborn = ["seaborn"]

[tool.black]
Expand Down
40 changes: 35 additions & 5 deletions src/oaklib/datamodels/text_annotator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
# Auto generated from text_annotator.yaml by pythongen.py version: 0.9.0
# Generation date: 2023-04-09T15:53:59
# Auto generated from text_annotator.yaml by pythongen.py version: 0.0.1
# Generation date: 2023-09-16T18:49:46
# Schema: text-annotator
#
# id: https://w3id.org/oak/text_annotator
# description: A datamodel for representing the results of textual named entity recognition annotation results.
# This draws upon both SSSOM and https://www.w3.org/TR/annotation-model/
# description: A datamodel for representing the results of textual named entity recognition annotation results. This draws upon both SSSOM and https://www.w3.org/TR/annotation-model/
# license: https://creativecommons.org/publicdomain/zero/1.0/

import dataclasses
import re
import sys
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Optional, Union

Expand Down Expand Up @@ -102,6 +100,7 @@ class TextAnnotationConfiguration(YAMLRoot):
sources: Optional[Union[str, List[str]]] = empty_list()
limit: Optional[int] = None
token_exclusion_list: Optional[Union[str, List[str]]] = empty_list()
categories: Optional[Union[str, List[str]]] = empty_list()
model: Optional[str] = None
include_aliases: Optional[Union[bool, Bool]] = None

Expand All @@ -124,6 +123,10 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, str) else str(v) for v in self.token_exclusion_list
]

if not isinstance(self.categories, list):
self.categories = [self.categories] if self.categories is not None else []
self.categories = [v if isinstance(v, str) else str(v) for v in self.categories]

if self.model is not None and not isinstance(self.model, str):
self.model = str(self.model)

Expand Down Expand Up @@ -245,6 +248,7 @@ class TextAnnotation(YAMLRoot):
predicate_id: Optional[str] = None
object_id: Optional[str] = None
object_label: Optional[str] = None
object_categories: Optional[Union[str, List[str]]] = empty_list()
object_source: Optional[str] = None
confidence: Optional[float] = None
match_string: Optional[str] = None
Expand All @@ -269,6 +273,14 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.object_label is not None and not isinstance(self.object_label, str):
self.object_label = str(self.object_label)

if not isinstance(self.object_categories, list):
self.object_categories = (
[self.object_categories] if self.object_categories is not None else []
)
self.object_categories = [
v if isinstance(v, str) else str(v) for v in self.object_categories
]

if self.object_source is not None and not isinstance(self.object_source, str):
self.object_source = str(self.object_source)

Expand Down Expand Up @@ -395,6 +407,15 @@ class slots:
range=Optional[Union[str, List[str]]],
)

slots.textAnnotationConfiguration__categories = Slot(
uri=ANN.categories,
name="textAnnotationConfiguration__categories",
curie=ANN.curie("categories"),
model_uri=ANN.textAnnotationConfiguration__categories,
domain=None,
range=Optional[Union[str, List[str]]],
)

slots.textAnnotationConfiguration__model = Slot(
uri=ANN.model,
name="textAnnotationConfiguration__model",
Expand Down Expand Up @@ -530,6 +551,15 @@ class slots:
range=Optional[str],
)

slots.textAnnotation__object_categories = Slot(
uri=ANN.object_categories,
name="textAnnotation__object_categories",
curie=ANN.curie("object_categories"),
model_uri=ANN.textAnnotation__object_categories,
domain=None,
range=Optional[Union[str, List[str]]],
)

slots.textAnnotation__object_source = Slot(
uri=SSSOM.object_source,
name="textAnnotation__object_source",
Expand Down
18 changes: 18 additions & 0 deletions src/oaklib/datamodels/text_annotator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,33 @@ classes:
description: configuration for search
attributes:
matches_whole_text:
aliases:
- grounding_mode
range: boolean
description: >-
If true, then only grounding is performed, and the entire text is used as the match string.
sources:
multivalued: true
limit:
range: integer
description: >-
The maximum number of annotations to return
token_exclusion_list:
multivalued: true
description: >-
A list of tokens to exclude from the annotation process
categories:
multivalued: true
description: >-
A list of named entity categories to include.
model:
range: string
description: >-
The name of the model to use for annotation. The specifics of this are implementation-dependent.
include_aliases:
range: boolean
description: >-
If true, then the aliases (synonyms) of the matched entity are included in the annotation results.
TextAnnotationResultSet:
description: A collection of annotation results
Expand Down Expand Up @@ -119,6 +135,8 @@ classes:
slot_uri: sssom:object_label
exact_mappings:
- bpa:annotatedClass.prefLabel
object_categories:
multivalued: true
object_source:
slot_uri: sssom:object_source
confidence:
Expand Down
2 changes: 2 additions & 0 deletions src/oaklib/implementations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from oaklib.implementations.funowl.funowl_implementation import FunOwlImplementation
from oaklib.implementations.gilda import GildaImplementation
from oaklib.implementations.kgx.kgx_implementation import KGXImplementation
from oaklib.implementations.llm_implementation import LLMImplementation
from oaklib.implementations.monarch.monarch_implementation import MonarchImplementation
from oaklib.implementations.ols import (
BaseOlsImplementation,
Expand Down Expand Up @@ -87,6 +88,7 @@
"PubMedImplementation",
"FunOwlImplementation",
"GildaImplementation",
"LLMImplementation",
"KGXImplementation",
"UniprotImplementation",
"TranslatorImplementation",
Expand Down
145 changes: 145 additions & 0 deletions src/oaklib/implementations/llm_implementation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""A text annotator based on LLM."""
import json
import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterator, List

from oaklib.datamodels.text_annotator import TextAnnotation, TextAnnotationConfiguration
from oaklib.interfaces import TextAnnotatorInterface
from oaklib.interfaces.text_annotator_interface import TEXT, nen_annotation

if TYPE_CHECKING:
import llm

__all__ = [
"LLMImplementation",
]


@dataclass
class LLMImplementation(TextAnnotatorInterface):
    """Perform named entity recognition (NER) and grounding using an LLM.

    The LLM is prompted to extract candidate terms from the input text; each
    term is then grounded by delegating to a wrapped annotator adapter. Terms
    that fail to ground are retried via LLM-suggested aliases and, optionally,
    by recursively splitting compound terms.
    """

    grounder: TextAnnotatorInterface = None
    """A wrapped annotator used to ground NEs.
    """

    model_id: str = None
    """The ID of the LLM model to use. E.g. gpt-4"""

    model: "llm.Model" = None
    """The LLM model to use."""

    default_model_id: str = "gpt-3.5-turbo"
    # Fallback model when neither the configuration nor model_id names one.

    allow_direct_grounding: bool = False
    """The point of this implementation is to perform NER and delegate to a grounder."""

    max_recursion_depth: int = 0
    # Maximum number of times a compound term may be re-submitted for splitting.

    def __post_init__(self):
        # The adapter slug (e.g. "llm:sqlite:obo:...") names the grounder to wrap.
        slug = self.resource.slug
        if not slug:
            logging.warning("LLM implementation requires a slug for grounding")
        else:
            slug = slug.replace("llm:", "")
            logging.info(f"LLM implementation will use grounder: {slug}")
            from oaklib import get_adapter

            self.grounder = get_adapter(slug)
        if self.model_id is not None:
            # BUG FIX: `llm` is an optional extra and its module-level import is
            # guarded by TYPE_CHECKING, so it must be imported here; the original
            # code raised NameError on this path.
            import llm

            self.model = llm.get_model(self.model_id)

    def annotate_text(
        self, text: TEXT, configuration: TextAnnotationConfiguration = None
    ) -> Iterator[TextAnnotation]:
        """Annotate text, using the LLM for NER and the wrapped grounder for grounding.

        :param text: the text to annotate
        :param configuration: required; controls grounding mode, model, categories
        :raises NotImplementedError: if no configuration is given, or if
            whole-text matching is requested but direct grounding is disabled
        :raises ValueError: if no grounder adapter was configured
        """
        if not configuration:
            raise NotImplementedError("Missing text annotation configuration")
        if self.grounder is None:
            # __post_init__ only logs a warning when no slug is supplied;
            # fail loudly here instead of with an AttributeError downstream.
            raise ValueError("LLM implementation requires a grounder (adapter slug)")
        if configuration.matches_whole_text:
            if not self.allow_direct_grounding:
                raise NotImplementedError("LLM does not support whole-text matching")
            else:
                logging.info("Delegating directly to grounder, bypassing LLM")
                yield from self.grounder.annotate_text(text, configuration)
        else:
            yield from self._llm_annotate(text, configuration)

    def _llm_annotate(
        self,
        text: str,
        configuration: TextAnnotationConfiguration = None,
        depth=0,
    ) -> Iterator[TextAnnotation]:
        """Prompt the LLM for terms, then ground each term via the wrapped grounder.

        :param text: the text to annotate
        :param configuration: annotation configuration (model, categories, sources)
        :param depth: current recursion depth for compound-term splitting
        """
        system_prompt = self._system_prompt(configuration)
        model = self.model
        if not self.model:
            # Resolution order: explicit configuration > instance default > global default.
            model_id = configuration.model or self.model_id
            if not model_id:
                model_id = self.default_model_id
            import llm

            model = llm.get_model(model_id)
        response = model.prompt(text, system=system_prompt)
        logging.info(f"LLM response: {response}")
        # The system prompt requests a JSON list of {"term": ..., "category": ...}
        # objects; a malformed LLM reply raises json.JSONDecodeError here.
        terms = json.loads(response.text())

        grounder_configuration = TextAnnotationConfiguration(matches_whole_text=True)
        # Plain iteration instead of the original while/pop(0) (which was O(n^2)).
        for term_obj in terms:
            term = term_obj["term"]
            category = term_obj["category"]
            ann = TextAnnotation(subject_label=term, object_categories=[category])
            matches = list(self.grounder.annotate_text(term, grounder_configuration))
            if not matches:
                # Retry with LLM-suggested synonyms / normalized forms.
                aliases = self._suggest_aliases(
                    term, model, configuration.categories, configuration
                )
                for alias in aliases:
                    matches = list(
                        self.grounder.annotate_text(alias, grounder_configuration)
                    )
                    if matches:
                        break
                logging.info(f"Aliases={aliases}; matches={matches}")
            if not matches:
                # Last resort: split a compound term by recursing on it.
                if " " in term and depth < self.max_recursion_depth:
                    logging.info(f"Recursing on {term}")
                    anns = list(self._llm_annotate(term, configuration, depth + 1))
                    logging.info(f"Results from recursion: on {term} => {anns}")
                    if any(ann.object_id for ann in anns):
                        for ann in anns:
                            # TODO: offset recursive spans relative to the full text
                            ann.start = None
                            ann.end = None
                            yield ann
                        continue
            if matches:
                # Take the highest-ranked grounding only.
                ann.object_id = matches[0].object_id
                ann.object_label = matches[0].object_label
            else:
                logging.info(f"LLM failed to ground {term} or its aliases")
            if term in text:
                # index() finds the first occurrence only; repeated terms all
                # receive the first occurrence's offsets.
                ann.start = text.index(term)
                ann.end = ann.start + len(term)
            yield ann

    def _system_prompt(self, configuration: TextAnnotationConfiguration = None) -> str:
        """Build the NER system prompt, constrained by configured sources/categories."""
        categories = configuration.categories
        prompt = "Perform named entity recognition on the text, returning a list of terms. "
        prompt += "Terms can be compound containing multiple words. "
        prompt += "Use noun phrases or terms representing entire concepts rather than multiple words. "
        if configuration.sources:
            prompt += f"Include terms that might be found in the following: {configuration.sources}. "
        if categories:
            prompt += f"Include only terms that are of type {categories}. "
        # BUG FIX: the original example had a misplaced quote ({"term:" "term1"}),
        # showing the model malformed JSON.
        prompt += """Return results as a JSON list:
[{"term": "term1", "category": "category1"}, ... ]"""
        return prompt

    def _suggest_aliases(
        self,
        term: str,
        model: "llm.Model" = None,
        categories: List = None,
        configuration: TextAnnotationConfiguration = None,
    ) -> List[str]:
        """Ask the LLM for exact synonyms / normalized forms of an ungrounded term.

        :param term: the term that failed to ground
        :param model: the LLM model to query
        :param categories: restrict suggestions to these named-entity categories
        :param configuration: used for its sources (valid ontologies)
        :return: candidate alias strings, stripped of surrounding whitespace
        """
        logging.info(f"LLM aliasing term: {term}")
        prompt = "List exact synonyms for this term. "
        prompt += "Normalize the string to a form found in an ontology. "
        if configuration.sources:
            prompt += f"Valid ontologies: {configuration.sources}. "
        if categories:
            prompt += f"Valid categories: {categories}. "
        # BUG FIX: the original omitted the trailing space here, fusing two
        # sentences ("...multiple terms.Return as a...") in the prompt, and
        # read "semi-colon separate".
        prompt += "You can split compound concepts into multiple terms. "
        prompt += "Return as a semi-colon separated list of terms. "
        response = model.prompt(term, system=prompt).text()
        logging.info(f"LLM aliases[{term}] => {response}")
        return [x.strip() for x in response.split(";")]

0 comments on commit 19c60e3

Please sign in to comment.