Skip to content

Commit

Permalink
Adding llm as an extra, adding llm implementation for annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall committed Sep 22, 2023
1 parent 454360c commit 19c60e3
Show file tree
Hide file tree
Showing 6 changed files with 1,235 additions and 480 deletions.
1,508 changes: 1,033 additions & 475 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ kgcl-schema = "0.6.0"
funowl = ">=0.2.0"
gilda = {version = ">=1.0.0", optional = true}
kgcl-rdflib = "0.5.0"
llm = {version = "*", optional = true}
pystow = ">=0.5.0"
class-resolver = ">=0.4.2"
ontoportal-client = ">=0.0.3"
Expand Down Expand Up @@ -68,6 +69,7 @@ boomerang = "oaklib.utilities.mapping.boomer_utils:main"
[tool.poetry.extras]
docs = ["Sphinx", "sphinx-rtd-theme", "sphinxcontrib-mermaid", "sphinx-copybutton"]
gilda = ["scipy", "gilda", "urllib3"]
llm = ["llm"]
seaborn = ["seaborn"]

[tool.black]
Expand Down
40 changes: 35 additions & 5 deletions src/oaklib/datamodels/text_annotator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
# Auto generated from text_annotator.yaml by pythongen.py version: 0.9.0
# Generation date: 2023-04-09T15:53:59
# Auto generated from text_annotator.yaml by pythongen.py version: 0.0.1
# Generation date: 2023-09-16T18:49:46
# Schema: text-annotator
#
# id: https://w3id.org/oak/text_annotator
# description: A datamodel for representing the results of textual named entity recognition annotation results.
# This draws upon both SSSOM and https://www.w3.org/TR/annotation-model/
# description: A datamodel for representing the results of textual named entity recognition annotation results. This draws upon both SSSOM and https://www.w3.org/TR/annotation-model/
# license: https://creativecommons.org/publicdomain/zero/1.0/

import dataclasses
import re
import sys
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Optional, Union

Expand Down Expand Up @@ -102,6 +100,7 @@ class TextAnnotationConfiguration(YAMLRoot):
sources: Optional[Union[str, List[str]]] = empty_list()
limit: Optional[int] = None
token_exclusion_list: Optional[Union[str, List[str]]] = empty_list()
categories: Optional[Union[str, List[str]]] = empty_list()
model: Optional[str] = None
include_aliases: Optional[Union[bool, Bool]] = None

Expand All @@ -124,6 +123,10 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, str) else str(v) for v in self.token_exclusion_list
]

if not isinstance(self.categories, list):
self.categories = [self.categories] if self.categories is not None else []
self.categories = [v if isinstance(v, str) else str(v) for v in self.categories]

if self.model is not None and not isinstance(self.model, str):
self.model = str(self.model)

Expand Down Expand Up @@ -245,6 +248,7 @@ class TextAnnotation(YAMLRoot):
predicate_id: Optional[str] = None
object_id: Optional[str] = None
object_label: Optional[str] = None
object_categories: Optional[Union[str, List[str]]] = empty_list()
object_source: Optional[str] = None
confidence: Optional[float] = None
match_string: Optional[str] = None
Expand All @@ -269,6 +273,14 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.object_label is not None and not isinstance(self.object_label, str):
self.object_label = str(self.object_label)

if not isinstance(self.object_categories, list):
self.object_categories = (
[self.object_categories] if self.object_categories is not None else []
)
self.object_categories = [
v if isinstance(v, str) else str(v) for v in self.object_categories
]

if self.object_source is not None and not isinstance(self.object_source, str):
self.object_source = str(self.object_source)

Expand Down Expand Up @@ -395,6 +407,15 @@ class slots:
range=Optional[Union[str, List[str]]],
)

slots.textAnnotationConfiguration__categories = Slot(
uri=ANN.categories,
name="textAnnotationConfiguration__categories",
curie=ANN.curie("categories"),
model_uri=ANN.textAnnotationConfiguration__categories,
domain=None,
range=Optional[Union[str, List[str]]],
)

slots.textAnnotationConfiguration__model = Slot(
uri=ANN.model,
name="textAnnotationConfiguration__model",
Expand Down Expand Up @@ -530,6 +551,15 @@ class slots:
range=Optional[str],
)

slots.textAnnotation__object_categories = Slot(
uri=ANN.object_categories,
name="textAnnotation__object_categories",
curie=ANN.curie("object_categories"),
model_uri=ANN.textAnnotation__object_categories,
domain=None,
range=Optional[Union[str, List[str]]],
)

slots.textAnnotation__object_source = Slot(
uri=SSSOM.object_source,
name="textAnnotation__object_source",
Expand Down
18 changes: 18 additions & 0 deletions src/oaklib/datamodels/text_annotator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,33 @@ classes:
description: configuration for search
attributes:
matches_whole_text:
aliases:
- grounding_mode
range: boolean
description: >-
If true, then only grounding is performed, and the entire text is used as the match string.
sources:
multivalued: true
limit:
range: integer
description: >-
The maximum number of annotations to return
token_exclusion_list:
multivalued: true
description: >-
A list of tokens to exclude from the annotation process
categories:
multivalued: true
description: >-
A list of named entity categories to include.
model:
range: string
description: >-
The name of the model to use for annotation. The specifics of this are implementation-dependent.
include_aliases:
range: boolean
description: >-
If true, then the aliases (synonyms) of the matched entity are included in the annotation results.
TextAnnotationResultSet:
description: A collection of annotation results
Expand Down Expand Up @@ -119,6 +135,8 @@ classes:
slot_uri: sssom:object_label
exact_mappings:
- bpa:annotatedClass.prefLabel
object_categories:
multivalued: true
object_source:
slot_uri: sssom:object_source
confidence:
Expand Down
2 changes: 2 additions & 0 deletions src/oaklib/implementations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from oaklib.implementations.funowl.funowl_implementation import FunOwlImplementation
from oaklib.implementations.gilda import GildaImplementation
from oaklib.implementations.kgx.kgx_implementation import KGXImplementation
from oaklib.implementations.llm_implementation import LLMImplementation
from oaklib.implementations.monarch.monarch_implementation import MonarchImplementation
from oaklib.implementations.ols import (
BaseOlsImplementation,
Expand Down Expand Up @@ -87,6 +88,7 @@
"PubMedImplementation",
"FunOwlImplementation",
"GildaImplementation",
"LLMImplementation",
"KGXImplementation",
"UniprotImplementation",
"TranslatorImplementation",
Expand Down
145 changes: 145 additions & 0 deletions src/oaklib/implementations/llm_implementation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""A text annotator based on LLM."""
import json
import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterator, List

from oaklib.datamodels.text_annotator import TextAnnotation, TextAnnotationConfiguration
from oaklib.interfaces import TextAnnotatorInterface
from oaklib.interfaces.text_annotator_interface import TEXT, nen_annotation

if TYPE_CHECKING:
import llm

__all__ = [
"LLMImplementation",
]


@dataclass
class LLMImplementation(TextAnnotatorInterface):
    """Perform named entity recognition (NER) and grounding using an LLM.

    The LLM is prompted to extract candidate terms from the input text; each
    term is then grounded by delegating to a wrapped annotator adapter. Terms
    that fail to ground are retried via LLM-suggested aliases and, optionally,
    by recursively splitting compound terms.
    """

    grounder: TextAnnotatorInterface = None
    """A wrapped annotator used to ground NEs.
    """

    model_id: str = None
    """The ID of the LLM model to use. E.g. gpt-4"""

    model: "llm.Model" = None
    """The LLM model to use."""

    default_model_id: str = "gpt-3.5-turbo"
    # Fallback model when neither the configuration nor model_id names one.

    allow_direct_grounding: bool = False
    """The point of this implementation is to perform NER and delegate to a grounder."""

    max_recursion_depth: int = 0
    # Maximum number of times a compound term may be re-submitted for splitting.

    def __post_init__(self):
        # The adapter slug (e.g. "llm:sqlite:obo:...") names the grounder to wrap.
        slug = self.resource.slug
        if not slug:
            logging.warning("LLM implementation requires a slug for grounding")
        else:
            slug = slug.replace("llm:", "")
            logging.info(f"LLM implementation will use grounder: {slug}")
            from oaklib import get_adapter

            self.grounder = get_adapter(slug)
        if self.model_id is not None:
            # BUG FIX: `llm` is an optional extra and its module-level import is
            # guarded by TYPE_CHECKING, so it must be imported here; the original
            # code raised NameError on this path.
            import llm

            self.model = llm.get_model(self.model_id)

    def annotate_text(
        self, text: TEXT, configuration: TextAnnotationConfiguration = None
    ) -> Iterator[TextAnnotation]:
        """Annotate text, using the LLM for NER and the wrapped grounder for grounding.

        :param text: the text to annotate
        :param configuration: required; controls grounding mode, model, categories
        :raises NotImplementedError: if no configuration is given, or if
            whole-text matching is requested but direct grounding is disabled
        :raises ValueError: if no grounder adapter was configured
        """
        if not configuration:
            raise NotImplementedError("Missing text annotation configuration")
        if self.grounder is None:
            # __post_init__ only logs a warning when no slug is supplied;
            # fail loudly here instead of with an AttributeError downstream.
            raise ValueError("LLM implementation requires a grounder (adapter slug)")
        if configuration.matches_whole_text:
            if not self.allow_direct_grounding:
                raise NotImplementedError("LLM does not support whole-text matching")
            else:
                logging.info("Delegating directly to grounder, bypassing LLM")
                yield from self.grounder.annotate_text(text, configuration)
        else:
            yield from self._llm_annotate(text, configuration)

    def _llm_annotate(
        self,
        text: str,
        configuration: TextAnnotationConfiguration = None,
        depth=0,
    ) -> Iterator[TextAnnotation]:
        """Prompt the LLM for terms, then ground each term via the wrapped grounder.

        :param text: the text to annotate
        :param configuration: annotation configuration (model, categories, sources)
        :param depth: current recursion depth for compound-term splitting
        """
        system_prompt = self._system_prompt(configuration)
        model = self.model
        if not self.model:
            # Resolution order: explicit configuration > instance default > global default.
            model_id = configuration.model or self.model_id
            if not model_id:
                model_id = self.default_model_id
            import llm

            model = llm.get_model(model_id)
        response = model.prompt(text, system=system_prompt)
        logging.info(f"LLM response: {response}")
        # The system prompt requests a JSON list of {"term": ..., "category": ...}
        # objects; a malformed LLM reply raises json.JSONDecodeError here.
        terms = json.loads(response.text())

        grounder_configuration = TextAnnotationConfiguration(matches_whole_text=True)
        # Plain iteration instead of the original while/pop(0) (which was O(n^2)).
        for term_obj in terms:
            term = term_obj["term"]
            category = term_obj["category"]
            ann = TextAnnotation(subject_label=term, object_categories=[category])
            matches = list(self.grounder.annotate_text(term, grounder_configuration))
            if not matches:
                # Retry with LLM-suggested synonyms / normalized forms.
                aliases = self._suggest_aliases(
                    term, model, configuration.categories, configuration
                )
                for alias in aliases:
                    matches = list(
                        self.grounder.annotate_text(alias, grounder_configuration)
                    )
                    if matches:
                        break
                logging.info(f"Aliases={aliases}; matches={matches}")
            if not matches:
                # Last resort: split a compound term by recursing on it.
                if " " in term and depth < self.max_recursion_depth:
                    logging.info(f"Recursing on {term}")
                    anns = list(self._llm_annotate(term, configuration, depth + 1))
                    logging.info(f"Results from recursion: on {term} => {anns}")
                    if any(ann.object_id for ann in anns):
                        for ann in anns:
                            # TODO: offset recursive spans relative to the full text
                            ann.start = None
                            ann.end = None
                            yield ann
                        continue
            if matches:
                # Take the highest-ranked grounding only.
                ann.object_id = matches[0].object_id
                ann.object_label = matches[0].object_label
            else:
                logging.info(f"LLM failed to ground {term} or its aliases")
            if term in text:
                # index() finds the first occurrence only; repeated terms all
                # receive the first occurrence's offsets.
                ann.start = text.index(term)
                ann.end = ann.start + len(term)
            yield ann

    def _system_prompt(self, configuration: TextAnnotationConfiguration = None) -> str:
        """Build the NER system prompt, constrained by configured sources/categories."""
        categories = configuration.categories
        prompt = "Perform named entity recognition on the text, returning a list of terms. "
        prompt += "Terms can be compound containing multiple words. "
        prompt += "Use noun phrases or terms representing entire concepts rather than multiple words. "
        if configuration.sources:
            prompt += f"Include terms that might be found in the following: {configuration.sources}. "
        if categories:
            prompt += f"Include only terms that are of type {categories}. "
        # BUG FIX: the original example had a misplaced quote ({"term:" "term1"}),
        # showing the model malformed JSON.
        prompt += """Return results as a JSON list:
[{"term": "term1", "category": "category1"}, ... ]"""
        return prompt

    def _suggest_aliases(
        self,
        term: str,
        model: "llm.Model" = None,
        categories: List = None,
        configuration: TextAnnotationConfiguration = None,
    ) -> List[str]:
        """Ask the LLM for exact synonyms / normalized forms of an ungrounded term.

        :param term: the term that failed to ground
        :param model: the LLM model to query
        :param categories: restrict suggestions to these named-entity categories
        :param configuration: used for its sources (valid ontologies)
        :return: candidate alias strings, stripped of surrounding whitespace
        """
        logging.info(f"LLM aliasing term: {term}")
        prompt = "List exact synonyms for this term. "
        prompt += "Normalize the string to a form found in an ontology. "
        if configuration.sources:
            prompt += f"Valid ontologies: {configuration.sources}. "
        if categories:
            prompt += f"Valid categories: {categories}. "
        # BUG FIX: the original omitted the trailing space here, fusing two
        # sentences ("...multiple terms.Return as a...") in the prompt, and
        # read "semi-colon separate".
        prompt += "You can split compound concepts into multiple terms. "
        prompt += "Return as a semi-colon separated list of terms. "
        response = model.prompt(term, system=prompt).text()
        logging.info(f"LLM aliases[{term}] => {response}")
        return [x.strip() for x in response.split(";")]

0 comments on commit 19c60e3

Please sign in to comment.