diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py
index 8798fc224..bad4a1c77 100644
--- a/src/oaklib/cli.py
+++ b/src/oaklib/cli.py
@@ -36,7 +36,6 @@
 from prefixmaps.io.parser import load_multi_context
 from pydantic import BaseModel
 from sssom.parsers import parse_sssom_table, to_mapping_set_document
-from tornado.gen import multi

 import oaklib.datamodels.taxon_constraints as tcdm
 from oaklib import datamodels
diff --git a/src/oaklib/implementations/cx/cx_implementation.py b/src/oaklib/implementations/cx/cx_implementation.py
index 0518331f9..0baeab0e5 100644
--- a/src/oaklib/implementations/cx/cx_implementation.py
+++ b/src/oaklib/implementations/cx/cx_implementation.py
@@ -73,5 +73,3 @@ def __post_init__(self):
         locator = path
         cx = ndex2.create_nice_cx_from_file(path)
         self.obograph_document = from_cx(cx)
-
-
diff --git a/src/oaklib/implementations/llm_implementation.py b/src/oaklib/implementations/llm_implementation.py
index 411d68080..2e089ab66 100644
--- a/src/oaklib/implementations/llm_implementation.py
+++ b/src/oaklib/implementations/llm_implementation.py
@@ -6,7 +6,7 @@
 import re
 import time
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Tuple, Any
+from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Tuple

 import pystow
 from linkml_runtime.dumpers import yaml_dumper
@@ -236,7 +236,9 @@ def __post_init__(self):
     def _embeddings_collection_name(self) -> str:
         name = self.wrapped_adapter.resource.slug
         if not name:
-            raise ValueError(f"Wrapped adapter must have a slug: {self.wrapped_adapter} // {self.wrapped_adapter.resource}")
+            raise ValueError(
+                f"Wrapped adapter must have a slug: {self.wrapped_adapter} // {self.wrapped_adapter.resource}"
+            )
         return name

     def entities(self, **kwargs) -> Iterator[CURIE]:
@@ -281,7 +283,6 @@ def _parse_response(self, json_str: str) -> Any:
             json_str = json_str[4:].strip()
         return json.loads(json_str)

-
     def get_model(self):
         model = self.model
         if not self.model:
@@ -297,6 +298,7 @@ def get_model(self):
     def _embed_terms(self):
         import llm
         import sqlite_utils
+
         adapter = self.wrapped_adapter
         name = self._embeddings_collection_name
         path_to_db = pystow.join("oaklib", "llm", "embeddings")
@@ -308,14 +310,13 @@ def _embed_terms(self):

     def _term_embedding(self, id: CURIE) -> Optional[tuple]:
         import llm
+
         db = self._embeddings_collection.db
         name = self._embeddings_collection_name
         collection_ids = list(db["collections"].rows_where("name = ?", (name,)))
         collection_id = collection_ids[0]["id"]
         matches = list(
-            db["embeddings"].rows_where(
-                "collection_id = ? and id = ?", (collection_id, id)
-            )
+            db["embeddings"].rows_where("collection_id = ? and id = ?", (collection_id, id))
         )
         if not matches:
             logger.debug(f"ID not found: {id} in {collection_id} ({name})")
@@ -324,18 +325,18 @@ def _term_embedding(self, id: CURIE) -> Optional[tuple]:
         comparison_vector = llm.decode(embedding)
         return comparison_vector

-
     def pairwise_similarity(
-            self,
-            subject: CURIE,
-            object: CURIE,
-            predicates: List[PRED_CURIE] = None,
-            subject_ancestors: List[CURIE] = None,
-            object_ancestors: List[CURIE] = None,
-            min_jaccard_similarity: Optional[float] = None,
-            min_ancestor_information_content: Optional[float] = None,
+        self,
+        subject: CURIE,
+        object: CURIE,
+        predicates: List[PRED_CURIE] = None,
+        subject_ancestors: List[CURIE] = None,
+        object_ancestors: List[CURIE] = None,
+        min_jaccard_similarity: Optional[float] = None,
+        min_ancestor_information_content: Optional[float] = None,
     ) -> Optional[TermPairwiseSimilarity]:
         import llm
+
         self._embed_terms()
         subject_embedding = self._term_embedding(subject)
         if not subject_embedding:
@@ -351,7 +352,9 @@ def pairwise_similarity(
         )
         return sim

-    def _ground_term(self, term: str, categories: Optional[List[str]] = None) -> Optional[Tuple[str, float]]:
+    def _ground_term(
+        self, term: str, categories: Optional[List[str]] = None
+    ) -> Optional[Tuple[str, float]]:
         matches = list(self._match_terms(term))
         system = """
         Given a list of ontology terms, find the one that best matches the given term.
@@ -361,7 +364,7 @@ def _ground_term(self, term: str, categories: Optional[List[str]] = None) -> Opt
         - ANAT:002 pericardium
         Then a valid response is {"id": "ANAT:001", "confidence": 0.8}.
         """
-        prompt = f"Find the best match for the term: \"{term}\".\n"
+        prompt = f'Find the best match for the term: "{term}".\n'
         if categories:
             if len(categories) == 1:
                 prompt += f"Term Category: {categories[0]}.\n"
@@ -401,7 +404,11 @@ def annotate_text(
             grounded, _confidence = self._ground_term(text, configuration.categories)
             logger.info(f"Grounded {text} to {grounded}")
             if grounded:
-                yield TextAnnotation(subject_label=text, object_id=grounded, object_label=self.wrapped_adapter.label(grounded))
+                yield TextAnnotation(
+                    subject_label=text,
+                    object_id=grounded,
+                    object_label=self.wrapped_adapter.label(grounded),
+                )
             return
         else:
             logging.info("Delegating directly to grounder, bypassing LLM")
@@ -495,9 +502,6 @@ def _match_terms(self, text: str) -> Iterator[Tuple[str, float]]:
             logger.debug(f"Similar: {entry}")
             yield entry.id, entry.score

-
-
-
     def _suggest_aliases(
         self,
         term: str,