Extending amigo and llm implementations (#714)
* Additional LLM documentation

* Additional LLM implementations

* Adding association_counts

* fixed typo

* Additional test
cmungall authored Mar 14, 2024
1 parent 10ccb21 commit e76ee10
Showing 11 changed files with 1,020 additions and 61 deletions.
2 changes: 2 additions & 0 deletions docs/howtos/use-llms.rst
@@ -1,3 +1,5 @@
+.. _use_llms:
+
 How to use Large Language Models (LLMs) with OAK
 ================================================
17 changes: 16 additions & 1 deletion docs/packages/implementations/llm.rst
@@ -3,6 +3,8 @@
 LLM Adapter
 =============

+See also :ref:`use_llms`
+
 Command Line Examples
 ----------------------

@@ -14,12 +16,25 @@ Use the :code:`llm` selector, wrapping an existing source
 Annotation
 ^^^^^^^^^^

 .. code:: shell

    runoak -i llm:sqlite:obo:hp annotate "abnormalities were found in the eye and the liver"

+Validation of Mappings
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: shell
+
+   runoak -i llm:sqlite:obo:go validate-mappings .desc//p=i "molecular_function"
+
+See:
+
+- `MapperGPT <https://arxiv.org/abs/2310.03666>`_
+- `GO RHEA Analysis <https://github.com/cmungall/rhea-go-llm-analysis>`_
+
 Code
 ----
-.. currentmodule:: oaklib.implementations.llm
+.. currentmodule:: oaklib.implementations.llm.llm_implementation

 .. autoclass:: LLMImplementation
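
An illustrative programmatic counterpart to the CLI examples above (not part of
this commit). A minimal sketch, assuming a local HP sqlite build and an API key
configured for the underlying llm package; annotate_text is the standard oaklib
text-annotator method, but the exact annotations returned by the LLM backend
will vary:

    # hypothetical sketch: wrap a sqlite adapter in the LLM adapter
    from oaklib import get_adapter

    adapter = get_adapter("llm:sqlite:obo:hp")  # assumes an llm API key is configured
    text = "abnormalities were found in the eye and the liver"
    for ann in adapter.annotate_text(text):
        # each annotation carries the matched span and the grounded term
        print(ann.subject_start, ann.subject_end, ann.object_id, ann.object_label)
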
373 changes: 373 additions & 0 deletions notebooks/GO/Term-Usage-Summary.ipynb

Large diffs are not rendered by default.

216 changes: 197 additions & 19 deletions src/oaklib/cli.py
@@ -587,7 +587,9 @@ def _nest_list_of_terms(terms: List[str]) -> Tuple[NESTED_LIST, List[str]]:
     return nested, []


-def curies_from_file(file: IO) -> Iterator[CURIE]:
+def curies_from_file(
+    file: IO, adapter: Optional[BasicOntologyInterface] = None, allow_labels=False, strict=False
+) -> Iterator[CURIE]:
     """
     yield an iterator over CURIEs by parsing a file.
@@ -596,16 +598,29 @@ def curies_from_file(file: IO) -> Iterator[CURIE]:
     is ignored

     :param file:
+    :param adapter: if provided, will be used to resolve CURIEs
+    :param allow_labels: if true, will allow inputs to be labels
+    :param strict: if true, will raise an error if a CURIE cannot be resolved
     :return:
     """
     line_no = 0
+    if allow_labels and not adapter:
+        raise ValueError("Must provide an adapter to resolve labels")
     for line in file.readlines():
         line_no += 1
-        m = re.match(r"^(\S+)", line)
-        curie = m.group(1)
-        if curie == "id" and line_no == 1:
-            continue
-        yield curie
+        if ":" in line or not allow_labels:
+            m = re.match(r"^(\S+)", line)
+            curie = m.group(1)
+            if curie == "id" and line_no == 1:
+                continue
+            yield curie
+        elif allow_labels:
+            candidates = adapter.curies_by_label(line.strip())
+            if strict and len(candidates) != 1:
+                raise ValueError(
+                    f"Could not resolve label {line} to a single CURIE, got {candidates}"
+                )
+            yield from candidates


 def query_terms_iterator(query_terms: NESTED_LIST, impl: BasicOntologyInterface) -> Iterator[CURIE]:
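
As an illustrative aside (not part of the diff): the new allow_labels path
yields any line containing ":" as a CURIE and resolves everything else through
adapter.curies_by_label. A minimal sketch, assuming a local HP sqlite build;
the two-line input buffer is hypothetical:

    from io import StringIO

    from oaklib import get_adapter
    from oaklib.cli import curies_from_file

    adapter = get_adapter("sqlite:obo:hp")  # assumes the HP sqlite build is available
    buf = StringIO("HP:0000118\nPhenotypic abnormality\n")  # hypothetical mixed input
    # line 1 contains ":" and is yielded as-is; line 2 is resolved by label,
    # so both should yield HP:0000118
    print(list(curies_from_file(buf, adapter=adapter, allow_labels=True, strict=True)))
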
@@ -2755,21 +2770,23 @@ def similarity_pair(terms, predicates, autolabel: bool, output: TextIO, output_t
     """
     if len(terms) != 2:
         raise ValueError(f"Need exactly 2 terms: {terms}")
-    subject = terms[0]
-    object = terms[1]
     impl = settings.impl
     writer = _get_writer(output_type, impl, StreamingYamlWriter, datamodels.similarity)
     writer.output = output
-    if isinstance(impl, SemanticSimilarityInterface):
-        actual_predicates = _process_predicates_arg(predicates)
-        sim = impl.pairwise_similarity(subject, object, predicates=actual_predicates)
-        if autolabel:
+    subject = list(query_terms_iterator([terms[0]], impl))[0]
+    object = list(query_terms_iterator([terms[1]], impl))[0]
+    if not isinstance(impl, SemanticSimilarityInterface):
+        raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
+    actual_predicates = _process_predicates_arg(predicates)
+    sim = impl.pairwise_similarity(subject, object, predicates=actual_predicates)
+    if autolabel:
         if not sim.subject_label:
             sim.subject_label = impl.label(sim.subject_id)
         if not sim.object_label:
             sim.object_label = impl.label(sim.object_id)
         if not sim.ancestor_label:
             sim.ancestor_label = impl.label(sim.ancestor_id)
-        writer.emit(sim)
-    else:
-        raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
+    writer.emit(sim)
     writer.finish()
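
For orientation (illustrative, not part of the diff), a programmatic
equivalent of the interface call made above; it assumes a local HP sqlite
build, and the two phenotype IDs are just an example pair. IS_A restricts the
ancestor search to is-a edges:

    from oaklib import get_adapter
    from oaklib.datamodels.vocabulary import IS_A

    adapter = get_adapter("sqlite:obo:hp")  # assumes the HP sqlite build is available
    # e.g. Hearing impairment vs Conductive hearing impairment
    sim = adapter.pairwise_similarity("HP:0000365", "HP:0000405", predicates=[IS_A])
    print(sim.ancestor_id, sim.jaccard_similarity)
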


@@ -4532,6 +4549,145 @@ def associations(
     _apply_changes(impl, changes)


+@main.command()
+@output_option
+@predicates_option
+@autolabel_option
+@output_type_option
+@output_option
+@click.option(
+    "--add-closure-fields/--no-add-closure-fields",
+    default=False,
+    show_default=True,
+    help="Add closure fields to the output",
+)
+@click.option(
+    "--association-predicates",
+    help="A comma-separated list of predicates for the association relation",
+)
+@click.option(
+    "--terms-role",
+    "-Q",
+    type=click.Choice([x.value for x in SubjectOrObjectRole]),
+    default=SubjectOrObjectRole.OBJECT.value,
+    show_default=True,
+    help="How to interpret query terms.",
+)
+@click.option(
+    "--limit",
+    "-L",
+    default=10,
+    show_default=True,
+    help="Limit the number of results",
+)
+@click.option(
+    "--filter",
+    "-F",
+    multiple=True,
+    help="Additional filters in K=V format",
+)
+@click.option(
+    "--min-facet-count",
+    default=1,
+    show_default=True,
+    help="Minimum count for a facet to be included",
+)
+@click.option(
+    "--group-by",
+    default="object",
+    show_default=True,
+    help="Group by subject or object",
+)
+@click.argument("terms", nargs=-1)
+def associations_counts(
+    terms,
+    predicates: str,
+    association_predicates: str,
+    terms_role: str,
+    autolabel: bool,
+    output_type: str,
+    output: str,
+    filter,
+    **kwargs,
+):
+    """
+    Count associations, grouped by subject or object.
+
+    Example:
+
+        runoak -i sqlite:obo:hp -g test.hpoa -G hpoa associations-counts
+
+    This will default to summarizing by objects (HPO terms), showing the number
+    of associations for each term.
+
+    This will be direct counts only. To include the is-a closure, specify
+    the closure predicate(s), e.g.
+
+    Example:
+
+        runoak -i sqlite:obo:hp -g test.hpoa -G hpoa associations-counts -p i
+
+    You can also group by other fields.
+
+    Example:
+
+        runoak -i sqlite:obo:hp -g test.hpoa -G hpoa associations-counts --group-by subject
+
+    This will show the number of associations for each disease.
+
+    OAK also includes a number of specialized adapters that implement this method
+    for particular databases.
+
+    For example, to get the number of IEA associations for each GO term:
+
+        runoak -i amigo: associations-counts --limit -1 -F evidence_type=IEA --no-autolabel
+
+    This can be constrained by species:
+
+        runoak -i amigo:NCBITaxon:9606 associations-counts --limit -1 -F evidence_type=IEA --no-autolabel
+
+    Other options:
+
+    This command accepts many of the same options as the associations command; see
+    the docs for that command for details.
+    """
+    impl = settings.impl
+    writer = _get_writer(output_type, impl, StreamingCsvWriter)
+    writer.autolabel = autolabel
+    writer.output = output
+    actual_predicates = _process_predicates_arg(predicates)
+    actual_association_predicates = _process_predicates_arg(association_predicates)
+    if not isinstance(impl, AssociationProviderInterface):
+        raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
+    curies = list(query_terms_iterator(terms, impl))
+    filter_dict = {k: v for k, v in (x.split("=") for x in filter)}
+    kwargs["property_filter"] = filter_dict
+    qs_it = impl.association_counts(
+        curies,
+        predicates=actual_association_predicates,
+        subject_closure_predicates=actual_predicates,
+        **kwargs,
+    )
+    qo_it = impl.association_counts(
+        objects=curies,
+        predicates=actual_association_predicates,
+        object_closure_predicates=actual_predicates,
+        **kwargs,
+    )
+    if terms_role is None or terms_role == SubjectOrObjectRole.SUBJECT.value:
+        it = qs_it
+    elif terms_role == SubjectOrObjectRole.OBJECT.value:
+        it = qo_it
+    else:
+        logging.info("Using query terms to query both subject and object")
+        it = chain(qs_it, qo_it)
+    for term, count in it:
+        writer.emit(
+            {"term": term, "count": count},
+            label_fields=["term"],
+        )
+
+
 @main.command()
 @output_option
 @predicates_option
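
An illustrative programmatic counterpart to the amigo example in the docstring
above (not part of this commit): a sketch assuming network access to the GOlr
endpoint behind the amigo adapter; the keyword arguments mirror those the
command forwards via **kwargs:

    from oaklib import get_adapter

    adapter = get_adapter("amigo:NCBITaxon:9606")  # assumes GOlr is reachable
    # direct counts grouped by object (GO term), IEA evidence only, mirroring:
    #   runoak -i amigo:NCBITaxon:9606 associations-counts -F evidence_type=IEA
    for term, count in adapter.association_counts(
        property_filter={"evidence_type": "IEA"}, limit=10
    ):
        print(term, count)
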
@@ -4757,6 +4913,11 @@ def apply_labels(group):
     default=False,
     help="If true, filter out redundant terms",
 )
+@click.option(
+    "--allow-labels/--no-allow-labels",
+    default=False,
+    help="If true, allow labels as well as CURIEs in the input files",
+)
 @click.argument("terms", nargs=-1)
 def enrichment(
     terms,
@@ -4769,6 +4930,7 @@ def enrichment(
     sample_file: TextIO,
     background_file: TextIO,
     ontology_only: bool,
+    allow_labels: bool,
     **kwargs,
 ):
     """
@@ -4803,8 +4965,12 @@
     impl = settings.impl
     actual_predicates = _process_predicates_arg(predicates)
     actual_association_predicates = _process_predicates_arg(association_predicates)
-    subjects = list(curies_from_file(sample_file))
-    background = list(curies_from_file(background_file)) if background_file else None
+    subjects = list(curies_from_file(sample_file, adapter=impl, allow_labels=allow_labels))
+    background = (
+        list(curies_from_file(background_file, adapter=impl, allow_labels=allow_labels))
+        if background_file
+        else None
+    )
     if not isinstance(impl, ClassEnrichmentCalculationInterface):
         raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
     if (
@@ -5181,17 +5347,29 @@ def validate_mappings(
         runoak validate-mappings -i db/uberon.db -o bad-mappings.sssom.tsv

     By default this will attempt to download and connect to
-    sqlite versions of different ontologies.
+    sqlite versions of different ontologies, when attempting to resolve a foreign
+    subject or object id.

-    You can customize this:
+    You can customize this mapping:

         runoak validate-mappings -i db/uberon.db --adapter-mapping uberon=db/uberon.db \
           --adapter-mapping zfa=db/zfa.db

     This will use a local sqlite file for ZFA:nnnnnnn IDs.
+
+    You can use "*" as a wildcard, in the case where you have an application ontology
+    with many mapped entities merged in:
+
+        runoak validate-mappings -i db/uberon.db --adapter-mapping "*"=db/merged.db
+
+    The default behavior for this command is to perform deterministic rule-based
+    checks; for example, the mapped IDs should not be obsolete, and if the mapping
+    is skos:exactMatch, then the cardinality is expected to be 1:1.
+
+    Other adapters may choose to implement bespoke behaviors. In future there
+    might be a boomer adapter that will perform probabilistic reasoning on the
+    mappings. The experimental LLM backend will use an LLM to qualitatively
+    validate mappings (see the LLM how-to guide for more details).
     """
     impl = settings.impl
     writer = _get_writer(output_type, impl, StreamingCsvWriter)
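
The commit message mentions an additional test; commands like this are
typically exercised in-process with click's CliRunner. A hypothetical sketch
along those lines, reusing the docstring's example invocation (assumes
db/uberon.db exists locally):

    from click.testing import CliRunner

    from oaklib.cli import main

    runner = CliRunner()
    result = runner.invoke(
        main, ["-i", "db/uberon.db", "validate-mappings", "-o", "bad-mappings.sssom.tsv"]
    )
    print(result.exit_code)
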