From 8746e90e1372c3bd611f7300af3e9877340df78f Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 29 Aug 2024 19:49:13 -0700 Subject: [PATCH] Adding a .sample operator (#797) See also #793 --- docs/howtos/use-oak-expression-language.rst | 63 ++++++++++++++++++++- src/oaklib/query.py | 28 +++++---- 2 files changed, 79 insertions(+), 12 deletions(-) diff --git a/docs/howtos/use-oak-expression-language.rst b/docs/howtos/use-oak-expression-language.rst index 161de725c..7a40d8bf0 100644 --- a/docs/howtos/use-oak-expression-language.rst +++ b/docs/howtos/use-oak-expression-language.rst @@ -289,6 +289,24 @@ FILTER The ``.filter`` operator allows you to provide arbitrary python filters. +QUERY +^^^^^ + +The ``.query`` operator allows you to pass through a query to the underlying store (SPARQL, SQL). + +For example, the ``sqlite`` backend uses SQL, so you can pass through SQL: + +.. code-block:: + + runoak -i sqlite:obo:uberon info .query \ + "SELECT subject from has_dbxref_statement where value like 'ZFA:%'" + +This is equivalent to: + +.. code-block:: + + runoak -i sqlite:obo:uberon info x^ZFA: + NR ^^ @@ -307,18 +325,59 @@ Example: runoak -i sqlite:obo:uberon info .mrca//p=i,p .idfile my_terms.txt +RAND +^^^^ + +Pick a random subset of terms. Parameterized by ``n`` (number of terms). + +Definitions for random terms in the Cell Ontology: + +.. code-block:: + + runoak -i sqlite:obo:cl definitions .rand + +For 10 random terms + +.. code-block:: + + runoak -i sqlite:obo:cl definitions .rand//n=10 + +.. note:: + + The ``.rand`` operator will sample from all terms in the ontology. This + could include terms imported and merged from other ontologies. For + finer-grained control, use the ``.sample`` operator, which allows the + combination of a sample operator with the results of evaluating any + OAK expression. + +SAMPLE +^^^^^^ + +The ``.sample`` operator takes a random sample of terms. It is parameterized by ``n`` (number of terms +in sample). + +Definitions for 3 random terms: + +.. code-block:: + + runoak -i sqlite:obo:obi definitions .sample//n=3 i^OBI: + +To compare 3 random terms with 3 other random terms: + +.. code-block:: + + runoak -i sqlite:obo:cl similarity .sample//n=3 i^CL: @ .sample//n=3 i^CL: + Others ^^^^^^ * ``.is_obsolete``: all :term:`Obsolete` terms * ``.non_obsolete``: all non-obsoletes * ``.dangling``: all :term:`Dangling` terms -* ``.query``: pass through a query to the underlying store (SPARQL, SQL) * ``.child``: non-transitive version of ``.desc``. Also parameterized by predicate. * ``.parent``: non-transitive version of ``.anc``. Also parameterized by predicate. * ``.sib``: all siblings of a term. Also parameterized by predicate. * ``.all``: all terms * ``.classes``: all classes * ``.relations``: all relations -* ``.rand``: random subset of terms. Parameters: ``n`` (number of terms) diff --git a/src/oaklib/query.py b/src/oaklib/query.py index 8506566d0..66c286273 100644 --- a/src/oaklib/query.py +++ b/src/oaklib/query.py @@ -595,10 +595,20 @@ def chain_results(v): logging.debug(f"Random: {term}") params = _parse_params(term) sample_size = params.get("n", "100") - entities = list(adapter.entities()) - sample = [ - entities[secrets.randbelow(len(entities))] for x in range(1, int(sample_size)) - ] + entities = set(adapter.entities()) + sample = secrets.SystemRandom().sample(entities, int(sample_size)) + chain_results(sample) + elif term.startswith(".sample"): + logging.debug(f"Sampling: {term}") + params = _parse_params(term) + sample_size = params.get("n", "100") + rest = list(query_terms_iterator([query_terms[0]], adapter)) + query_terms = query_terms[1:] + try: + sample = secrets.SystemRandom().sample(rest, int(sample_size)) + except ValueError as e: + logging.error(f"Error sampling {sample_size} / {len(rest)}: {e}") + raise e chain_results(sample) elif term.startswith(".in"): logging.debug(f"IN: {term}") @@ -648,19 +658,17 @@ def chain_results(v): this_predicates = params.get("predicates", predicates) rest = list(query_terms_iterator([query_terms[0]], adapter)) query_terms = query_terms[1:] - if isinstance(adapter, OboGraphInterface): - chain_results(adapter.descendants(rest, predicates=this_predicates)) - else: + if not isinstance(adapter, OboGraphInterface): raise NotImplementedError + chain_results(adapter.descendants(rest, predicates=this_predicates)) elif term.startswith(".sub"): logging.debug(f"Subclasses: {term}") # graph query: is-a descendants rest = list(query_terms_iterator([query_terms[0]], adapter)) query_terms = query_terms[1:] - if isinstance(adapter, OboGraphInterface): - chain_results(adapter.descendants(rest, predicates=[IS_A])) - else: + if not isinstance(adapter, OboGraphInterface): raise NotImplementedError + chain_results(adapter.descendants(rest, predicates=[IS_A])) elif term.startswith(".child"): logging.debug(f"Children: {term}") # graph query: children