Merge pull request #186 from haohangyan/queries

Endpoint Queries
gyorilab · Dec 2, 2024 · eb23c31 · eb23c31
2 parents 29a37fc + 2644028
commit eb23c31
Show file tree

Hide file tree

Showing 4 changed files with 236 additions and 23 deletions.
diff --git a/src/indra_cogex/apps/queries_web/__init__.py b/src/indra_cogex/apps/queries_web/__init__.py
@@ -63,12 +63,21 @@ def get_example_data():
     "parent": fields.List(fields.String, example=["MESH", "D007855"]),
     "mesh_term": fields.List(fields.String, example=["MESH", "D015002"]),
     "pmid_term": fields.List(fields.String, example=["PUBMED", "34634383"]),
-    "paper_term": fields.List(fields.String, example=["PUBMED", "34634383"]),
+    "paper_term": fields.List(fields.String, example=["PUBMED", "23356518"]),
     "pmids": fields.List(fields.String, example=["20861832", "19503834"]),
     "include_child_terms": fields.Boolean(example=True),
     # NOTE: statement hashes are too large to be int for JavaScript
     "stmt_hash": fields.String(example="12198579805553967"),
     "stmt_hashes": fields.List(fields.String, example=["12198579805553967", "30651649296901235"]),
+    "rel_type": fields.String(example="Phosphorylation"),
+    "rel_types": fields.List(fields.String, example=["Phosphorylation", "Activation"]),
+    "agent_name": fields.String(example="MEK"),
+    "agent": fields.String(example="MEK"),
+    "other_agent": fields.String(example="ERK"),
+    "agent_role": fields.String(example="Subject"),
+    "other_role" : fields.String(example="Object"),
+    "stmt_source": fields.String(example="reach"),
+    "stmt_sources": fields.List(fields.String, example=["reach", "sparser"]),
     "cell_line": fields.List(fields.String, example=["CCLE", "BT20_BREAST"]),
     "target": fields.List(fields.String, example=["HGNC", "6840"]),
     "targets": fields.List(
@@ -115,6 +124,7 @@ def get_example_data():
     "get_stmts_for_stmt_hashes": {"return_evidence_counts", "evidence_map"},
     "get_evidences_for_stmt_hash": {"remove_medscan"},
     "get_evidences_for_stmt_hashes": {"remove_medscan"},
+    "get_statements": {"mesh_term","include_child_terms"}
 }
 
 # This is the list of functions to be included

diff --git a/src/indra_cogex/client/queries.py b/src/indra_cogex/client/queries.py
@@ -9,7 +9,7 @@
 from indra.statements import Agent, Evidence, Statement
 
 from .neo4j_client import Neo4jClient, autoclient
-from ..representation import Node, Relation, indra_stmts_from_relations, norm_id
+from ..representation import Node, Relation, indra_stmts_from_relations, norm_id, generate_paper_clause
 
 logger = logging.getLogger(__name__)
 
@@ -45,6 +45,7 @@
     "get_stmts_for_mesh",
     "get_stmts_meta_for_stmt_hashes",
     "get_stmts_for_stmt_hashes",
+    "get_statements",
     "is_gene_mutated",
     "get_mutated_genes",
     "get_drugs_for_target",
@@ -780,7 +781,7 @@ def get_evidences_for_mesh(
         match_terms = {norm_mesh} | child_terms
         where_clause = "WHERE b.id IN $mesh_terms"
         single_mesh_match = ""
-        query_params["mesh_terms"] = match_terms
+        query_params["mesh_terms"] = list(match_terms)
     else:
         single_mesh_match = ' {id: $mesh_id}'
         where_clause = ""
@@ -929,30 +930,13 @@ def get_stmts_for_paper(
     # Todo: Add filters: e.g. belief cutoff, sources, db supported only,
     #  stmt type
 
-    if paper_term[0].lower() in {"pmid", "pubmed"}:
-        parameter = norm_id(*paper_term)
-        publication_props = "{id: $parameter}"
-
-    elif paper_term[0].lower() == "doi":
-        parameter = paper_term[1]
-        publication_props = "{doi: $parameter}"
-
-    elif paper_term[0].lower() in {"pmc", "pmcid"}:
-        parameter = paper_term[1]
-        publication_props = "{pmcid: $parameter}"
-
-    elif paper_term[0].lower() == "trid":
-        parameter = paper_term[1]
-        publication_props = "{trid: $parameter}"
-
-    else:
-        raise ValueError(f"Invalid prefix for publication lookup: {paper_term[0]}")
+    parameter, publication_props = generate_paper_clause(paper_term)
 
     hash_query = f"""\
         MATCH (e:Evidence)-[:has_citation]->(:Publication {publication_props})
         RETURN e.stmt_hash, e.evidence
     """
-    result = client.query_tx(hash_query, parameter=parameter)
+    result = client.query_tx(hash_query, paper_parameter=parameter)
     return _stmts_from_results(client=client, result=result, **kwargs)
 
 
@@ -1140,6 +1124,173 @@ def get_stmts_for_stmt_hashes(
     }
     return rv, evidence_counts
 
+@autoclient()
+def get_statements(
+        agent: Union[str, Tuple[str, str]],
+        *,
+        client: Neo4jClient,
+        rel_types: Optional[Union[str, List[str]]] = None,
+        stmt_sources: Optional[Union[str, List[str]]] = None,
+        agent_role: Optional[str] = None,
+        other_agent: Optional[Union[str, Tuple[str, str]]] = None,
+        other_role: Optional[str] = None,
+        paper_term: Optional[Tuple[str, str]] = None,
+        mesh_term: Optional[Tuple[str, str]] = None,
+        include_child_terms: Optional[bool] = True,
+        limit: Optional[int] = 10,
+        evidence_limit: Optional[int] = None,
+        return_evidence_counts: bool = False,
+) -> Union[List[Statement], Tuple[List[Statement], Mapping[int, int]]]:
+    """Return statements involving an agent, subject to optional filters.
+
+    Parameters
+    ----------
+    agent : Union[str, Tuple[str, str]]
+        The primary agent involved in the interaction. Can be specified as
+        a name (e.g., "EGFR") or as a CURIE tuple (namespace, ID), such as
+        ("MESH", "D051379"). A two-element list is treated like a tuple.
+    client : Neo4jClient
+        The Neo4j client used for executing the query.
+    rel_types : Optional[Union[str, List[str]]], default: None
+        The relationship type(s) to filter by, e.g., "Phosphorylation" or
+        ["Phosphorylation", "Activation"].
+    stmt_sources : Optional[Union[str, List[str]]], default: None
+        The source(s) to filter by, e.g., "reach" or ["reach", "sparser"].
+    agent_role : Optional[str], default: None
+        The role of agent in the interaction: either "subject", "object"
+        (compared case-insensitively), or None.
+    other_agent : Optional[Union[str, Tuple[str, str]]], default: None
+        A secondary agent in the interaction, specified either as a name
+        or CURIE tuple.
+    other_role : Optional[str], default: None
+        The role of other_agent in the interaction: either "subject",
+        "object" (compared case-insensitively), or None. It determines the
+        direction only when agent_role does not; if neither role is a
+        recognized value the search is undirected.
+    paper_term : Optional[Tuple[str, str]], default: None
+        The paper filter. Can be a PubMed ID, PMC id, TRID, or DOI.
+    mesh_term : Optional[Tuple[str, str]], default: None
+        The mesh_term filter for evidences.
+    include_child_terms : Optional[bool], default: True
+        If True, also match against the child MESH terms of the given
+        MESH term.
+    limit : Optional[int], default: 10
+        The maximum number of statements to return. If None, no LIMIT is
+        added to the query.
+    evidence_limit : Optional[int], default: None
+        The optional maximum number of evidence entries to retrieve per
+        statement.
+    return_evidence_counts : bool, default: False
+        Whether to include a mapping of statement hash to evidence count
+        in the results.
+
+    Returns
+    -------
+    Union[List[Statement], Tuple[List[Statement], Mapping[int, int]]]
+        The statements matching the constraints. If return_evidence_counts
+        is True, a tuple of the statements and a mapping from statement
+        hash to evidence count (capped at evidence_limit when it is given).
+    """
+    where_clauses = []
+    params = {}
+
+    # Evidence-level filters: anchor the relation to evidences cited by a
+    # given paper and/or by publications annotated with given MeSH terms.
+    evidence_match = ""
+    hash_in_rel = ""
+    if paper_term or mesh_term:
+        if paper_term:
+            paper_param, paper_clause = generate_paper_clause(paper_term)
+            params["paper_parameter"] = paper_param
+            publication = f"(pub:Publication {paper_clause})"
+        else:
+            # No property map here; previously the literal text "None"
+            # was interpolated into the pattern.
+            publication = "(pub:Publication)"
+        evidence_match = f"MATCH (e:Evidence)-[:has_citation]->{publication}"
+        if mesh_term:
+            # Only require a MeSH annotation when a MeSH filter is asked
+            # for, so paper-only queries are not restricted to annotated
+            # publications.
+            evidence_match += "-[:annotated_with]->(mesh_term:BioEntity)"
+            mesh_ids = {norm_id(*mesh_term)}
+            if include_child_terms:
+                child_terms = _get_mesh_child_terms(mesh_term, client=client)
+                mesh_ids |= set(child_terms or ())
+            params["mesh_terms"] = list(mesh_ids)
+            where_clauses.append("mesh_term.id IN $mesh_terms")
+        # Trailing space keeps this clause separated from the next MATCH.
+        evidence_match += " "
+        hash_in_rel = "{stmt_hash: e.stmt_hash}"
+
+    # The agent is matched by id when given as a CURIE, otherwise by name.
+    if isinstance(agent, (tuple, list)):
+        params["agent_constraint"] = norm_id(*agent)
+        agent_node = "(a:BioEntity {id: $agent_constraint})"
+    else:
+        params["agent_constraint"] = agent
+        agent_node = "(a:BioEntity {name: $agent_constraint})"
+
+    if not other_agent:
+        other_node = "(b:BioEntity)"
+    elif isinstance(other_agent, (tuple, list)):
+        params["other_agent_constraint"] = norm_id(*other_agent)
+        other_node = "(b:BioEntity {id: $other_agent_constraint})"
+    else:
+        params["other_agent_constraint"] = other_agent
+        other_node = "(b:BioEntity {name: $other_agent_constraint})"
+
+    # Direction of the relation. agent_role takes precedence; other_role is
+    # consulted only when agent_role is not a recognized value.
+    agent_role = agent_role.lower() if agent_role else None
+    other_role = other_role.lower() if other_role else None
+    if agent_role not in ("subject", "object"):
+        if other_role == "object":
+            agent_role = "subject"
+        elif other_role == "subject":
+            agent_role = "object"
+
+    rel_pattern = f"[r:indra_rel {hash_in_rel}]"
+    if agent_role == "subject":
+        match_clause = f"{agent_node}-{rel_pattern}->{other_node}"
+    elif agent_role == "object":
+        match_clause = f"{other_node}-{rel_pattern}->{agent_node}"
+    else:
+        match_clause = f"{agent_node}-{rel_pattern}-{other_node}"
+
+    if rel_types:
+        if isinstance(rel_types, str):
+            rel_types = [rel_types]
+        params["rel_types"] = list(rel_types)
+        where_clauses.append("r.stmt_type IN $rel_types")
+
+    if stmt_sources:
+        if isinstance(stmt_sources, str):
+            stmt_sources = [stmt_sources]
+        params["stmt_sources"] = list(stmt_sources)
+        where_clauses.append(
+            "any(source IN $stmt_sources WHERE r.source_counts CONTAINS source)"
+        )
+
+    if where_clauses:
+        match_clause += " WHERE " + " AND ".join(where_clauses)
+
+    query = (
+        f"{evidence_match}MATCH p = {match_clause} "
+        "WITH distinct r.stmt_hash AS hash, collect(p) as pp RETURN pp"
+    )
+    # Only emit LIMIT when a limit is given, since limit may be None.
+    if limit is not None:
+        query += " LIMIT $limit"
+        params["limit"] = limit
+
+    logger.info(
+        "Running query with constraints: rel_type=%s, source=%s, agent=%s, "
+        "other_agent=%s, agent_role=%s, other_role=%s, limit=%s",
+        rel_types, stmt_sources, agent, other_agent, agent_role, other_role,
+        limit,
+    )
+    logger.info(query)
+    rows = client.query_tx(query, **params)
+    # Each row holds the list of paths collected for one statement hash;
+    # the first path of each list is converted into a relation.
+    flattened_rels = [
+        client.neo4j_to_relation(paths[0]) for row in rows for paths in row
+    ]
+    stmts = indra_stmts_from_relations(flattened_rels, deduplicate=True)
+
+    # Pair relations with the statements built from them before enrichment,
+    # so the counts do not depend on the ordering of enrich_statements.
+    evidence_counts = {}
+    if return_evidence_counts:
+        for rel, stmt in zip(flattened_rels, stmts):
+            count = rel.data["evidence_count"]
+            if evidence_limit is not None:
+                count = min(count, evidence_limit)
+            evidence_counts[stmt.get_hash()] = count
+
+    if evidence_limit and evidence_limit > 1:
+        stmts = enrich_statements(
+            stmts,
+            client=client,
+            evidence_limit=evidence_limit,
+        )
+
+    if not return_evidence_counts:
+        return stmts
+
+    return stmts, evidence_counts
 
 @autoclient()
 def enrich_statements(

diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py
@@ -3,7 +3,7 @@
 """Representations for nodes and relations to upload to Neo4j."""
 
 
-__all__ = ["Node", "Relation", "indra_stmts_from_relations", "norm_id"]
+__all__ = ["Node", "Relation", "indra_stmts_from_relations", "norm_id", "generate_paper_clause"]
 
 import codecs
 from typing import (
@@ -505,3 +505,31 @@ def indra_stmts_from_relations(rels: Iterable[Relation],
         # We do it this way to not change the order of the statements
         stmts = list({stmt.get_hash(): stmt for stmt in stmts}.values())
     return stmts
+
+
+def generate_paper_clause(paper_term: Tuple[str, str]):
+    """Return the query parameter and Cypher property map for a paper lookup.
+
+    Parameters
+    ----------
+    paper_term :
+        A (prefix, identifier) pair. Can be a PubMed ID, PMC id, TRID, or
+        DOI; the prefix is compared case-insensitively.
+
+    Returns
+    -------
+    :
+        A tuple of the value to bind to ``$paper_parameter`` and the
+        property map string to place inside a ``Publication`` node pattern.
+        For PubMed terms the value is ``norm_id(*paper_term)``; for all
+        other prefixes it is the identifier as given.
+
+    Raises
+    ------
+    ValueError
+        If the prefix is not one of pmid, pubmed, doi, pmc, pmcid or trid.
+    """
+    prefix = paper_term[0].lower()
+
+    # PubMed identifiers are the only ones passed through norm_id.
+    if prefix in {"pmid", "pubmed"}:
+        return norm_id(*paper_term), "{id: $paper_parameter}"
+
+    # The remaining prefixes map straight onto a Publication property.
+    props_by_prefix = {
+        "doi": "{doi: $paper_parameter}",
+        "pmc": "{pmcid: $paper_parameter}",
+        "pmcid": "{pmcid: $paper_parameter}",
+        "trid": "{trid: $paper_parameter}",
+    }
+    publication_props = props_by_prefix.get(prefix)
+    if publication_props is None:
+        raise ValueError(
+            f"Invalid prefix for publication lookup: {paper_term[0]}")
+
+    return paper_term[1], publication_props
diff --git a/tests/test_queries.py b/tests/test_queries.py
@@ -368,6 +368,30 @@ def test_get_stmts_by_hashes():
     assert stmts
     assert isinstance(stmts[0], Statement)
 
+@pytest.mark.nonpublic
+def test_get_statements():
+    """Query with every filter set and check statements and evidence counts."""
+    max_evidence = 500
+    client = _get_client()
+    query_kwargs = dict(
+        agent="MEK",
+        other_agent="ERK",
+        agent_role='subject',
+        other_role='object',
+        rel_types=["Phosphorylation", "Activation"],
+        stmt_sources='reach',
+        mesh_term=("MESH", 'D000818'),
+        paper_term=('pubmed', '23356518'),
+        limit=1000,
+        evidence_limit=max_evidence,
+        return_evidence_counts=True,
+    )
+    stmts, evidence_map = get_statements(client=client, **query_kwargs)
+
+    # Statements come back non-empty and correctly typed
+    assert stmts
+    assert all(isinstance(stmt, Statement) for stmt in stmts)
+
+    # The evidence map covers exactly the returned statements
+    assert evidence_map
+    assert {stmt.get_hash() for stmt in stmts} == evidence_map.keys()
+
+    # No count exceeds the requested evidence limit
+    assert all(count <= max_evidence for count in evidence_map.values())
 
 @pytest.mark.nonpublic
 def test_is_gene_mutated():