Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Endpoint Queries #186

Merged
merged 22 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
169d2b6
REST API for getting statements
haohangyan Oct 28, 2024
ad8b71b
More parameters in examples_dict
haohangyan Oct 28, 2024
dfd6304
Changed API calls return values and queies logic
haohangyan Oct 28, 2024
e6d226d
Remove debug print
haohangyan Oct 28, 2024
60d0432
Fixed get_statement_mix query related to agent type
haohangyan Oct 30, 2024
dc9e48a
Add desccriptions for some parameters
haohangyan Oct 31, 2024
9adb8b6
Fixed evidence enrichment condition
haohangyan Oct 31, 2024
3b597dc
Agent type can be either object or subject in get_stmts_for_agent_type
haohangyan Oct 31, 2024
b90408c
Agent type can be either object or subject in get_statements_mix
haohangyan Oct 31, 2024
8053769
Queries allow multiple relationship types
haohangyan Nov 1, 2024
65efb73
Queries allow multiple source types
haohangyan Nov 1, 2024
d25eab3
Return evidence count if needed
haohangyan Nov 1, 2024
e0a2dba
Optional agent_role in queries
haohangyan Nov 4, 2024
b46097b
Change get_statements_mix to get_statements
haohangyan Nov 4, 2024
0f9af16
Allow two agents to be either text or CURIE in get_statements()
haohangyan Nov 4, 2024
da2da41
Mesh filter for get_statement()
haohangyan Nov 6, 2024
013ef1f
Mesh constraint and returning collected p
haohangyan Nov 12, 2024
0bfe3c2
Paper constraint added and re-organized the code
haohangyan Nov 12, 2024
f8b2cf6
Add test_get_statements
haohangyan Nov 12, 2024
6a326f4
Deal with evidence_limit being None
haohangyan Nov 13, 2024
e68529f
Changed the client parameter position
haohangyan Nov 14, 2024
2644028
Removed get_stmts_for_agent_type get_stmts_for_source get_stmts_for_r…
haohangyan Nov 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/indra_cogex/apps/queries_web/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,21 @@ def get_example_data():
"parent": fields.List(fields.String, example=["MESH", "D007855"]),
"mesh_term": fields.List(fields.String, example=["MESH", "D015002"]),
"pmid_term": fields.List(fields.String, example=["PUBMED", "34634383"]),
"paper_term": fields.List(fields.String, example=["PUBMED", "34634383"]),
"paper_term": fields.List(fields.String, example=["PUBMED", "23356518"]),
"pmids": fields.List(fields.String, example=["20861832", "19503834"]),
"include_child_terms": fields.Boolean(example=True),
# NOTE: statement hashes are too large to be int for JavaScript
"stmt_hash": fields.String(example="12198579805553967"),
"stmt_hashes": fields.List(fields.String, example=["12198579805553967", "30651649296901235"]),
"rel_type": fields.String(example="Phosphorylation"),
"rel_types": fields.List(fields.String, example=["Phosphorylation", "Activation"]),
"agent_name": fields.String(example="MEK"),
"agent": fields.String(example="MEK"),
"other_agent": fields.String(example="ERK"),
"agent_role": fields.String(example="Subject"),
"other_role" : fields.String(example="Object"),
"stmt_source": fields.String(example="reach"),
"stmt_sources": fields.List(fields.String, example=["reach", "sparser"]),
"cell_line": fields.List(fields.String, example=["CCLE", "BT20_BREAST"]),
"target": fields.List(fields.String, example=["HGNC", "6840"]),
"targets": fields.List(
Expand Down Expand Up @@ -115,6 +124,7 @@ def get_example_data():
"get_stmts_for_stmt_hashes": {"return_evidence_counts", "evidence_map"},
"get_evidences_for_stmt_hash": {"remove_medscan"},
"get_evidences_for_stmt_hashes": {"remove_medscan"},
"get_statements": {"mesh_term","include_child_terms"}
}

# This is the list of functions to be included
Expand Down
193 changes: 172 additions & 21 deletions src/indra_cogex/client/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from indra.statements import Agent, Evidence, Statement

from .neo4j_client import Neo4jClient, autoclient
from ..representation import Node, Relation, indra_stmts_from_relations, norm_id
from ..representation import Node, Relation, indra_stmts_from_relations, norm_id, generate_paper_clause

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -45,6 +45,7 @@
"get_stmts_for_mesh",
"get_stmts_meta_for_stmt_hashes",
"get_stmts_for_stmt_hashes",
"get_statements",
"is_gene_mutated",
"get_mutated_genes",
"get_drugs_for_target",
Expand Down Expand Up @@ -780,7 +781,7 @@ def get_evidences_for_mesh(
match_terms = {norm_mesh} | child_terms
where_clause = "WHERE b.id IN $mesh_terms"
single_mesh_match = ""
query_params["mesh_terms"] = match_terms
query_params["mesh_terms"] = list(match_terms)
else:
single_mesh_match = ' {id: $mesh_id}'
where_clause = ""
Expand Down Expand Up @@ -929,30 +930,13 @@ def get_stmts_for_paper(
# Todo: Add filters: e.g. belief cutoff, sources, db supported only,
# stmt type

if paper_term[0].lower() in {"pmid", "pubmed"}:
parameter = norm_id(*paper_term)
publication_props = "{id: $parameter}"

elif paper_term[0].lower() == "doi":
parameter = paper_term[1]
publication_props = "{doi: $parameter}"

elif paper_term[0].lower() in {"pmc", "pmcid"}:
parameter = paper_term[1]
publication_props = "{pmcid: $parameter}"

elif paper_term[0].lower() == "trid":
parameter = paper_term[1]
publication_props = "{trid: $parameter}"

else:
raise ValueError(f"Invalid prefix for publication lookup: {paper_term[0]}")
parameter, publication_props = generate_paper_clause(paper_term)

hash_query = f"""\
MATCH (e:Evidence)-[:has_citation]->(:Publication {publication_props})
RETURN e.stmt_hash, e.evidence
"""
result = client.query_tx(hash_query, parameter=parameter)
result = client.query_tx(hash_query, paper_parameter=parameter)
return _stmts_from_results(client=client, result=result, **kwargs)


Expand Down Expand Up @@ -1140,6 +1124,173 @@ def get_stmts_for_stmt_hashes(
}
return rv, evidence_counts

@autoclient()
def get_statements(
agent: Union[str, Tuple[str, str]],
*,
client: Neo4jClient,
rel_types: Optional[Union[str, List[str]]] = None,
stmt_sources: Optional[Union[str, List[str]]] = None,
agent_role: Optional[str] = None,
other_agent: Optional[Union[str, Tuple[str, str]]] = None,
other_role: Optional[str] = None,
paper_term: Optional[Tuple[str, str]] = None,
mesh_term: Optional[Tuple[str, str]] = None,
include_child_terms: Optional[bool] = True,
limit: Optional[int] = 10,
evidence_limit: Optional[int] = None,
return_evidence_counts: bool = False,
) -> Union[List[Statement], Tuple[List[Statement], Mapping[int, int]]]:
"""Return the statements based on optional constraints on relationship type and source(s).

Parameters
----------
client : Neo4jClient
The Neo4j client used for executing the query.
rel_types : Optional[Union[str, List[str]]], default: None
The relationship type(s) to filter by, e.g., "Phosphorylation" or ["Phosphorylation", "Activation"].
stmt_sources : Optional[Union[str, List[str]]], default: None
The source(s) to filter by, e.g., "reach" or ["reach", "sparser"].
agent : Union[str, Tuple[str, str]]
The primary agent involved in the interaction. Can be specified as a name (e.g., "EGFR") or as a CURIE
tuple (namespace, ID), such as ("MESH", "D051379").
agent_role : Optional[str], default: None
The role of agent in the interaction: either "subject", "object", or None for an undirected search.
other_agent : Optional[Union[str, Tuple[str, str]]], default: None
A secondary agent in the interaction, specified either as a name or CURIE tuple.
other_role : Optional[str], default: None
The role of other_agent in the interaction: either "subject", "object", or None.
paper_term: Optional[Tuple[str, str]], default : None
The paper filter. Can be a PubMed ID, PMC id, TRID, or DOI
mesh_term : Optional[Tuple[str, str]], default : None
The mesh_term filter for evidences
include_child_terms : Optional[bool], default : True
If True, also match against the child MESH terms of the given MESH term.
limit : Optional[int], default: 10
The maximum number of statements to return.
evidence_limit : Optional[int], default: None
The optional maximum number of evidence entries to retrieve per statement.
return_evidence_counts : bool, default: False
Whether to include a mapping of statement hash to evidence count in the results.

Returns
-------
List[Statement]
A list of statements filtered by the provided constraints.
"""
where_clauses = []
mesh_all_term, hash_in_rel = None, ""
if paper_term:
paper_param, paper_clause = generate_paper_clause(paper_term)
else:
paper_clause = None

if mesh_term or paper_term:
query = (f"MATCH (e:Evidence)-[:has_citation]->"
f"(pub:Publication {paper_clause})-[:annotated_with] "
f"-> (mesh_term:BioEntity)")
hash_in_rel = "{stmt_hash: e.stmt_hash}"
if mesh_term:
norm_mesh = norm_id(*mesh_term)
if include_child_terms:
child_terms = _get_mesh_child_terms(mesh_term, client=client)
else:
child_terms = set()
if child_terms:
mesh_all_term = {norm_mesh} | child_terms
mesh_all_term = list(mesh_all_term)
else:
mesh_all_term = [norm_mesh]
where_clauses.append("mesh_term.id IN $mesh_terms")
else:
query = ""

# Agent being CURIE
if isinstance(agent, tuple):
agent_constraint = norm_id(*agent)
agent_match_clause = f"(a:BioEntity {{id: $agent_constraint}})"
# Agent being text name
else:
agent_constraint = agent
agent_match_clause = f"(a:BioEntity {{name: $agent_constraint}})"

if isinstance(other_agent, tuple):
other_agent_constraint = norm_id(*other_agent)
other_agent_match_clause = f"(b:BioEntity {{id: $other_agent_constraint}})"
elif other_agent:
other_agent_constraint = other_agent
other_agent_match_clause = f"(b:BioEntity {{name: $other_agent_constraint}})"
else:
other_agent_match_clause = "(b:BioEntity)"

if agent_role == "subject" and other_role == "object":
match_clause = f"{agent_match_clause}-[r:indra_rel {hash_in_rel}]->{other_agent_match_clause}"
elif agent_role == "object" and other_role == "subject":
match_clause = f"{other_agent_match_clause}-[r:indra_rel {hash_in_rel}]->{agent_match_clause}"
elif agent_role == "subject":
match_clause = f"{agent_match_clause}-[r:indra_rel {hash_in_rel}]->{other_agent_match_clause}"
elif agent_role == "object":
match_clause = f"{other_agent_match_clause}-[r:indra_rel {hash_in_rel}]->{agent_match_clause}"
else:
match_clause = f"{agent_match_clause}-[r:indra_rel {hash_in_rel}]-{other_agent_match_clause}"

if rel_types:
if isinstance(rel_types, str):
rel_types = [rel_types]
where_clauses.append("r.stmt_type IN $rel_types")


if stmt_sources:
if isinstance(stmt_sources, str):
stmt_sources = [stmt_sources]
where_clauses.append("any(source IN $stmt_sources WHERE r.source_counts CONTAINS source)")

if where_clauses:
match_clause += " WHERE " + " AND ".join(where_clauses)

query += f"MATCH p = {match_clause} WITH distinct r.stmt_hash AS hash, collect(p) as pp RETURN pp LIMIT $limit"
params = {
"agent_constraint": agent_constraint,
"rel_types": rel_types if isinstance(rel_types, list) else [rel_types],
"limit": limit
}
if other_agent:
params["other_agent_constraint"] = other_agent_constraint
if mesh_all_term:
params["mesh_terms"] = mesh_all_term
if stmt_sources:
params['stmt_sources'] = stmt_sources
if paper_term:
params['paper_parameter'] = paper_param


logger.info(f"Running query with constraints: rel_type={rel_types}, "
f"source={stmt_sources}, agent={agent}, other_agent={other_agent}, "
f"agent_role={agent_role}, other_role={other_role}, limit={limit}")
logger.info(query)
rels = client.query_tx(query, **params)
flattened_rels = [client.neo4j_to_relation(i[0]) for rel in rels for i in rel]
stmts = indra_stmts_from_relations(flattened_rels, deduplicate=True)
if evidence_limit and evidence_limit > 1:
stmts = enrich_statements(
stmts,
client=client,
evidence_limit=evidence_limit,
)

if not return_evidence_counts:
return stmts

evidence_counts = {
stmt.get_hash(): (
min(rel.data["evidence_count"], evidence_limit)
if evidence_limit is not None
else rel.data["evidence_count"]
)
for rel, stmt in zip(flattened_rels, stmts)
}

return stmts, evidence_counts

@autoclient()
def enrich_statements(
Expand Down
30 changes: 29 additions & 1 deletion src/indra_cogex/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""Representations for nodes and relations to upload to Neo4j."""


__all__ = ["Node", "Relation", "indra_stmts_from_relations", "norm_id"]
__all__ = ["Node", "Relation", "indra_stmts_from_relations", "norm_id", "generate_paper_clause"]

import codecs
from typing import (
Expand Down Expand Up @@ -505,3 +505,31 @@ def indra_stmts_from_relations(rels: Iterable[Relation],
# We do it this way to not change the order of the statements
stmts = list({stmt.get_hash(): stmt for stmt in stmts}.values())
return stmts


def generate_paper_clause(paper_term: Tuple[str, str]):
"""
paper_term :
Can be a PubMed ID, PMC id, TRID, or DOI
"""
if paper_term[0].lower() in {"pmid", "pubmed"}:
parameter = norm_id(*paper_term)
publication_props = "{id: $paper_parameter}"

elif paper_term[0].lower() == "doi":
parameter = paper_term[1]
publication_props = "{doi: $paper_parameter}"

elif paper_term[0].lower() in {"pmc", "pmcid"}:
parameter = paper_term[1]
publication_props = "{pmcid: $paper_parameter}"

elif paper_term[0].lower() == "trid":
parameter = paper_term[1]
publication_props = "{trid: $paper_parameter}"

else:
raise ValueError(
f"Invalid prefix for publication lookup: {paper_term[0]}")

return parameter, publication_props
24 changes: 24 additions & 0 deletions tests/test_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,30 @@ def test_get_stmts_by_hashes():
assert stmts
assert isinstance(stmts[0], Statement)

@pytest.mark.nonpublic
def test_get_statements():
client = _get_client()
stmts, evidence_map = get_statements(
agent="MEK",
other_agent="ERK",
agent_role='subject',
other_role='object',
rel_types=["Phosphorylation", "Activation"],
stmt_sources='reach',
mesh_term=("MESH", 'D000818'),
paper_term=('pubmed', '23356518'),
client=client,
limit=1000,
evidence_limit=500,
return_evidence_counts=True
)
assert stmts
assert all(isinstance(stmt, Statement) for stmt in stmts)
assert evidence_map
stmt_hashes = {stmt.get_hash() for stmt in stmts}
assert stmt_hashes == evidence_map.keys()
for stmt_hash, evidence_count in evidence_map.items():
assert evidence_count <= 500

@pytest.mark.nonpublic
def test_is_gene_mutated():
Expand Down