Skip to content

Commit

Permalink
Expanded enrichment notebook (#692)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmungall authored Dec 21, 2023
1 parent b404cd2 commit 66998cd
Show file tree
Hide file tree
Showing 8 changed files with 828 additions and 1,838 deletions.
2,459 changes: 643 additions & 1,816 deletions notebooks/Commands/Enrichment.ipynb

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions notebooks/Commands/input/eds-genes-hgnc.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
id label
HGNC:11976 TNXB
HGNC:1246 C1R
HGNC:1247 C1S
HGNC:17978 B3GALT6
HGNC:18625 FKBP14
HGNC:20859 SLC39A13
HGNC:21144 DSE
HGNC:2188 COL12A1
HGNC:218 ADAMTS2
HGNC:2198 COL1A2
HGNC:2201 COL3A1
HGNC:2209 COL5A1
HGNC:2210 COL5A2
HGNC:23216 ZNF469
HGNC:24464 CHST14
HGNC:303 AEBP1
HGNC:9081 PLOD1
HGNC:930 B4GALT7
HGNC:9349 PRDM5
14 changes: 14 additions & 0 deletions src/oaklib/interfaces/association_provider_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,20 @@ def association_subject_counts(
Here objects are typically nodes from ontologies and subjects are annotated
entities such as genes.
>>> from oaklib import get_adapter
>>> from oaklib.datamodels.vocabulary import IS_A, PART_OF
>>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml")
>>> genes = ["PomBase:SPAC1142.02c", "PomBase:SPAC3H1.05", "PomBase:SPAC1142.06"]
>>> preds = [IS_A, PART_OF]
>>> for term, num in adapter.association_subject_counts(genes, object_closure_predicates=preds):
... print(term, num)
<BLANKLINE>
...
GO:0051668 3
...
This shows that GO:0051668 (localization within membrane) is used for all 3 input subjects
:param subjects:
:param predicates:
:param property_filter:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ def enriched_classes(
GO:0006620 post-translational protein targeting to endoplasmic reticulum membrane
...
By default, results may include redundant terms. If we set `filter_redundant=True`,
then redundant terms are removed, unless they are more significant than the
descendant term
:param subjects: The set of entities to test for over-representation of classes
:param item_list: An item list object, as an alternate way to specify subjects
:param background: The set of entities to use as a background for the test (recommended)
Expand Down Expand Up @@ -84,6 +88,8 @@ def enriched_classes(
subjects, predicates=predicates, object_closure_predicates=object_closure_predicates
)
}
if all(v == 0 for v in sample_count.values()):
raise ValueError("No associations found for subjects")
potential_hypotheses = set(sample_count.keys())
if hypotheses is None:
hypotheses = potential_hypotheses
Expand Down
3 changes: 1 addition & 2 deletions src/oaklib/parsers/gaf_association_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ class GafAssociationParser(XafAssociationParser):
def post_process(
    self, association: Association
) -> List[Union[Association, NegatedAssociation]]:
    """
    Turn an association whose predicate carries a NOT qualifier into a negated one.

    :param association: the association to post-process
    :return: a single-element list holding a NegatedAssociation built from the
        same fields when the predicate contains "not" (case-insensitive);
        otherwise whatever the parent class's post_process returns
    """
    predicate = association.predicate
    # Both sides of the test must be lower case: searching for an upper-case
    # "NOT" inside a lower-cased string can never succeed.
    if predicate and "not" in predicate.lower():
        return [NegatedAssociation(**association.__dict__)]
    return super().post_process(association)
17 changes: 16 additions & 1 deletion src/oaklib/parsers/xaf_association_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def parse(
):
raise ValueError(f"Unexpected default predicate {self.default_predicate_value} in file")
for line in file.readlines():
is_negated = False
if line.startswith(self.comment_character):
continue
line = line.rstrip()
Expand All @@ -179,6 +180,17 @@ def parse(
s = lookup_subject(vals)
p = lookup_predicate(vals)
o = lookup_object(vals)
if p:
ps = p.split("|")
p = None
for candidate in ps:
if candidate.lower() == "not":
is_negated = True
print(f"Negated association: {line}")
else:
if p:
raise ValueError(f"Unexpected predicate {p} in line: {line}")
p = candidate
if not p:
p = self.default_predicate_value
if self.subject_prefix_column:
Expand All @@ -194,7 +206,10 @@ def parse(
if s.startswith("MGI:MGI:"):
# TODO: make this more configurable
s = s.replace("MGI:MGI:", "MGI:")
association = Association(s, p, o)
if is_negated:
association = NegatedAssociation(s, p, o)
else:
association = Association(s, p, o)
if lookup_publications:
pub = lookup_publications(vals)
if pub:
Expand Down
91 changes: 84 additions & 7 deletions src/oaklib/utilities/obograph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,25 @@ def filter_by_predicates(graph: Graph, predicates: List[PRED_CURIE], graph_id: s


def as_multi_digraph(
graph: Graph, reverse: bool = True, filter_reflexive: bool = True
graph: Graph,
reverse: bool = True,
filter_reflexive: bool = True,
predicates: Optional[List[PRED_CURIE]] = None,
) -> nx.MultiDiGraph:
"""
Convert to a networkx :class:`.MultiDiGraph`
:param graph: OBOGraph
:param reverse:
:return:
:param filter_reflexive: if true, remove edges where sub == obj
:param predicates: if not None, only include edges with these predicates
:return: networkx MultiDiGraph
"""
mdg = nx.MultiDiGraph()
for edge in graph.edges:
if filter_reflexive and reflexive(edge):
continue
if predicates is not None and edge.pred not in predicates:
continue
edge_attrs = {"predicate": edge.pred}
if reverse:
mdg.add_edge(edge.obj, edge.sub, **edge_attrs)
Expand Down Expand Up @@ -343,13 +349,72 @@ def shortest_paths(
logging.info(f"No path between {start_curie} and {end_curie}")


def depth_first_ordering(graph: Graph) -> List[CURIE]:
def edges_from_tree(tree: dict, pred=IS_A) -> List[Edge]:
    """
    Flatten a nested dict describing a tree into a list of edges.

    Each key of ``tree`` is a parent node. Its value is either a list (or tuple)
    of leaf children, or a dict mapping each child to that child's own children
    in the same format. Every child produces one edge ``child --pred--> parent``.
    Node identifiers are converted with ``str``.

    >>> from oaklib.utilities.obograph_utils import edges_from_tree
    >>> for e in edges_from_tree({1: {2: [3, 4]}}):
    ...    print(e.sub, e.obj)
    2 1
    3 2
    4 2

    :param tree: nested dict of parent -> children
    :param pred: predicate placed on every edge; defaults to IS_A
    :return: list of edges; a child's own edge precedes the edges of its subtree
    """
    edges: List[Edge] = []
    for parent, children in tree.items():
        if isinstance(children, (list, tuple)):
            # leaf nodes: one edge per child, nothing further to descend into
            edges.extend(Edge(sub=str(child), pred=pred, obj=str(parent)) for child in children)
            continue
        # non-leaf nodes: link the child to its parent, then walk the child's subtree
        for child, grandchildren in children.items():
            edges.append(Edge(sub=str(child), pred=pred, obj=str(parent)))
            edges.extend(edges_from_tree({child: grandchildren}, pred=pred))
    return edges


def depth_first_ordering(
graph: Graph, predicates: Optional[List[PRED_CURIE]] = None
) -> List[CURIE]:
"""
Return a depth-first ordering of the nodes in the graph.
:param graph:
>>> from oaklib.datamodels.obograph import Graph
>>> from oaklib.utilities.obograph_utils import depth_first_ordering, edges_from_tree
>>> ## Chains have a deterministic DF ordering
>>> edges = edges_from_tree({1: {2: [3]}})
>>> list(depth_first_ordering(Graph("test", edges=edges)))
['1', '2', '3']
>>> list(depth_first_ordering(Graph("test", edges=list(reversed(edges)))))
['1', '2', '3']
>>> edges2 = edges_from_tree({5: [3, 4]})
>>> ordered = list(depth_first_ordering(Graph("test", edges=edges + edges2)))
>>> assert ordered.index('1') < ordered.index('2')
:param graph: OBOGraph
:param predicates:
:return:
"""
g = as_digraph(graph, predicates=predicates)
roots = [n for n, d in g.in_degree() if d == 0]
ordered = []
for root in roots:
for n in nx.dfs_preorder_nodes(g, root):
if n not in ordered:
ordered.append(n)
return ordered
six = index_graph_edges_by_subject(graph)
oix = index_graph_edges_by_object(graph)
stack = list(set(oix.keys()) - set(six.keys()))
Expand Down Expand Up @@ -544,8 +609,20 @@ def index_graph_edges_by_predicate(graph: Graph) -> Dict[CURIE, List[Edge]]:
return d


def topological_sort(
    graph: Graph, predicates: Optional[List[PRED_CURIE]] = None
) -> List[CURIE]:
    """
    Return the nodes of the graph in a topological order.

    A topological sort is a nonunique permutation of the nodes of a
    directed graph such that an edge from u to v implies that u
    appears before v in the topological sort order. This ordering is
    valid only if the graph has no directed cycles.

    The graph is converted with :func:`as_multi_digraph` using its default
    settings, which add each edge as object -> subject; the object of an
    edge therefore appears before its subject in the result.

    :param graph: OBOGraph to sort
    :param predicates: if not None, only edges with these predicates are considered
    :return: list of node identifiers in topological order
    """
    dg = as_multi_digraph(graph, predicates=predicates)
    # nx.topological_sort yields nodes lazily; materialize it so the result
    # matches the declared List return type and can be indexed or re-iterated.
    return list(nx.topological_sort(dg))


Expand Down
56 changes: 44 additions & 12 deletions tests/test_utilities/test_obograph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@
IMBO,
INPUT_DIR,
INTRACELLULAR,
MEMBRANE,
NUCLEAR_MEMBRANE,
NUCLEUS,
ORGANELLE,
OUTPUT_DIR,
PLASMA_MEMBRANE,
VACUOLE,
)

Expand Down Expand Up @@ -158,6 +160,11 @@ def test_trim_ancestors(self):
raise NotImplementedError

def test_shortest_paths(self):
"""
Test that the shortest paths are correct.
:return:
"""
oi = self.oi
both = [IS_A, PART_OF]
hi = 1.0
Expand Down Expand Up @@ -188,17 +195,42 @@ def test_shortest_paths(self):
self.assertNotIn(x, path)

def test_depth_first_ordering(self):
    """
    Test that the depth first ordering of the graph is correct.

    Note that DF ordering may be non-deterministic if the graph is not a tree.
    This test conservatively checks conditions that are guaranteed to hold
    even with DAGs.

    :return:
    """
    oi = self.oi
    # Each case: (start nodes, predicates for the descendant graph,
    # (parent, child) pairs where parent must come before child)
    expected = [
        (
            [CELLULAR_COMPONENT],
            [IS_A, PART_OF],
            [
                (CELLULAR_COMPONENT, CELLULAR_ANATOMICAL_ENTITY),
                (CELLULAR_ANATOMICAL_ENTITY, ORGANELLE),
                (CELLULAR_ANATOMICAL_ENTITY, NUCLEUS),
            ],
        ),
        (
            [CELLULAR_COMPONENT],
            [IS_A],
            [
                (CELLULAR_COMPONENT, CELLULAR_ANATOMICAL_ENTITY),
                (CELLULAR_ANATOMICAL_ENTITY, ORGANELLE),
                (CELLULAR_ANATOMICAL_ENTITY, NUCLEUS),
                (CELLULAR_ANATOMICAL_ENTITY, MEMBRANE),
                (MEMBRANE, PLASMA_MEMBRANE),
            ],
        ),
    ]
    for starts, preds, expected_order in expected:
        # subTest reports each case separately instead of stopping at the first failure
        with self.subTest(starts=starts, predicates=preds):
            graph = oi.descendant_graph(starts, predicates=preds)
            ordered = depth_first_ordering(graph)
            if len(starts) == 1:
                # a single start node is the only root, so it must come first
                self.assertEqual(ordered[0], starts[0])
            for parent, child in expected_order:
                self.assertLess(
                    ordered.index(parent), ordered.index(child), f"{parent} -> {child}"
                )

0 comments on commit 66998cd

Please sign in to comment.