Expanded enrichment notebook (#692)

INCATools · Dec 21, 2023 · 66998cd · 66998cd
1 parent b404cd2
commit 66998cd
Show file tree

Hide file tree

Showing 8 changed files with 828 additions and 1,838 deletions.
diff --git a/notebooks/Commands/Enrichment.ipynb b/notebooks/Commands/Enrichment.ipynb
diff --git a/notebooks/Commands/input/eds-genes-hgnc.tsv b/notebooks/Commands/input/eds-genes-hgnc.tsv
@@ -0,0 +1,20 @@
+id	label
+HGNC:11976	TNXB
+HGNC:1246	C1R
+HGNC:1247	C1S
+HGNC:17978	B3GALT6
+HGNC:18625	FKBP14
+HGNC:20859	SLC39A13
+HGNC:21144	DSE
+HGNC:2188	COL12A1
+HGNC:218	ADAMTS2
+HGNC:2198	COL1A2
+HGNC:2201	COL3A1
+HGNC:2209	COL5A1
+HGNC:2210	COL5A2
+HGNC:23216	ZNF469
+HGNC:24464	CHST14
+HGNC:303	AEBP1
+HGNC:9081	PLOD1
+HGNC:930	B4GALT7
+HGNC:9349	PRDM5
diff --git a/src/oaklib/interfaces/association_provider_interface.py b/src/oaklib/interfaces/association_provider_interface.py
@@ -433,6 +433,20 @@ def association_subject_counts(
         Here objects are typically nodes from ontologies and subjects are annotated
         entities such as genes.
 
+        >>> from oaklib import get_adapter
+        >>> from oaklib.datamodels.vocabulary import IS_A, PART_OF
+        >>> adapter = get_adapter("src/oaklib/conf/go-pombase-input-spec.yaml")
+        >>> genes = ["PomBase:SPAC1142.02c", "PomBase:SPAC3H1.05", "PomBase:SPAC1142.06"]
+        >>> preds = [IS_A, PART_OF]
+        >>> for term, num in adapter.association_subject_counts(genes, object_closure_predicates=preds):
+        ...    print(term, num)
+        <BLANKLINE>
+        ...
+        GO:0051668 3
+        ...
+
+        This shows that GO:0051668 (localization within membrane) is used for all 3 input subjects
+
         :param subjects:
         :param predicates:
         :param property_filter:

diff --git a/src/oaklib/interfaces/class_enrichment_calculation_interface.py b/src/oaklib/interfaces/class_enrichment_calculation_interface.py
@@ -53,6 +53,10 @@ def enriched_classes(
         GO:0006620 post-translational protein targeting to endoplasmic reticulum membrane
         ...
 
+        By default, results may include redundant terms. If `filter_redundant=True` is set,
+        redundant terms are removed unless they are more significant than the
+        descendant term.
+
         :param subjects: The set of entities to test for over-representation of classes
         :param item_list: An item list objects as an alternate way to specify subjects
         :param background: The set of entities to use as a background for the test (recommended)
@@ -84,6 +88,8 @@ def enriched_classes(
                 subjects, predicates=predicates, object_closure_predicates=object_closure_predicates
             )
         }
+        if all(v == 0 for v in sample_count.values()):
+            raise ValueError("No associations found for subjects")
         potential_hypotheses = set(sample_count.keys())
         if hypotheses is None:
             hypotheses = potential_hypotheses

diff --git a/src/oaklib/parsers/gaf_association_parser.py b/src/oaklib/parsers/gaf_association_parser.py
@@ -48,7 +48,6 @@ class GafAssociationParser(XafAssociationParser):
     def post_process(
         self, association: Association
     ) -> List[Union[Association, NegatedAssociation]]:
-        if association.predicate and "not" in association.predicate.lower():
-            # in future this may return a NegatedAssociation
+        """
+        Turn an association whose predicate contains "not" into a negated association.
+
+        :param association: the parsed association to post-process
+        :return: a single-element list holding a NegatedAssociation built from the
+            same fields when the predicate contains "not" (case-insensitive);
+            otherwise whatever the parent class returns
+        """
+        # The predicate is lower-cased before matching, so the needle must be
+        # lower-case as well; an upper-case "NOT" could never match.
+        if association.predicate and "not" in association.predicate.lower():
             return [NegatedAssociation(**association.__dict__)]
         return super().post_process(association)
diff --git a/src/oaklib/parsers/xaf_association_parser.py b/src/oaklib/parsers/xaf_association_parser.py
@@ -171,6 +171,7 @@ def parse(
         ):
             raise ValueError(f"Unexpected default predicate {self.default_predicate_value} in file")
         for line in file.readlines():
+            is_negated = False
             if line.startswith(self.comment_character):
                 continue
             line = line.rstrip()
@@ -179,6 +180,17 @@ def parse(
             s = lookup_subject(vals)
             p = lookup_predicate(vals)
             o = lookup_object(vals)
+            if p:
+                ps = p.split("|")
+                p = None
+                for candidate in ps:
+                    if candidate.lower() == "not":
+                        is_negated = True
+                        print(f"Negated association: {line}")
+                    else:
+                        if p:
+                            raise ValueError(f"Unexpected predicate {p} in line: {line}")
+                        p = candidate
             if not p:
                 p = self.default_predicate_value
             if self.subject_prefix_column:
@@ -194,7 +206,10 @@ def parse(
             if s.startswith("MGI:MGI:"):
                 # TODO: make this more configurable
                 s = s.replace("MGI:MGI:", "MGI:")
-            association = Association(s, p, o)
+            if is_negated:
+                association = NegatedAssociation(s, p, o)
+            else:
+                association = Association(s, p, o)
             if lookup_publications:
                 pub = lookup_publications(vals)
                 if pub:

diff --git a/src/oaklib/utilities/obograph_utils.py b/src/oaklib/utilities/obograph_utils.py
@@ -182,19 +182,25 @@ def filter_by_predicates(graph: Graph, predicates: List[PRED_CURIE], graph_id: s
 
 
 def as_multi_digraph(
-    graph: Graph, reverse: bool = True, filter_reflexive: bool = True
+    graph: Graph,
+    reverse: bool = True,
+    filter_reflexive: bool = True,
+    predicates: Optional[List[PRED_CURIE]] = None,
 ) -> nx.MultiDiGraph:
     """
     Convert to a networkx :class:`.MultiDiGraph`
 
     :param graph: OBOGraph
-    :param reverse:
-    :return:
+    :param filter_reflexive: if true, remove edges where sub == obj
+    :param predicates: if not None, only include edges with these predicates
+    :return: networkx MultiDiGraph
     """
     mdg = nx.MultiDiGraph()
     for edge in graph.edges:
         if filter_reflexive and reflexive(edge):
             continue
+        if predicates is not None and edge.pred not in predicates:
+            continue
         edge_attrs = {"predicate": edge.pred}
         if reverse:
             mdg.add_edge(edge.obj, edge.sub, **edge_attrs)
@@ -343,13 +349,72 @@ def shortest_paths(
                 logging.info(f"No path between {start_curie} and {end_curie}")
 
 
-def depth_first_ordering(graph: Graph) -> List[CURIE]:
+def edges_from_tree(tree: dict, pred=IS_A) -> List[Edge]:
+    """
+    Flatten a nested parent-to-children tree into a list of child-to-parent edges.
+
+    Each key of ``tree`` is a parent node. Its value is either a list (or tuple)
+    of leaf children, or a dict mapping each child to that child's own subtree.
+    Node identifiers are converted to strings when the edges are created.
+
+    >>> from oaklib.utilities.obograph_utils import edges_from_tree
+    >>> for e in edges_from_tree({1: {2: [3, 4]}}):
+    ...    print(e.sub, e.obj)
+    2 1
+    3 2
+    4 2
+
+    :param tree: nested dict describing the hierarchy
+    :param pred: predicate assigned to every generated edge; defaults to IS_A
+    :return: edges with the child as subject and the parent as object; the edge
+        linking a child to its parent precedes the edges of that child's subtree
+    """
+    edges = []
+
+    def _edge(s: Any, o: Any) -> None:
+        edges.append(Edge(sub=str(s), pred=pred, obj=str(o)))
+
+    for parent, children in tree.items():
+        if isinstance(children, (list, tuple)):
+            # leaf nodes
+            for child in children:
+                _edge(child, parent)
+        else:
+            # non-leaf nodes: link the child, then recurse into its subtree
+            for child, grandchildren in children.items():
+                _edge(child, parent)
+                edges.extend(edges_from_tree({child: grandchildren}, pred=pred))
+    return edges
+
+
+def depth_first_ordering(
+    graph: Graph, predicates: Optional[List[PRED_CURIE]] = None
+) -> List[CURIE]:
     """
     Return a depth-first ordering of the nodes in the graph.
 
-    :param graph:
+    >>> from oaklib.datamodels.obograph import Graph
+    >>> from oaklib.utilities.obograph_utils import depth_first_ordering, edges_from_tree
+    >>> ## Chains have a deterministic DF ordering
+    >>> edges = edges_from_tree({1: {2: [3]}})
+    >>> list(depth_first_ordering(Graph("test", edges=edges)))
+    ['1', '2', '3']
+    >>> list(depth_first_ordering(Graph("test", edges=list(reversed(edges)))))
+    ['1', '2', '3']
+    >>> edges2 = edges_from_tree({5: [3, 4]})
+    >>> ordered = list(depth_first_ordering(Graph("test", edges=edges + edges2)))
+    >>> assert ordered.index('1') < ordered.index('2')
+
+    :param graph: OBOGraph
+    :param predicates:
     :return:
     """
+    g = as_digraph(graph, predicates=predicates)
+    roots = [n for n, d in g.in_degree() if d == 0]
+    ordered = []
+    for root in roots:
+        for n in nx.dfs_preorder_nodes(g, root):
+            if n not in ordered:
+                ordered.append(n)
+    return ordered
     six = index_graph_edges_by_subject(graph)
     oix = index_graph_edges_by_object(graph)
     stack = list(set(oix.keys()) - set(six.keys()))
@@ -544,8 +609,20 @@ def index_graph_edges_by_predicate(graph: Graph) -> Dict[CURIE, List[Edge]]:
     return d
 
 
-def topological_sort(graph: Graph, predicates: List[PRED_CURIE]) -> List[CURIE]:
-    dg = as_multi_digraph(graph)
-    return nx.topological_sort(dg)
+def topological_sort(
+    graph: Graph, predicates: Optional[List[PRED_CURIE]] = None
+) -> List[CURIE]:
+    """
+    Return a topological sort of the nodes in the graph.
+
+    A topological sort is a nonunique permutation of the nodes of a
+    directed graph such that an edge from u to v implies that u
+    appears before v in the topological sort order. This ordering is
+    valid only if the graph has no directed cycles.
+
+    The graph is converted with as_multi_digraph using its default
+    reverse=True, so each edge is oriented from object to subject and
+    an edge's object is therefore ordered before its subject.
+
+    :param graph: OBOGraph to sort
+    :param predicates: if not None, only edges with these predicates are used
+    :return: list of node identifiers in topological order
+    """
+    dg = as_multi_digraph(graph, predicates=predicates)
+    # nx.topological_sort yields lazily; materialize it so the returned value
+    # matches the declared List return type
+    return list(nx.topological_sort(dg))
 
 

diff --git a/tests/test_utilities/test_obograph_utils.py b/tests/test_utilities/test_obograph_utils.py
@@ -31,10 +31,12 @@
     IMBO,
     INPUT_DIR,
     INTRACELLULAR,
+    MEMBRANE,
     NUCLEAR_MEMBRANE,
     NUCLEUS,
     ORGANELLE,
     OUTPUT_DIR,
+    PLASMA_MEMBRANE,
     VACUOLE,
 )
 
@@ -158,6 +160,11 @@ def test_trim_ancestors(self):
             raise NotImplementedError
 
     def test_shortest_paths(self):
+        """
+        Test that the shortest paths are correct.
+
+        :return:
+        """
         oi = self.oi
         both = [IS_A, PART_OF]
         hi = 1.0
@@ -188,17 +195,42 @@ def test_shortest_paths(self):
                         self.assertNotIn(x, path)
 
     def test_depth_first_ordering(self):
+        """
+        Test that the depth first ordering of the graph is correct.
+
+        Note that DF ordering may be non-deterministic if the graph is not a tree.
+        This test conservatively checks conditions that are guaranteed to hold
+        even with DAGs
+
+        :return:
+        """
         oi = self.oi
-        graph = oi.descendant_graph([CELLULAR_COMPONENT], predicates=[IS_A, PART_OF])
-        ordered = depth_first_ordering(graph)
-        self.assertEqual(ordered[0], CELLULAR_COMPONENT)
-        expected_order = [
-            (CELLULAR_COMPONENT, CELLULAR_ANATOMICAL_ENTITY),
-            (CELLULAR_ANATOMICAL_ENTITY, ORGANELLE),
-            (ORGANELLE, NUCLEUS),
-            # (CYTOPLASM, NUCLEUS),
-            (IMBO, NUCLEUS),
-            (NUCLEUS, NUCLEAR_MEMBRANE),
+        # each case: (start nodes, predicates to traverse, (earlier, later) node pairs)
+        expected = [
+            (
+                [CELLULAR_COMPONENT],
+                [IS_A, PART_OF],
+                [
+                    (CELLULAR_COMPONENT, CELLULAR_ANATOMICAL_ENTITY),
+                    (CELLULAR_ANATOMICAL_ENTITY, ORGANELLE),
+                    (CELLULAR_ANATOMICAL_ENTITY, NUCLEUS),
+                ],
+            ),
+            (
+                [CELLULAR_COMPONENT],
+                [IS_A],
+                [
+                    (CELLULAR_COMPONENT, CELLULAR_ANATOMICAL_ENTITY),
+                    (CELLULAR_ANATOMICAL_ENTITY, ORGANELLE),
+                    (CELLULAR_ANATOMICAL_ENTITY, NUCLEUS),
+                    (CELLULAR_ANATOMICAL_ENTITY, MEMBRANE),
+                    (MEMBRANE, PLASMA_MEMBRANE),
+                ],
+            ),
         ]
-        for parent, child in expected_order:
-            self.assertLess(ordered.index(parent), ordered.index(child), f"{parent} -> {child}")
+        for starts, preds, expected_order in expected:
+            graph = oi.descendant_graph(starts, predicates=preds)
+            ordered = depth_first_ordering(graph)
+            # a lone start node is asserted to be the first node in the ordering
+            if len(starts) == 1:
+                self.assertEqual(ordered[0], starts[0])
+            # each listed pair must appear in the given relative order
+            for parent, child in expected_order:
+                self.assertLess(ordered.index(parent), ordered.index(child), f"{parent} -> {child}")