Skip to content

Commit

Permalink
Merge pull request #24 from bgyori/inputs
Browse files Browse the repository at this point in the history
Implement node assembly and standardize identifiers
  • Loading branch information
bgyori authored May 17, 2021
2 parents e9a4e1e + e7bb294 commit 477df79
Show file tree
Hide file tree
Showing 18 changed files with 379 additions and 150 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ignore =
S403 # pickle
S301 # pickle
W503 # line break before binary operator
E203 # conflicts with black
S101 # Don't complain about asserts
exclude =
.tox,
.git,
Expand Down
37 changes: 0 additions & 37 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,6 @@ name: Tests
on: [ push, pull_request ]

jobs:
lint:
name: Lint
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.6, 3.9 ]
tox-env: [ manifest, flake8, pyroma, mypy ]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: pip install tox
- name: Run tox
run: tox -e ${{ matrix.tox-env }}
docs:
name: Documentation
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.6, 3.9 ]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: pip install tox
- name: Check RST conformity with doc8
run: tox -e doc8
# - name: Check docstring coverage
# run: tox -e docstr-coverage
- name: Check documentation build with Sphinx
run: tox -e docs
tests:
name: Tests
runs-on: ${{ matrix.os }}
Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,13 @@ keywords =

[options]
install_requires =
indra
indra @ git+https://github.com/sorgerlab/indra.git
neo4j
click
more_click
class-resolver>=0.0.9
pystow>=0.1.6
pyobo

include_package_data = True
python_requires = >=3.6
Expand Down
Empty file added src/indra_cogex/__init__.py
Empty file.
50 changes: 50 additions & 0 deletions src/indra_cogex/assembly/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from collections import defaultdict
from typing import Dict, List, Optional

from indra_cogex.representation import Node


class NodeAssembler:
    """Assemble and deduplicate nodes that share a (db_ns, db_id) identifier.

    Nodes added to the assembler are grouped by their namespace/identifier
    pair and merged into a single aggregate node each. Conflicting data
    values encountered during merging are recorded in ``self.conflicts``
    rather than silently overwritten.
    """

    def __init__(self, nodes: Optional[List["Node"]] = None):
        """Initialize the assembler.

        Parameters
        ----------
        nodes :
            An optional initial list of nodes.
        """
        # Copy the incoming list: add_nodes extends self.nodes in place,
        # and storing the caller's list directly would mutate it too.
        self.nodes = list(nodes) if nodes else []
        self.conflicts: List["Conflict"] = []

    def add_nodes(self, nodes: List["Node"]):
        """Append a list of nodes to the assembler's node collection."""
        self.nodes += nodes

    def assemble_nodes(self) -> List["Node"]:
        """Merge all added nodes into one aggregate node per identifier.

        Returns
        -------
        :
            A list containing a single aggregate node for each distinct
            (db_ns, db_id) pair seen among the added nodes.
        """
        nodes_by_id = defaultdict(list)
        for node in self.nodes:
            nodes_by_id[(node.db_ns, node.db_id)].append(node)

        assembled_nodes = [
            self.get_aggregate_node(db_ns, db_id, node_group)
            for (db_ns, db_id), node_group in nodes_by_id.items()
        ]
        return assembled_nodes

    def get_aggregate_node(
        self, db_ns: str, db_id: str, nodes: List["Node"]
    ) -> "Node":
        """Merge a group of nodes sharing one identifier into a single node.

        Labels are unioned across the group. Data entries are merged with
        the earlier (truthy) value kept; a later differing value is recorded
        as a Conflict in ``self.conflicts`` instead of replacing it.

        Parameters
        ----------
        db_ns :
            The shared namespace of the node group.
        db_id :
            The shared identifier of the node group.
        nodes :
            The nodes to merge.

        Returns
        -------
        :
            A new node carrying the merged labels and data.
        """
        labels = set()
        data: Dict[str, str] = {}
        for node in nodes:
            labels |= set(node.labels)
            for data_key, data_val in node.data.items():
                previous_val = data.get(data_key)
                if previous_val and previous_val != data_val:
                    self.conflicts.append(Conflict(data_key, previous_val, data_val))
                else:
                    data[data_key] = data_val
        return Node(db_ns, db_id, sorted(labels), data)


class Conflict:
    """Record of two differing values observed for the same data key."""

    def __init__(self, key, val1, val2):
        """Store the conflicting key together with both observed values."""
        self.key = key
        self.val1 = val1
        self.val2 = val2

    def __str__(self):
        # Render as Conflict(key, val1, val2) with plain str conversion.
        parts = ", ".join(str(part) for part in (self.key, self.val1, self.val2))
        return f"Conflict({parts})"

    def __repr__(self):
        return str(self)
47 changes: 34 additions & 13 deletions src/indra_cogex/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,41 @@ class Node:

def __init__(
self,
identifier: str,
db_ns: str,
db_id: str,
labels: Collection[str],
data: Optional[Mapping[str, Any]] = None,
):
"""Initialize the node.
:param identifier: The identifier of the node
:param labels: The collection of labels for the relation.
:param data: The optional data dictionary associated with the node.
Parameters
----------
db_ns :
The namespace associated with the node. Uses the INDRA standard.
db_id :
The identifier within the namespace associated with the node.
Uses the INDRA standard.
labels :
A collection of labels for the node.
data :
An optional data dictionary associated with the node.
"""
self.identifier = identifier
if not db_ns or not db_id:
raise ValueError("Invalid namespace or ID.")
self.db_ns = db_ns
self.db_id = db_id
self.labels = labels
self.data = data if data else {}

def to_json(self):
"""Serialize the node to JSON."""
data = {k: v for k, v in self.data.items()}
data["id"] = self.identifier
data["db_ns"] = self.db_ns
data["db_id"] = self.db_id
return {"labels": self.labels, "data": data}

def _get_data_str(self):
pieces = ["id:'%s'" % self.identifier]
pieces = ["id:'%s:%s'" % (self.db_ns, self.db_id)]
for k, v in self.data.items():
if isinstance(v, str):
value = "'" + v.replace("'", "\\'") + "'"
Expand All @@ -60,36 +73,44 @@ class Relation:

def __init__(
self,
source_ns: str,
source_id: str,
target_ns: str,
target_id: str,
labels: Collection[str],
rel_type: str,
data: Optional[Mapping[str, Any]] = None,
):
"""Initialize the relation.
:param source_id: The identifier of the source node
:param target_id: The identifier of the target node
:param labels: The collection of labels for the relation.
:param rel_type: The relation's type.
:param data: The optional data dictionary associated with the relation.
"""
self.source_ns = source_ns
self.source_id = source_id
self.target_ns = target_ns
self.target_id = target_id
self.labels = list(labels)
self.rel_type = rel_type
self.data = data if data else {}

def to_json(self):
"""Serialize the relation to JSON."""
return {
"source_ns": self.source_ns,
"source_id": self.source_id,
"target_ns": self.target_ns,
"target_id": self.target_id,
"labels": self.labels,
"rel_type": self.rel_type,
"data": self.data,
}

def __str__(self): # noqa:D105
data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()])
labels_str = ":".join(self.labels)
return f"({self.source_id})-[:{labels_str} {data_str}]->" f"({self.target_id})"
return (
f"({self.source_ns}, {self.source_id})-[:{self.rel_type} {data_str}]->"
f"({self.target_ns}, {self.target_id})"
)

def __repr__(self): # noqa:D105
return str(self)
24 changes: 20 additions & 4 deletions src/indra_cogex/sources/bgee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,36 @@ def __init__(self, path: Union[None, str, Path] = None):
self.expressions = pickle.load(fh)

def get_nodes(self): # noqa:D102
for context_id in self.expressions:
for context in self.expressions:
context_ns, context_id = get_context(context)
yield Node(
context_ns,
context_id,
["BioEntity"],
data={"name": pyobo.get_name_by_curie(context_id)},
)
for hgnc_id in set.union(*[set(v) for v in self.expressions.values()]):
yield Node(
f"HGNC:{hgnc_id}",
"HGNC",
hgnc_id,
["BioEntity"],
data={"name": pyobo.get_name("hgnc", hgnc_id)},
)

def get_relations(self): # noqa:D102
for context_id, hgnc_ids in self.expressions.items():
data = {"source": self.name}
for context, hgnc_ids in self.expressions.items():
context_ns, context_id = get_context(context)
for hgnc_id in hgnc_ids:
yield Relation(f"HGNC:{hgnc_id}", context_id, [self.rel_type])
yield Relation(
"HGNC", hgnc_id, context_ns, context_id, self.rel_type, data
)


def get_context(context):
    """Split a context CURIE into a standardized namespace and identifier.

    Parameters
    ----------
    context :
        A CURIE string of the form ``"<namespace>:<identifier>"``. Only the
        first colon is treated as the separator, so identifiers may
        themselves contain colons.

    Returns
    -------
    :
        A tuple of the namespace and the identifier. For the UBERON and CL
        namespaces the identifier keeps its namespace prefix (e.g.
        ``"UBERON:0002048"``); for all other namespaces the bare identifier
        is returned.
    """
    context_ns, context_id = context.split(":", maxsplit=1)
    # UBERON and CL identifiers are kept as prefixed CURIEs — presumably
    # the INDRA identifier convention for these namespaces; the duplicated
    # per-namespace branches are collapsed into one membership test.
    if context_ns in ("UBERON", "CL"):
        context_id = f"{context_ns}:{context_id}"
    return context_ns, context_id
25 changes: 21 additions & 4 deletions src/indra_cogex/sources/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
from textwrap import dedent

import click
import pystow
from more_click import verbose_option

from . import processor_resolver
from .processor import Processor
from ..assembly import NodeAssembler


@click.command()
Expand All @@ -32,6 +35,7 @@
def main(load: bool, load_only: bool, force: bool):
"""Generate and import Neo4j nodes and edges tables."""
paths = []
na = NodeAssembler()
for processor_cls in processor_resolver:
if not processor_cls.importable:
continue
Expand All @@ -44,21 +48,34 @@ def main(load: bool, load_only: bool, force: bool):
):
click.secho("Processing...", fg="green")
processor = processor_cls()
# FIXME: this is redundant, we get nodes twice
na.add_nodes(list(processor.get_nodes()))
processor.dump()
paths.append((processor_cls.nodes_path, processor_cls.edges_path))

# FIXME: This doesn't work unless the processors are also running and
# getting nodes
nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz")
if not load_only:
if force or not nodes_path.is_file():
# Now create and dump the assembled nodes
assembled_nodes = na.assemble_nodes()
assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id))
Processor._dump_nodes_to_path(assembled_nodes, nodes_path)

if load or load_only:
command = dedent(
"""\
f"""\
neo4j-admin import \\
--database=indra \\
--delimiter='TAB' \\
--skip-duplicate-nodes=true \\
--skip-bad-relationships=true
--skip-bad-relationships=true \\
--nodes {nodes_path}
"""
).rstrip()
for node_path, edge_path in paths:
command += f"\\\n --nodes {node_path} \\\n --relationships {edge_path}"
for _, edge_path in paths:
command += f"\\\n --relationships {edge_path}"

click.secho("Running shell command:")
click.secho(command, fg="blue")
Expand Down
11 changes: 4 additions & 7 deletions src/indra_cogex/sources/goa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,17 @@ def __init__(self):

def get_nodes(self): # noqa:D102
for go_node in self.df["GO_ID"].unique():
yield Node(go_node, ["BioEntity"])
yield Node("GO", go_node, ["BioEntity"])
for hgnc_id in self.df["HGNC_ID"].unique():
yield Node(f"HGNC:{hgnc_id}", ["BioEntity"])
yield Node("HGNC", hgnc_id, ["BioEntity"])

def get_relations(self): # noqa:D102
rel_type = "associated_with"
for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]:
all_ecs = ",".join(sorted(set(ecs)))
source = f"HGNC:{hgnc_id}"
# Note that we don't add the extra GO: by current convention
target = go_id
# Possible properties could be e.g., evidence codes
data = {"evidence_codes:string": all_ecs}
yield Relation(source, target, [rel_type], data)
data = {"evidence_codes:string": all_ecs, "source": self.name}
yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data)


def load_goa(url: str) -> pd.DataFrame:
Expand Down
Loading

0 comments on commit 477df79

Please sign in to comment.