From 0acc17d7dadd3efc09c53634060ed62ef38f5881 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 11:31:20 -0400
Subject: [PATCH 01/34] Implement initial node assembler

---
 src/indra_cogex/assembly/__init__.py | 50 ++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 src/indra_cogex/assembly/__init__.py

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
new file mode 100644
index 000000000..dcf9b88a3
--- /dev/null
+++ b/src/indra_cogex/assembly/__init__.py
@@ -0,0 +1,50 @@
+from collections import defaultdict
+from typing import List
+from indra_cogex.representation import Node
+
+
+class NodeAssembler:
+    def __init__(self, nodes: List[Node]):
+        self.nodes = nodes
+        self.conflicts = []
+
+    def add_nodes(self, nodes: List[Node]):
+        self.nodes += nodes
+
+    def assemble_nodes(self) -> List[Node]:
+        nodes_by_id = defaultdict(list)
+        for node in self.nodes:
+            nodes_by_id[node.identifier].append(node)
+
+        assembled_nodes = [
+            self.get_aggregate_node(identifier, node_group)
+            for identifier, node_group in nodes_by_id.items()
+        ]
+        return assembled_nodes
+
+    def get_aggregate_node(self, identifier: str, nodes: List[Node]) -> Node:
+        labels = set()
+        data = {}
+        for node in nodes:
+            labels |= set(node.labels)  # node.labels may be a list, not a set
+            for data_key, data_val in node.data.items():
+                previous_val = data.get(data_key)
+                if previous_val and previous_val != data_val:
+                    self.conflicts.append(
+                        Conflict(f"{data_key}:{previous_val}", f"{data_key}:{data_val}")
+                    )
+                else:
+                    data[data_key] = data_val
+        return Node(identifier, labels, data)
+
+
+class Conflict:
+    def __init__(self, first, second):
+        self.first = first
+        self.second = second
+
+    def __repr__(self):
+        return str(self)
+
+    def __str__(self):
+        return f"Conflict({self.first}, {self.second})"

From 1fe8a0c5f92a765b1144ef6ba8be8a0c2188d097 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 11:32:45 -0400
Subject: [PATCH 02/34] Split namespace and ID in representation

---
 src/indra_cogex/representation.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py
index 0f558509f..8e24e0a53 100644
--- a/src/indra_cogex/representation.py
+++ b/src/indra_cogex/representation.py
@@ -12,28 +12,39 @@ class Node:
 
     def __init__(
         self,
-        identifier: str,
+        db_ns: str,
+        db_id: str,
         labels: Collection[str],
         data: Optional[Mapping[str, Any]] = None,
     ):
         """Initialize the node.
 
-        :param identifier: The identifier of the node
-        :param labels: The collection of labels for the relation.
-        :param data: The optional data dictionary associated with the node.
+        Parameters
+        ----------
+        db_ns :
+            The namespace associated with the node. Uses the INDRA standard.
+        db_id :
+            The identifier within the namespace associated with the node.
+            Uses the INDRA standard.
+        labels :
+            A collection of labels for the node.
+        data :
+            An optional data dictionary associated with the node.
         """
-        self.identifier = identifier
+        self.db_ns = db_ns
+        self.db_id = db_id
         self.labels = labels
         self.data = data if data else {}
 
     def to_json(self):
         """Serialize the node to JSON."""
         data = {k: v for k, v in self.data.items()}
-        data["id"] = self.identifier
+        data["db_ns"] = self.db_ns
+        data["db_id"] = self.db_id
         return {"labels": self.labels, "data": data}
 
     def _get_data_str(self):
-        pieces = ["id:'%s'" % self.identifier]
+        pieces = ["id:'%s:%s'" % (self.db_ns, self.db_id)]
         for k, v in self.data.items():
             if isinstance(v, str):
                 value = "'" + v.replace("'", "\\'") + "'"
@@ -60,7 +71,9 @@ class Relation:
 
     def __init__(
         self,
+        source_ns: str,
         source_id: str,
+        target_ns: str,
         target_id: str,
         labels: Collection[str],
         data: Optional[Mapping[str, Any]] = None,
@@ -72,7 +85,9 @@ def __init__(
         :param labels: The collection of labels for the relation.
         :param data: The optional data dictionary associated with the relation.
         """
+        self.source_ns = source_ns
         self.source_id = source_id
+        self.target_ns = target_ns
         self.target_id = target_id
         self.labels = list(labels)
         self.data = data if data else {}
@@ -80,7 +95,9 @@ def __init__(
     def to_json(self):
         """Serialize the relation to JSON."""
         return {
+            "source_ns": self.source_ns,
             "source_id": self.source_id,
+            "target_ns": self.target_ns,
             "target_id": self.target_id,
             "labels": self.labels,
             "data": self.data,

From 9c1d8e6a490147b8fe7faa5cde3d4967da1a9f3c Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 11:48:04 -0400
Subject: [PATCH 03/34] Update pathway source identifiers

---
 src/indra_cogex/sources/pathways/__init__.py | 33 ++++++++++++++++++--
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py
index 86d798c3c..770073a48 100644
--- a/src/indra_cogex/sources/pathways/__init__.py
+++ b/src/indra_cogex/sources/pathways/__init__.py
@@ -5,6 +5,9 @@
 import logging
 from typing import ClassVar
 
+from indra.databases import hgnc_client
+from indra.databases import uniprot_client
+from indra.databases.identifiers import get_ns_id_from_identifiers
 import pyobo
 import pyobo.api.utils
 from pyobo.struct import has_part
@@ -31,8 +34,10 @@ def get_nodes(self):  # noqa:D102
         # TODO add license
         version = pyobo.api.utils.get_version(self.prefix)
         for identifier, name in pyobo.get_id_name_mapping("wikipathways").items():
+            db_ns, db_id = get_ns_id_from_identifiers(self.prefix, identifier)
             yield Node(
-                f"{self.prefix}:{identifier}",
+                db_ns,
+                db_id,
                 ["BioEntity"],
                 dict(name=name, version=version),
             )
@@ -40,12 +45,34 @@ def get_nodes(self):  # noqa:D102
     def get_relations(self):  # noqa:D102
         df = pyobo.get_filtered_relations_df(self.prefix, self.relation)
         for identifier, t_prefix, t_identifier in df.values:
+            pathway_ns, pathway_id = get_ns_id_from_identifiers(self.prefix, identifier)
+            gene_ns, gene_id = self.get_gene(t_prefix, t_identifier)
+            if not gene_ns:
+                continue
             yield Relation(
-                f"{self.prefix}:{identifier}",
-                f"{t_prefix}:{t_identifier}",
+                pathway_ns,
+                pathway_id,
+                gene_ns,
+                gene_id,
                 [self.relation_label],
+                dict(source=self.name),
             )
 
+    def get_gene(self, prefix, identifier):
+        if prefix == "ncbigene":
+            hgnc_id = hgnc_client.get_hgnc_from_entrez(identifier)
+            if hgnc_id:
+                return "HGNC", hgnc_id
+            else:
+                return "EGID", identifier
+        elif prefix == "uniprot":
+            hgnc_id = uniprot_client.get_hgnc_id(identifier)
+            if hgnc_id:
+                return "HGNC", hgnc_id
+            else:
+                return "UP", identifier
+        return None, None
+
 
 class WikipathwaysProcessor(PyoboProcessor):
     """Processor for WikiPathways gene-pathway links."""

From ab08e7d290b66eacf1278fe309f175b5b5858241 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 11:48:43 -0400
Subject: [PATCH 04/34] Update namespace/ID standards in sources

---
 src/indra_cogex/sources/bgee/__init__.py         | 13 +++++++++----
 src/indra_cogex/sources/goa/__init__.py          |  9 +++------
 .../sources/indra_ontology/__init__.py           | 16 +++++++---------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py
index 0eb6c2dfc..3983d3161 100644
--- a/src/indra_cogex/sources/bgee/__init__.py
+++ b/src/indra_cogex/sources/bgee/__init__.py
@@ -34,18 +34,23 @@ def __init__(self, path: Union[None, str, Path] = None):
     def get_nodes(self):  # noqa:D102
         for context_id in self.expressions:
             yield Node(
-                context_id,
+                *context_id.split(":", maxsplit=1),
                 ["BioEntity"],
                 data={"name": pyobo.get_name_by_curie(context_id)},
             )
         for hgnc_id in set.union(*[set(v) for v in self.expressions.values()]):
             yield Node(
-                f"HGNC:{hgnc_id}",
+                "HGNC",
+                hgnc_id,
                 ["BioEntity"],
                 data={"name": pyobo.get_name("hgnc", hgnc_id)},
             )
 
     def get_relations(self):  # noqa:D102
-        for context_id, hgnc_ids in self.expressions.items():
+        data = {"source": self.name}
+        for context, hgnc_ids in self.expressions.items():
+            context_ns, context_id = context.split(":", maxsplit=1)
             for hgnc_id in hgnc_ids:
-                yield Relation(f"HGNC:{hgnc_id}", context_id, [self.rel_type])
+                yield Relation(
+                    "HGNC", hgnc_id, context_ns, context_id, [self.rel_type], data
+                )
diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py
index f67fe2f1d..bff2452ea 100644
--- a/src/indra_cogex/sources/goa/__init__.py
+++ b/src/indra_cogex/sources/goa/__init__.py
@@ -44,18 +44,15 @@ def get_nodes(self):  # noqa:D102
         for go_node in self.df["GO_ID"].unique():
             yield Node(go_node, ["BioEntity"])
         for hgnc_id in self.df["HGNC_ID"].unique():
-            yield Node(f"HGNC:{hgnc_id}", ["BioEntity"])
+            yield Node("HGNC", hgnc_id, ["BioEntity"])
 
     def get_relations(self):  # noqa:D102
         rel_type = "associated_with"
         for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]:
             all_ecs = ",".join(sorted(set(ecs)))
-            source = f"HGNC:{hgnc_id}"
-            # Note that we don't add the extra GO: by current convention
-            target = go_id
             # Possible properties could be e.g., evidence codes
-            data = {"evidence_codes:string": all_ecs}
-            yield Relation(source, target, [rel_type], data)
+            data = {"evidence_codes:string": all_ecs, "source": self.name}
+            yield Relation("HGNC", hgnc_id, "GO", go_id, [rel_type], data)
 
 
 def load_goa(url: str) -> pd.DataFrame:
diff --git a/src/indra_cogex/sources/indra_ontology/__init__.py b/src/indra_cogex/sources/indra_ontology/__init__.py
index 928a7919f..8f69957f1 100644
--- a/src/indra_cogex/sources/indra_ontology/__init__.py
+++ b/src/indra_cogex/sources/indra_ontology/__init__.py
@@ -34,17 +34,15 @@ def __init__(self, ontology: Optional[IndraOntology] = None):
 
     def get_nodes(self):  # noqa:D102
         for node, data in self.ontology.nodes(data=True):
-            yield Node(_norm(node), ["BioEntity"], data)
+            db_ns, db_id = self.ontology.get_ns_id(node)
+            yield Node(db_ns, db_id, ["BioEntity"], data)
 
     def get_relations(self):  # noqa:D102
         for source, target, data in self.ontology.edges(data=True):
+            source_ns, source_id = self.ontology.get_ns_id(source)
+            target_ns, target_id = self.ontology.get_ns_id(target)
             data = copy.copy(data)
             edge_type = data.pop("type")
-            yield Relation(_norm(source), _norm(target), [edge_type], data)
-
-
-def _norm(node: str) -> str:
-    ns, identifier = node.split(":", 1)
-    if identifier.startswith(f"{ns}:"):
-        identifier = identifier[len(ns) + 1 :]
-    return f"{ns}:{identifier}"
+            yield Relation(
+                source_ns, source_id, target_ns, target_id, [edge_type], data
+            )

From 2a92bed18a630fb3a9f9a40f4ef3968238a36e53 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 15:02:07 -0400
Subject: [PATCH 05/34] Restructure node generation

---
 src/indra_cogex/sources/indra_db/__init__.py | 34 ++++++++------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index db0d4fb01..3f52c843c 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -42,12 +42,6 @@ def __init__(self, path: Union[None, str, Path] = None):
         logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path)
         self.df = df
         for side in "AB":
-            self.df[side] = [
-                f"{prefix}:{identifier}"
-                for prefix, identifier in self.df[
-                    [f"ag{side}_ns", f"ag{side}_id"]
-                ].values
-            ]
             # A lot of the names in the SIF dump are all over
             self.df[f"ag{side}_name"] = [
                 bio_ontology.get_name(prefix, identifier)
@@ -57,21 +51,19 @@ def __init__(self, path: Union[None, str, Path] = None):
             ]
 
     def get_nodes(self):  # noqa:D102
-        df = (
-            pd.concat([self._get_nodes("A"), self._get_nodes("B")], ignore_index=True)
-            .drop_duplicates()
-            .sort_values("curie")
-        )
-        for curie, name in df.values:
-            yield Node(curie, ["BioEntity"], dict(name=name))
-
-    def _get_nodes(self, side: str) -> pd.DataFrame:
-        return self.df[[side, f"ag{side}_name"]].rename(
-            columns={
-                side: "curie",
-                f"ag{side}_name": "name",
-            }
-        )
+        df = pd.concat(
+            [
+                self.df[["agA_ns", "agA_id", "agA_name"]].rename(
+                    columns={"agA_ns": "ns", "agA_id": "id", "agA_name": "name"}
+                ),
+                self.df[["agB_ns", "agB_id", "agB_name"]].rename(
+                    columns={"agB_ns": "ns", "agB_id": "id", "agB_name": "name"}
+                ),
+            ],
+            ignore_index=True,
+        ).drop_duplicates()  # A and B sides now share the ns/id/name columns
+        for db_ns, db_id, name in df.values:
+            yield Node(db_ns, db_id, ["BioEntity"], dict(name=name))
 
     def get_relations(self):  # noqa:D102
         columns = ["A", "B", "stmt_type", "evidence_count", "stmt_hash"]

From 2e489024634f92e89d387f2ba07fa5515303c64f Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 15:16:07 -0400
Subject: [PATCH 06/34] Restructure relation generation

---
 src/indra_cogex/sources/indra_db/__init__.py | 31 +++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index 3f52c843c..a943f965d 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -66,9 +66,32 @@ def get_nodes(self):  # noqa:D102
             yield Node(db_ns, db_id, ["BioEntity"], dict(name=name))
 
     def get_relations(self):  # noqa:D102
-        columns = ["A", "B", "stmt_type", "evidence_count", "stmt_hash"]
-        for source, target, stmt_type, ev_count, stmt_hash in (
+        columns = [
+            "agA_ns",
+            "agA_id",
+            "agB_ns",
+            "agB_id",
+            "stmt_type",
+            "source_counts",
+            "stmt_hash",
+        ]
+        for (
+            source_ns,
+            source_id,
+            target_ns,
+            target_id,
+            stmt_type,
+            source_counts,
+            stmt_hash,
+        ) in (
             self.df[columns].drop_duplicates().values
         ):
-            data = {"stmt_hash:long": stmt_hash, "evidence_count:long": ev_count}
-            yield Relation(source, target, [stmt_type], data)
+            data = {"stmt_hash:long": stmt_hash, "evidence_count:string": source_counts}
+            yield Relation(
+                source_ns,
+                source_id,
+                target_ns,
+                target_id,
+                [stmt_type, "Statement"],
+                data,
+            )

From e554bcf98e35a350e074cfd6a2ed55bc90c5b88c Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 15:27:28 -0400
Subject: [PATCH 07/34] Reformat processor with new ID structure

---
 src/indra_cogex/sources/processor.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index 90d64fece..4eb743ca9 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -14,6 +14,8 @@
 from more_click import verbose_option
 from tqdm import tqdm
 
+from indra.databases import identifiers
+
 from indra_cogex.representation import Node, Relation
 
 __all__ = [
@@ -76,11 +78,11 @@ def _dump_nodes(self) -> Path:
         if self.nodes_path.is_file():
             return self.nodes_path
 
-        nodes = sorted(self.get_nodes(), key=attrgetter("identifier"))
+        nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id))
         metadata = sorted(set(key for node in nodes for key in node.data))
         node_rows = (
             (
-                node.identifier,
+                norm_id(node.db_ns, node.db_id),
                 "|".join(node.labels),
                 *[node.data.get(key, "") for key in metadata],
             )
@@ -126,12 +128,14 @@ def _dump_edges(self) -> Path:
             return self.edges_path
 
         rels = self.get_relations()
-        rels = sorted(rels, key=lambda r: (r.source_id, r.target_id))
+        rels = sorted(
+            rels, key=lambda r: (r.source_ns, r.source_id, r.target_ns, r.target_id)
+        )
         metadata = sorted(set(key for rel in rels for key in rel.data))
         edge_rows = (
             (
-                rel.source_id,
-                rel.target_id,
+                norm_id(rel.source_ns, rel.source_id),
+                norm_id(rel.target_ns, rel.target_id),
                 "|".join(sorted(rel.labels)),
                 *[rel.data.get(key) for key in metadata],
             )
@@ -153,3 +157,14 @@ def _dump_edges(self) -> Path:
             # Write remaining edges
             edge_writer.writerows(edge_rows)
         return self.edges_path
+
+
+def norm_id(db_ns, db_id):
+    identifiers_ns = identifiers.get_identifiers_ns(db_ns)
+    identifiers_id = db_id
+    ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get(
+        "namespace_embedded"
+    )
+    if ns_embedded:
+        identifiers_id = identifiers_id[len(identifiers_ns) :]
+    return f"{identifiers_ns}:{identifiers_id}"

From 1eee55aae478188165d4c2f9da525a6a4db5e217 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 15:34:03 -0400
Subject: [PATCH 08/34] Test ID normalization

---
 src/indra_cogex/sources/processor.py | 2 +-
 tests/test_processor.py              | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_processor.py

diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index 4eb743ca9..d22408056 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -166,5 +166,5 @@ def norm_id(db_ns, db_id):
         "namespace_embedded"
     )
     if ns_embedded:
-        identifiers_id = identifiers_id[len(identifiers_ns) :]
+        identifiers_id = identifiers_id[len(identifiers_ns) + 1 :]
     return f"{identifiers_ns}:{identifiers_id}"
diff --git a/tests/test_processor.py b/tests/test_processor.py
new file mode 100644
index 000000000..6abaa4c44
--- /dev/null
+++ b/tests/test_processor.py
@@ -0,0 +1,6 @@
+from indra_cogex.sources.processor import norm_id
+
+
+def test_norm_id():
+    assert norm_id("UP", "P12345") == "uniprot:P12345"
+    assert norm_id("CHEBI", "CHEBI:12345") == "chebi:12345"

From 5950634a19a9c540db2c33ecdd6f25ef97e1f946 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 15:54:17 -0400
Subject: [PATCH 09/34] Implement node assembly upon import

---
 src/indra_cogex/sources/cli.py       | 16 +++++++++--
 src/indra_cogex/sources/processor.py | 41 ++++++++++------------------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py
index bdc4b0014..8e892d087 100644
--- a/src/indra_cogex/sources/cli.py
+++ b/src/indra_cogex/sources/cli.py
@@ -6,9 +6,12 @@
 from textwrap import dedent
 
 import click
+import pystow
 from more_click import verbose_option
 
 from . import processor_resolver
+from .processor import Processor
+from ..assembly import NodeAssembler
 
 
 @click.command()
@@ -32,6 +35,7 @@
 def main(load: bool, load_only: bool, force: bool):
     """Generate and import Neo4j nodes and edges tables."""
     paths = []
+    na = NodeAssembler()
     for processor_cls in processor_resolver:
         if not processor_cls.importable:
             continue
@@ -44,9 +48,17 @@ def main(load: bool, load_only: bool, force: bool):
             ):
                 click.secho("Processing...", fg="green")
                 processor = processor_cls()
+                na.add_nodes(list(processor.get_nodes()))
                 processor.dump()
         paths.append((processor_cls.nodes_path, processor_cls.edges_path))
 
+    # Now create and dump the assembled nodes
+    assembled_nodes = na.assemble_nodes()
+    assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id))
+    metadata = sorted(set(key for node in assembled_nodes for key in node.data))
+    nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz")
+    Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path)
+
     if load or load_only:
         command = dedent(
             """\
@@ -57,8 +69,8 @@ def main(load: bool, load_only: bool, force: bool):
           --skip-bad-relationships=true
         """
         ).rstrip()
-        for node_path, edge_path in paths:
-            command += f"\\\n  --nodes {node_path} \\\n  --relationships {edge_path}"
+        for _, edge_path in paths:
+            command += f"\\\n  --relationships {edge_path}"
 
         click.secho("Running shell command:")
         click.secho(command, fg="blue")
diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index d22408056..6d3dc237b 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -80,6 +80,10 @@ def _dump_nodes(self) -> Path:
 
         nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id))
         metadata = sorted(set(key for node in nodes for key in node.data))
+        self._dump_nodes_to_path(nodes, metadata, self.nodes_path, sample_path)
+
+    @staticmethod
+    def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None):
         node_rows = (
             (
                 norm_id(node.db_ns, node.db_id),
@@ -89,38 +93,23 @@ def _dump_nodes(self) -> Path:
             for node in tqdm(nodes, desc="Nodes", unit_scale=True)
         )
 
-        with gzip.open(self.nodes_path, mode="wt") as node_file:
+        with gzip.open(nodes_path, mode="wt") as node_file:
             node_writer = csv.writer(node_file, delimiter="\t")  # type: ignore
-            with sample_path.open("w") as node_sample_file:
-                node_sample_writer = csv.writer(node_sample_file, delimiter="\t")
-
-                header = "id:ID", ":LABEL", *metadata
-                node_sample_writer.writerow(header)
-                node_writer.writerow(header)
+            if sample_path:
+                with sample_path.open("w") as node_sample_file:
+                    node_sample_writer = csv.writer(node_sample_file, delimiter="\t")
 
-                for _, node_row in zip(range(10), node_rows):
-                    node_sample_writer.writerow(node_row)
-                    node_writer.writerow(node_row)
+                    header = "id:ID", ":LABEL", *metadata
+                    node_sample_writer.writerow(header)
+                    node_writer.writerow(header)
 
+                    for _, node_row in zip(range(10), node_rows):
+                        node_sample_writer.writerow(node_row)
+                        node_writer.writerow(node_row)
             # Write remaining nodes
             node_writer.writerows(node_rows)
 
-            # cypher = dedent(f'''\
-            #     CREATE CONSTRAINT ON (n:{ntype}) ASSERT n.id IS UNIQUE;
-            #     USING PERIODIC COMMIT
-            #     LOAD CSV WITH HEADERS FROM "file://{data_path.as_posix()}" AS row FIELDTERMINATOR '\\t'
-            #     MERGE (n:{ntype} {{ id: row.identifier }})
-            #     ''')
-            # if metadata:
-            #     creates = '\n'.join(
-            #         f'n.{key} = row.{key}'
-            #         for key in metadata
-            #     )
-            #     cypher += f'ON CREATE SET {creates}'
-            # with cypher_path.open('w') as file:
-            #     print(cypher, file=file)
-
-        return self.nodes_path
+        return nodes_path
 
     def _dump_edges(self) -> Path:
         sample_path = self.module.join(name="edges_sample.tsv")

From 482ddd71b7d6d1e8319712e8b44b22ded8d09ebe Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 15:57:34 -0400
Subject: [PATCH 10/34] Use new representation in assembler

---
 src/indra_cogex/assembly/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
index dcf9b88a3..0bc63839b 100644
--- a/src/indra_cogex/assembly/__init__.py
+++ b/src/indra_cogex/assembly/__init__.py
@@ -14,15 +14,15 @@ def add_nodes(self, nodes: List[Node]):
     def assemble_nodes(self) -> List[Node]:
         nodes_by_id = defaultdict(list)
         for node in self.nodes:
-            nodes_by_id[node.identifier].append(node)
+            nodes_by_id[(node.db_ns, node.db_id)].append(node)
 
         assembled_nodes = [
-            self.get_aggregate_node(identifier, node_group)
-            for identifier, node_group in nodes_by_id.items()
+            self.get_aggregate_node(db_ns, db_id, node_group)
+            for (db_ns, db_id), node_group in nodes_by_id.items()
         ]
         return assembled_nodes
 
-    def get_aggregate_node(self, identifier: str, nodes: List[Node]) -> Node:
+    def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node:
         labels = set()
         data = {}
         for node in nodes:
@@ -35,7 +35,7 @@ def get_aggregate_node(self, identifier: str, nodes: List[Node]) -> Node:
                     )
                 else:
                     data[data_key] = data_val
-        return Node(identifier, labels, data)
+        return Node(db_ns, db_id, labels, data)
 
 
 class Conflict:

From e36e9668beb276819be54589669d9f63671d44fd Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 16:29:54 -0400
Subject: [PATCH 11/34] Fix Bgee source identifiers

---
 src/indra_cogex/assembly/__init__.py     |  6 +++---
 src/indra_cogex/representation.py        |  2 ++
 src/indra_cogex/sources/bgee/__init__.py | 15 ++++++++++++---
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
index 0bc63839b..cd59194a2 100644
--- a/src/indra_cogex/assembly/__init__.py
+++ b/src/indra_cogex/assembly/__init__.py
@@ -1,11 +1,11 @@
 from collections import defaultdict
-from typing import List
+from typing import List, Optional
 from indra_cogex.representation import Node
 
 
 class NodeAssembler:
-    def __init__(self, nodes: List[Node]):
-        self.nodes = nodes
+    def __init__(self, nodes: Optional[List[Node]] = None):
+        self.nodes = nodes if nodes else []
         self.conflicts = []
 
     def add_nodes(self, nodes: List[Node]):
diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py
index 8e24e0a53..64169eeec 100644
--- a/src/indra_cogex/representation.py
+++ b/src/indra_cogex/representation.py
@@ -31,6 +31,8 @@ def __init__(
         data :
             An optional data dictionary associated with the node.
         """
+        if not db_ns or not db_id:
+            raise ValueError("Invalid namespace or ID.")
         self.db_ns = db_ns
         self.db_id = db_id
         self.labels = labels
diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py
index 3983d3161..188cd0e43 100644
--- a/src/indra_cogex/sources/bgee/__init__.py
+++ b/src/indra_cogex/sources/bgee/__init__.py
@@ -32,9 +32,11 @@ def __init__(self, path: Union[None, str, Path] = None):
             self.expressions = pickle.load(fh)
 
     def get_nodes(self):  # noqa:D102
-        for context_id in self.expressions:
+        for context in self.expressions:
+            context_ns, context_id = get_context(context)
             yield Node(
-                *context_id.split(":", maxsplit=1),
+                context_ns,
+                context_id,
                 ["BioEntity"],
                 data={"name": pyobo.get_name_by_curie(context_id)},
             )
@@ -49,8 +51,15 @@ def get_nodes(self):  # noqa:D102
     def get_relations(self):  # noqa:D102
         data = {"source": self.name}
         for context, hgnc_ids in self.expressions.items():
-            context_ns, context_id = context.split(":", maxsplit=1)
+            context_ns, context_id = get_context(context)
             for hgnc_id in hgnc_ids:
                 yield Relation(
                     "HGNC", hgnc_id, context_ns, context_id, [self.rel_type], data
                 )
+
+
+def get_context(context):
+    context_ns, context_id = context.split(":", maxsplit=1)
+    if context_ns == "UBERON":
+        context_id = f"UBERON:{context_id}"
+    return context_ns, context_id

From d6174fbad7d2dda2eb17e26ecc051e3359c9e0b0 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 16:32:59 -0400
Subject: [PATCH 12/34] Fix a corner case for identifiers mapping

---
 src/indra_cogex/sources/processor.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index 6d3dc237b..9107fabf4 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -151,9 +151,12 @@ def _dump_edges(self) -> Path:
 def norm_id(db_ns, db_id):
     identifiers_ns = identifiers.get_identifiers_ns(db_ns)
     identifiers_id = db_id
-    ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get(
-        "namespace_embedded"
-    )
-    if ns_embedded:
-        identifiers_id = identifiers_id[len(identifiers_ns) + 1 :]
+    if not identifiers_ns:
+        identifiers_ns = db_ns.lower()
+    else:
+        ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get(
+            "namespace_embedded"
+        )
+        if ns_embedded:
+            identifiers_id = identifiers_id[len(identifiers_ns) + 1 :]
     return f"{identifiers_ns}:{identifiers_id}"

From 104e0874870da72bcf131cbe7b86783d732b761f Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 19:20:23 -0400
Subject: [PATCH 13/34] Implement validation and fix Relation str

---
 src/indra_cogex/representation.py        |  5 ++++-
 src/indra_cogex/sources/bgee/__init__.py |  2 ++
 src/indra_cogex/sources/processor.py     | 27 +++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py
index 64169eeec..7b5581d60 100644
--- a/src/indra_cogex/representation.py
+++ b/src/indra_cogex/representation.py
@@ -108,7 +108,10 @@ def to_json(self):
     def __str__(self):  # noqa:D105
         data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()])
         labels_str = ":".join(self.labels)
-        return f"({self.source_id})-[:{labels_str} {data_str}]->" f"({self.target_id})"
+        return (
+            f"({self.source_ns}, {self.source_id})-[:{labels_str} {data_str}]->"
+            f"({self.target_ns}, {self.target_id})"
+        )
 
     def __repr__(self):  # noqa:D105
         return str(self)
diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py
index 188cd0e43..e773490f9 100644
--- a/src/indra_cogex/sources/bgee/__init__.py
+++ b/src/indra_cogex/sources/bgee/__init__.py
@@ -62,4 +62,6 @@ def get_context(context):
     context_ns, context_id = context.split(":", maxsplit=1)
     if context_ns == "UBERON":
         context_id = f"UBERON:{context_id}"
+    elif context_ns == "CL":
+        context_id = f"CL:{context_id}"
     return context_ns, context_id
diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index 9107fabf4..ef5c28ee1 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -4,8 +4,8 @@
 
 import csv
 import gzip
+import logging
 from abc import ABC, abstractmethod
-from operator import attrgetter
 from pathlib import Path
 from typing import ClassVar, Iterable
 
@@ -15,6 +15,7 @@
 from tqdm import tqdm
 
 from indra.databases import identifiers
+from indra.statements.validate import assert_valid_db_refs
 
 from indra_cogex.representation import Node, Relation
 
@@ -22,6 +23,7 @@
     "Processor",
 ]
 
+logger = logging.getLogger(__name__)
 
 # deal with importing from wherever with
 #  https://stackoverflow.com/questions/36922843/neo4j-3-x-load-csv-absolute-file-path
@@ -84,6 +86,7 @@ def _dump_nodes(self) -> Path:
 
     @staticmethod
     def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None):
+        nodes = list(validate_nodes(nodes))
         node_rows = (
             (
                 norm_id(node.db_ns, node.db_id),
@@ -117,6 +120,7 @@ def _dump_edges(self) -> Path:
             return self.edges_path
 
         rels = self.get_relations()
+        rels = validate_relations(rels)
         rels = sorted(
             rels, key=lambda r: (r.source_ns, r.source_id, r.target_ns, r.target_id)
         )
@@ -160,3 +164,24 @@ def norm_id(db_ns, db_id):
         if ns_embedded:
             identifiers_id = identifiers_id[len(identifiers_ns) + 1 :]
     return f"{identifiers_ns}:{identifiers_id}"
+
+
+def validate_nodes(nodes):
+    for idx, node in enumerate(nodes):
+        try:
+            assert_valid_db_refs({node.db_ns: node.db_id})
+            yield node
+        except Exception as e:
+            logger.info(f"{idx}: {node} - {e}")
+            continue
+
+
+def validate_relations(relations):
+    for idx, rel in enumerate(relations):
+        try:
+            assert_valid_db_refs({rel.source_ns: rel.source_id})
+            assert_valid_db_refs({rel.target_ns: rel.target_id})
+            yield rel
+        except Exception as e:
+            logger.info(f"{idx}: {rel} - {e}")
+            continue

From 877e41d0dfd425625854f3a3fc34c4026c70f542 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 19:23:45 -0400
Subject: [PATCH 14/34] Fix GO node construction

---
 src/indra_cogex/sources/goa/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py
index bff2452ea..a97155d88 100644
--- a/src/indra_cogex/sources/goa/__init__.py
+++ b/src/indra_cogex/sources/goa/__init__.py
@@ -42,7 +42,7 @@ def __init__(self):
 
     def get_nodes(self):  # noqa:D102
         for go_node in self.df["GO_ID"].unique():
-            yield Node(go_node, ["BioEntity"])
+            yield Node("GO", go_node, ["BioEntity"])
         for hgnc_id in self.df["HGNC_ID"].unique():
             yield Node("HGNC", hgnc_id, ["BioEntity"])
 

From 857d4e40e0b2fa2b362a3d00f7f2fc75de2e5634 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 19:35:31 -0400
Subject: [PATCH 15/34] Fix pathways prefix and add main

---
 src/indra_cogex/sources/pathways/__init__.py | 6 +-----
 src/indra_cogex/sources/pathways/__main__.py | 9 +++++++++
 2 files changed, 10 insertions(+), 5 deletions(-)
 create mode 100644 src/indra_cogex/sources/pathways/__main__.py

diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py
index 770073a48..5f7c4111d 100644
--- a/src/indra_cogex/sources/pathways/__init__.py
+++ b/src/indra_cogex/sources/pathways/__init__.py
@@ -33,7 +33,7 @@ class PyoboProcessor(Processor):
     def get_nodes(self):  # noqa:D102
         # TODO add license
         version = pyobo.api.utils.get_version(self.prefix)
-        for identifier, name in pyobo.get_id_name_mapping("wikipathways").items():
+        for identifier, name in pyobo.get_id_name_mapping(self.prefix).items():
             db_ns, db_id = get_ns_id_from_identifiers(self.prefix, identifier)
             yield Node(
                 db_ns,
@@ -92,7 +92,3 @@ class ReactomeProcessor(PyoboProcessor):
     relation = has_part
     relation_label = "haspart"
     importable = True
-
-
-if __name__ == "__main__":
-    WikipathwaysProcessor.cli()
diff --git a/src/indra_cogex/sources/pathways/__main__.py b/src/indra_cogex/sources/pathways/__main__.py
new file mode 100644
index 000000000..2e9b46bff
--- /dev/null
+++ b/src/indra_cogex/sources/pathways/__main__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+"""Run the pathways processor using ``python -m indra_cogex.sources.pathways``."""
+
+from . import ReactomeProcessor, WikipathwaysProcessor
+
+if __name__ == "__main__":
+    ReactomeProcessor.cli()
+    WikipathwaysProcessor.cli()

From 259d75acaaf3a0fe2aebcd8db2e4280cd9ce922e Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 19:40:01 -0400
Subject: [PATCH 16/34] Handle UP isoforms

---
 src/indra_cogex/sources/pathways/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py
index 5f7c4111d..88eab72bf 100644
--- a/src/indra_cogex/sources/pathways/__init__.py
+++ b/src/indra_cogex/sources/pathways/__init__.py
@@ -66,6 +66,10 @@ def get_gene(self, prefix, identifier):
             else:
                 return "EGID", identifier
         elif prefix == "uniprot":
+            # Some of the UniProt IDs are isoforms, for now, we just strip
+            # these off. We could do something more principled later.
+            if "-" in identifier:
+                identifier, _ = identifier.split("-")
             hgnc_id = uniprot_client.get_hgnc_id(identifier)
             if hgnc_id:
                 return "HGNC", hgnc_id

From 58b3b9235969de303c1a1d832e363a78c8ab281b Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 21:17:09 -0400
Subject: [PATCH 17/34] Implement ID fixing for SIF dump

---
 src/indra_cogex/sources/indra_db/__init__.py | 24 +++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index a943f965d..d36dbe2a5 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -2,9 +2,11 @@
 
 """Processor for the INDRA database."""
 
+import json
 import logging
 import pickle
 from pathlib import Path
+from tqdm import tqdm
 from typing import Union
 
 import humanize
@@ -12,10 +14,12 @@
 import pystow
 
 from indra.ontology.bio import bio_ontology
+from indra.databases.identifiers import ensure_prefix_if_needed
 from indra_cogex.representation import Node, Relation
 from indra_cogex.sources.processor import Processor
 
 logger = logging.getLogger(__name__)
+tqdm.pandas()
 
 
 # If you don't have the data, get it from:
@@ -41,6 +45,7 @@ def __init__(self, path: Union[None, str, Path] = None):
             df = pickle.load(fh)
         logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path)
         self.df = df
+        logger.info("Fixing ID and naming issues...")
         for side in "AB":
             # A lot of the names in the SIF dump are all over
             self.df[f"ag{side}_name"] = [
@@ -49,15 +54,19 @@ def __init__(self, path: Union[None, str, Path] = None):
                     [f"ag{side}_ns", f"ag{side}_id"]
                 ].values
             ]
+            self.df[f"ag{side}_id"] = self.df.progress_apply(
+                lambda row: fix_id(row[f"ag{side}_ns"], row[f"ag{side}_id"]), axis=1
+            )
+        self.df["source_counts"] = self.df["source_counts"].apply(json.dumps)
 
     def get_nodes(self):  # noqa:D102
         df = pd.concat(
             [
                 self.df[["agA_ns", "agA_id", "agA_name"]].rename(
-                    {"agA_ns": "ns", "agA_id": "id", "agA_name": "name"}
+                    columns={"agA_ns": "ns", "agA_id": "id", "agA_name": "name"}
                 ),
                 self.df[["agB_ns", "agB_id", "agB_name"]].rename(
-                    {"agB_ns": "ns", "agB_id": "id", "agB_name": "name"}
+                    columns={"agB_ns": "ns", "agB_id": "id", "agB_name": "name"}
                 ),
             ],
             ignore_index=True,
@@ -86,7 +95,7 @@ def get_relations(self):  # noqa:D102
         ) in (
             self.df[columns].drop_duplicates().values
         ):
-            data = {"stmt_hash:long": stmt_hash, "evidence_count:str": source_counts}
+            data = {"stmt_hash:long": stmt_hash, "evidence_count:string": source_counts}
             yield Relation(
                 source_ns,
                 source_id,
@@ -95,3 +104,12 @@ def get_relations(self):  # noqa:D102
                 [stmt_type, "Statement"],
                 data,
             )
+
+
+def fix_id(db_ns, db_id):
+    if db_ns == "GO":
+        if db_id.isnumeric():
+            db_id = "0" * (7 - len(db_id)) + db_id
+
+    db_id = ensure_prefix_if_needed(db_ns, db_id)
+    return db_id

From 86df4c2aaa66fbf3c6cbcd104211c0e9894afb57 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 21:26:32 -0400
Subject: [PATCH 18/34] Improve the import approach, still issues to fix

---
 src/indra_cogex/sources/cli.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py
index 8e892d087..007c96744 100644
--- a/src/indra_cogex/sources/cli.py
+++ b/src/indra_cogex/sources/cli.py
@@ -48,16 +48,21 @@ def main(load: bool, load_only: bool, force: bool):
             ):
                 click.secho("Processing...", fg="green")
                 processor = processor_cls()
+                # FIXME: this is redundant, we get nodes twice
                 na.add_nodes(list(processor.get_nodes()))
                 processor.dump()
         paths.append((processor_cls.nodes_path, processor_cls.edges_path))
 
-    # Now create and dump the assembled nodes
-    assembled_nodes = na.assemble_nodes()
-    assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id))
-    metadata = sorted(set(key for node in assembled_nodes for key in node.data))
+    # FIXME: This doesn't work unless the processors are also running and
+    # getting nodes
     nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz")
-    Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path)
+    if not load_only:
+        if force or not nodes_path.is_file():
+            # Now create and dump the assembled nodes
+            assembled_nodes = na.assemble_nodes()
+            assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id))
+            metadata = sorted(set(key for node in assembled_nodes for key in node.data))
+            Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path)
 
     if load or load_only:
         command = dedent(
@@ -66,8 +71,10 @@ def main(load: bool, load_only: bool, force: bool):
           --database=indra \\
           --delimiter='TAB' \\
           --skip-duplicate-nodes=true \\
-          --skip-bad-relationships=true
+          --skip-bad-relationships=true \\
+          --nodes %s
         """
+            % nodes_path
         ).rstrip()
         for _, edge_path in paths:
             command += f"\\\n  --relationships {edge_path}"

From 1d57332bd5105771e5770edc5d2a1a8bd1686dc5 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 21:28:41 -0400
Subject: [PATCH 19/34] Switch to f-string

---
 src/indra_cogex/sources/cli.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py
index 007c96744..a4767572e 100644
--- a/src/indra_cogex/sources/cli.py
+++ b/src/indra_cogex/sources/cli.py
@@ -66,15 +66,14 @@ def main(load: bool, load_only: bool, force: bool):
 
     if load or load_only:
         command = dedent(
-            """\
+            f"""\
         neo4j-admin import \\
           --database=indra \\
           --delimiter='TAB' \\
           --skip-duplicate-nodes=true \\
           --skip-bad-relationships=true \\
-          --nodes %s
+          --nodes {nodes_path}
         """
-            % nodes_path
         ).rstrip()
         for _, edge_path in paths:
             command += f"\\\n  --relationships {edge_path}"

From 3b7b2cf1d95c63ea9dca2f3b3a7dab636b9672e4 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 21:47:05 -0400
Subject: [PATCH 20/34] Fix assembly code

---
 src/indra_cogex/assembly/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
index cd59194a2..d5456171c 100644
--- a/src/indra_cogex/assembly/__init__.py
+++ b/src/indra_cogex/assembly/__init__.py
@@ -26,12 +26,12 @@ def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node:
         labels = set()
         data = {}
         for node in nodes:
-            labels |= node.labels
+            labels |= set(node.labels)
             for data_key, data_val in node.data.items():
                 previous_val = data.get(data_key)
                 if previous_val and previous_val != data_val:
                     self.conflicts.append(
-                        Conflict(f"{data_key}:{previous_val}"), f"{data_key}:{data_val}"
+                        Conflict(f"{data_key}:{previous_val}", f"{data_key}:{data_val}")
                     )
                 else:
                     data[data_key] = data_val

From 434f9a0947c691ab882c6442371c0432b94d22f7 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 21:57:30 -0400
Subject: [PATCH 21/34] Change implementation of conflicts and test

---
 src/indra_cogex/assembly/__init__.py | 14 +++++-----
 tests/test_assembly.py               | 40 ++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 7 deletions(-)
 create mode 100644 tests/test_assembly.py

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
index d5456171c..165a7fe8c 100644
--- a/src/indra_cogex/assembly/__init__.py
+++ b/src/indra_cogex/assembly/__init__.py
@@ -30,21 +30,21 @@ def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node:
             for data_key, data_val in node.data.items():
                 previous_val = data.get(data_key)
                 if previous_val and previous_val != data_val:
-                    self.conflicts.append(
-                        Conflict(f"{data_key}:{previous_val}", f"{data_key}:{data_val}")
-                    )
+                    self.conflicts.append(Conflict(data_key, previous_val, data_val))
                 else:
                     data[data_key] = data_val
+        labels = sorted(labels)
         return Node(db_ns, db_id, labels, data)
 
 
 class Conflict:
-    def __init__(self, first, second):
-        self.first = first
-        self.second = second
+    def __init__(self, key, val1, val2):
+        self.key = key
+        self.val1 = val1
+        self.val2 = val2
 
     def __repr__(self):
         return str(self)
 
     def __str__(self):
-        return f"Conflict({self.first}, {self.second})"
+        return f"Conflict({self.key}, {self.val1}, {self.val2})"
diff --git a/tests/test_assembly.py b/tests/test_assembly.py
new file mode 100644
index 000000000..e6185c38d
--- /dev/null
+++ b/tests/test_assembly.py
@@ -0,0 +1,40 @@
+from indra_cogex.assembly import NodeAssembler
+from indra_cogex.representation import Node
+
+
+def test_add_nodes():
+    na = NodeAssembler([Node("x", "y", ["l"])])
+    assert len(na.nodes) == 1
+    na.add_nodes([Node("y", "z", ["l"])])
+    assert len(na.nodes) == 2
+
+
+def test_merge_properties():
+    n1 = Node("ns", "id", ["l"], {"k1": "v1"})
+    n2 = Node("ns", "id", ["l"], {"k2": "v2"})
+    na = NodeAssembler([n1, n2])
+    ans = na.assemble_nodes()
+    assert len(ans) == 1
+    assert ans[0].data == {"k1": "v1", "k2": "v2"}
+
+
+def test_merge_labels():
+    n1 = Node("ns", "id", ["l1", "l2"])
+    n2 = Node("ns", "id", ["l2", "l3"])
+    na = NodeAssembler([n1, n2])
+    ans = na.assemble_nodes()
+    assert len(ans) == 1
+    assert set(ans[0].labels) == {"l1", "l2", "l3"}
+
+
+def test_merge_conflict():
+    n1 = Node("ns", "id", ["l"], {"k1": "v1"})
+    n2 = Node("ns", "id", ["l"], {"k1": "v2"})
+    na = NodeAssembler([n1, n2])
+    ans = na.assemble_nodes()
+    assert len(ans) == 1
+    assert ans[0].data == {"k1": "v1"}
+    assert len(na.conflicts) == 1
+    assert na.conflicts[0].key == "k1"
+    assert na.conflicts[0].val1 == "v1"
+    assert na.conflicts[0].val2 == "v2"

From b890977d40e2c3b5fb2eaf20c0cfc9e4e4e248a7 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Thu, 13 May 2021 22:11:36 -0400
Subject: [PATCH 22/34] Fix assembled node dumping

---
 src/indra_cogex/sources/cli.py       |  3 +--
 src/indra_cogex/sources/processor.py | 12 +++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py
index a4767572e..f29759ae7 100644
--- a/src/indra_cogex/sources/cli.py
+++ b/src/indra_cogex/sources/cli.py
@@ -61,8 +61,7 @@ def main(load: bool, load_only: bool, force: bool):
             # Now create and dump the assembled nodes
             assembled_nodes = na.assemble_nodes()
             assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id))
-            metadata = sorted(set(key for node in assembled_nodes for key in node.data))
-            Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path)
+            Processor._dump_nodes_to_path(assembled_nodes, nodes_path)
 
     if load or load_only:
         command = dedent(
diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index ef5c28ee1..8af4aec80 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -81,12 +81,12 @@ def _dump_nodes(self) -> Path:
             return self.nodes_path
 
         nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id))
-        metadata = sorted(set(key for node in nodes for key in node.data))
-        self._dump_nodes_to_path(nodes, metadata, self.nodes_path, sample_path)
+        self._dump_nodes_to_path(nodes, self.nodes_path, sample_path)
 
     @staticmethod
-    def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None):
+    def _dump_nodes_to_path(nodes, nodes_path, sample_path=None):
         nodes = list(validate_nodes(nodes))
+        metadata = sorted(set(key for node in nodes for key in node.data))
         node_rows = (
             (
                 norm_id(node.db_ns, node.db_id),
@@ -96,16 +96,14 @@ def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None):
             for node in tqdm(nodes, desc="Nodes", unit_scale=True)
         )
 
+        header = "id:ID", ":LABEL", *metadata
         with gzip.open(nodes_path, mode="wt") as node_file:
             node_writer = csv.writer(node_file, delimiter="\t")  # type: ignore
+            node_writer.writerow(header)
             if sample_path:
                 with sample_path.open("w") as node_sample_file:
                     node_sample_writer = csv.writer(node_sample_file, delimiter="\t")
-
-                    header = "id:ID", ":LABEL", *metadata
                     node_sample_writer.writerow(header)
-                    node_writer.writerow(header)
-
                     for _, node_row in zip(range(10), node_rows):
                         node_sample_writer.writerow(node_row)
                         node_writer.writerow(node_row)

From a93c41a6faaec376a229ebab8dbce6b06facf97d Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Fri, 14 May 2021 00:40:32 -0400
Subject: [PATCH 23/34] Change labels to rel_type and fix node label separator

---
 src/indra_cogex/representation.py                  | 11 +++++------
 src/indra_cogex/sources/bgee/__init__.py           |  2 +-
 src/indra_cogex/sources/goa/__init__.py            |  2 +-
 src/indra_cogex/sources/indra_db/__init__.py       |  4 ++--
 src/indra_cogex/sources/indra_ontology/__init__.py |  4 +---
 src/indra_cogex/sources/pathways/__init__.py       |  2 +-
 src/indra_cogex/sources/processor.py               |  4 ++--
 7 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py
index 7b5581d60..4d3000413 100644
--- a/src/indra_cogex/representation.py
+++ b/src/indra_cogex/representation.py
@@ -77,21 +77,21 @@ def __init__(
         source_id: str,
         target_ns: str,
         target_id: str,
-        labels: Collection[str],
+        rel_type: str,
         data: Optional[Mapping[str, Any]] = None,
     ):
         """Initialize the relation.
 
         :param source_id: The identifier of the source node
         :param target_id: The identifier of the target node
-        :param labels: The collection of labels for the relation.
+        :param rel_type: The relation's type.
         :param data: The optional data dictionary associated with the relation.
         """
         self.source_ns = source_ns
         self.source_id = source_id
         self.target_ns = target_ns
         self.target_id = target_id
-        self.labels = list(labels)
+        self.rel_type = rel_type
         self.data = data if data else {}
 
     def to_json(self):
@@ -101,15 +101,14 @@ def to_json(self):
             "source_id": self.source_id,
             "target_ns": self.target_ns,
             "target_id": self.target_id,
-            "labels": self.labels,
+            "rel_type": self.rel_type,
             "data": self.data,
         }
 
     def __str__(self):  # noqa:D105
         data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()])
-        labels_str = ":".join(self.labels)
         return (
-            f"({self.source_ns}, {self.source_id})-[:{labels_str} {data_str}]->"
+            f"({self.source_ns}, {self.source_id})-[:{self.rel_type} {data_str}]->"
             f"({self.target_ns}, {self.target_id})"
         )
 
diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py
index e773490f9..79dae97ce 100644
--- a/src/indra_cogex/sources/bgee/__init__.py
+++ b/src/indra_cogex/sources/bgee/__init__.py
@@ -54,7 +54,7 @@ def get_relations(self):  # noqa:D102
             context_ns, context_id = get_context(context)
             for hgnc_id in hgnc_ids:
                 yield Relation(
-                    "HGNC", hgnc_id, context_ns, context_id, [self.rel_type], data
+                    "HGNC", hgnc_id, context_ns, context_id, self.rel_type, data
                 )
 
 
diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py
index a97155d88..44ca831ed 100644
--- a/src/indra_cogex/sources/goa/__init__.py
+++ b/src/indra_cogex/sources/goa/__init__.py
@@ -52,7 +52,7 @@ def get_relations(self):  # noqa:D102
             all_ecs = ",".join(sorted(set(ecs)))
             # Possible properties could be e.g., evidence codes
             data = {"evidence_codes:string": all_ecs, "source": self.name}
-            yield Relation("HGNC", hgnc_id, "GO", go_id, [rel_type], data)
+            yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data)
 
 
 def load_goa(url: str) -> pd.DataFrame:
diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index d36dbe2a5..01ebf9a0a 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -95,13 +95,13 @@ def get_relations(self):  # noqa:D102
         ) in (
             self.df[columns].drop_duplicates().values
         ):
-            data = {"stmt_hash:long": stmt_hash, "evidence_count:string": source_counts}
+            data = {"stmt_hash:long": stmt_hash, "source_counts:string": source_counts}
             yield Relation(
                 source_ns,
                 source_id,
                 target_ns,
                 target_id,
-                [stmt_type, "Statement"],
+                stmt_type,
                 data,
             )
 
diff --git a/src/indra_cogex/sources/indra_ontology/__init__.py b/src/indra_cogex/sources/indra_ontology/__init__.py
index 8f69957f1..b980dcbbb 100644
--- a/src/indra_cogex/sources/indra_ontology/__init__.py
+++ b/src/indra_cogex/sources/indra_ontology/__init__.py
@@ -43,6 +43,4 @@ def get_relations(self):  # noqa:D102
             target_ns, target_id = self.ontology.get_ns_id(target)
             data = copy.copy(data)
             edge_type = data.pop("type")
-            yield Relation(
-                source_ns, source_id, target_ns, target_id, [edge_type], data
-            )
+            yield Relation(source_ns, source_id, target_ns, target_id, edge_type, data)
diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py
index 88eab72bf..fb4e59ade 100644
--- a/src/indra_cogex/sources/pathways/__init__.py
+++ b/src/indra_cogex/sources/pathways/__init__.py
@@ -54,7 +54,7 @@ def get_relations(self):  # noqa:D102
                 pathway_id,
                 gene_ns,
                 gene_id,
-                [self.relation_label],
+                self.relation_label,
                 dict(source=self.name),
             )
 
diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index 8af4aec80..3926a2920 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -90,7 +90,7 @@ def _dump_nodes_to_path(nodes, nodes_path, sample_path=None):
         node_rows = (
             (
                 norm_id(node.db_ns, node.db_id),
-                "|".join(node.labels),
+                ";".join(node.labels),
                 *[node.data.get(key, "") for key in metadata],
             )
             for node in tqdm(nodes, desc="Nodes", unit_scale=True)
@@ -127,7 +127,7 @@ def _dump_edges(self) -> Path:
             (
                 norm_id(rel.source_ns, rel.source_id),
                 norm_id(rel.target_ns, rel.target_id),
-                "|".join(sorted(rel.labels)),
+                rel.rel_type,
                 *[rel.data.get(key) for key in metadata],
             )
             for rel in tqdm(rels, desc="Edges", unit_scale=True)

From 84fa3f4c368fe87d710d41e1f85846f12499ad0d Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Fri, 14 May 2021 10:52:44 -0400
Subject: [PATCH 24/34] Fix more ID issues on import

---
 src/indra_cogex/sources/indra_db/__init__.py | 9 ++++++++-
 tests/test_indra_db.py                       | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_indra_db.py

diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index 01ebf9a0a..788433cea 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -110,6 +110,13 @@ def fix_id(db_ns, db_id):
     if db_ns == "GO":
         if db_id.isnumeric():
             db_id = "0" * (7 - len(db_id)) + db_id
-
+    if db_ns == "EFO" and db_id.startswith("EFO:"):
+        db_id = db_id[4:]
+    # FIXME: we need to be able to fix namespace as well, not just IDs,
+    # requires refactoring
+    # if db_ns == 'UP' and db_id.startswith('SL'):
+    #    db_ns = 'UPLOC'
+    if db_ns == "UP" and "-" in db_id and not db_id.startswith("SL-"):
+        db_id = db_id.split("-")[0]
     db_id = ensure_prefix_if_needed(db_ns, db_id)
     return db_id
diff --git a/tests/test_indra_db.py b/tests/test_indra_db.py
new file mode 100644
index 000000000..0491be953
--- /dev/null
+++ b/tests/test_indra_db.py
@@ -0,0 +1,8 @@
+from indra_cogex.sources.indra_db import fix_id
+
+
+def test_fix_id():
+    assert fix_id("EFO", "EFO:12345") == "12345"
+    assert fix_id("GO", "123") == "GO:0000123"
+    assert fix_id("CHEBI", "123") == "CHEBI:123"
+    assert fix_id("UP", "P12345-6") == "P12345"

From ea9286c873465a1521d86a94b803be9304e2ca5b Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Fri, 14 May 2021 17:26:08 -0400
Subject: [PATCH 25/34] Update meta

---
 .flake8 | 2 +-
 tox.ini | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.flake8 b/.flake8
index a711387bb..09307d270 100644
--- a/.flake8
+++ b/.flake8
@@ -7,7 +7,7 @@ ignore =
     S403 # pickle
     S301 # pickle
     W503 # line break before binary operator
-    E203 # conflicts with black
+    S101 # Don't complain about asserts
 exclude =
     .tox,
     .git,
diff --git a/tox.ini b/tox.ini
index 7273319ac..5a530b75f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -79,7 +79,7 @@ description = Run the flake8 tool with several plugins (bandit, docstrings, impo
 [testenv:black]
 deps = black
 skip_install = true
-commands = black src/ tests/ setup.py
+commands = black src/indra_cogex tests/ setup.py
 description = Run the black tool
 
 [testenv:mypy]

From 5571b78266f65e60e58fc0a239905934d3e6c4a8 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 15 May 2021 22:20:24 -0400
Subject: [PATCH 26/34] Generalize ID fixing to namespaces

---
 src/indra_cogex/sources/indra_db/__init__.py | 26 +++++++++++++-------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index 788433cea..01fd12e1e 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -7,7 +7,7 @@
 import pickle
 from pathlib import Path
 from tqdm import tqdm
-from typing import Union
+from typing import Tuple, Union
 
 import humanize
 import pandas as pd
@@ -42,6 +42,7 @@ def __init__(self, path: Union[None, str, Path] = None):
         elif isinstance(path, str):
             path = Path(path)
         with open(path, "rb") as fh:
+            logger.info("Loading %s" % path)
             df = pickle.load(fh)
         logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path)
         self.df = df
@@ -54,8 +55,16 @@ def __init__(self, path: Union[None, str, Path] = None):
                     [f"ag{side}_ns", f"ag{side}_id"]
                 ].values
             ]
-            self.df[f"ag{side}_id"] = self.df.progress_apply(
-                lambda row: fix_id(row[f"ag{side}_ns"], row[f"ag{side}_id"]), axis=1
+            breakpoint()
+            self.df[f"ag{side}_ns"], self.df[f"ag{side}_id"] = list(
+                zip(
+                    *[
+                        fix_id(db_ns, db_id)
+                        for db_ns, db_id in tqdm(
+                            zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"]))
+                        )
+                    ]
+                )
             )
         self.df["source_counts"] = self.df["source_counts"].apply(json.dumps)
 
@@ -106,17 +115,16 @@ def get_relations(self):  # noqa:D102
             )
 
 
-def fix_id(db_ns, db_id):
+def fix_id(db_ns: str, db_id: str) -> Tuple[str, str]:
+    """Fix ID issues specific to the SIF dump."""
     if db_ns == "GO":
         if db_id.isnumeric():
             db_id = "0" * (7 - len(db_id)) + db_id
     if db_ns == "EFO" and db_id.startswith("EFO:"):
         db_id = db_id[4:]
-    # FIXME: we need to be able to fix namespace as well, not just IDs,
-    # requires refactoring
-    # if db_ns == 'UP' and db_id.startswith('SL'):
-    #    db_ns = 'UPLOC'
+    if db_ns == "UP" and db_id.startswith("SL"):
+        db_ns = "UPLOC"
     if db_ns == "UP" and "-" in db_id and not db_id.startswith("SL-"):
         db_id = db_id.split("-")[0]
     db_id = ensure_prefix_if_needed(db_ns, db_id)
-    return db_id
+    return db_ns, db_id

From ba185df80d09382e67e0fbe5a8f8a52f3a719a70 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 15 May 2021 22:32:27 -0400
Subject: [PATCH 27/34] Add total and description

---
 src/indra_cogex/sources/indra_db/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py
index 01fd12e1e..5771e0e8b 100644
--- a/src/indra_cogex/sources/indra_db/__init__.py
+++ b/src/indra_cogex/sources/indra_db/__init__.py
@@ -55,13 +55,14 @@ def __init__(self, path: Union[None, str, Path] = None):
                     [f"ag{side}_ns", f"ag{side}_id"]
                 ].values
             ]
-            breakpoint()
             self.df[f"ag{side}_ns"], self.df[f"ag{side}_id"] = list(
                 zip(
                     *[
                         fix_id(db_ns, db_id)
                         for db_ns, db_id in tqdm(
-                            zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"]))
+                            zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"])),
+                            total=len(df),
+                            desc="Fixing IDs",
                         )
                     ]
                 )

From 3538529117aebbfad6f42066d97440206b34948e Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 15 May 2021 22:49:26 -0400
Subject: [PATCH 28/34] Fix mypy issues

---
 src/indra_cogex/assembly/__init__.py | 9 ++++-----
 src/indra_cogex/sources/processor.py | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
index 165a7fe8c..9d6f83306 100644
--- a/src/indra_cogex/assembly/__init__.py
+++ b/src/indra_cogex/assembly/__init__.py
@@ -1,12 +1,12 @@
 from collections import defaultdict
 from typing import List, Optional
-from indra_cogex.representation import Node
+from indra_cogex.representation import Dict, Node
 
 
 class NodeAssembler:
     def __init__(self, nodes: Optional[List[Node]] = None):
         self.nodes = nodes if nodes else []
-        self.conflicts = []
+        self.conflicts: List[Conflict] = []
 
     def add_nodes(self, nodes: List[Node]):
         self.nodes += nodes
@@ -24,7 +24,7 @@ def assemble_nodes(self) -> List[Node]:
 
     def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node:
         labels = set()
-        data = {}
+        data: Dict[str, str] = {}
         for node in nodes:
             labels |= set(node.labels)
             for data_key, data_val in node.data.items():
@@ -33,8 +33,7 @@ def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node:
                     self.conflicts.append(Conflict(data_key, previous_val, data_val))
                 else:
                     data[data_key] = data_val
-        labels = sorted(labels)
-        return Node(db_ns, db_id, labels, data)
+        return Node(db_ns, db_id, sorted(labels), data)
 
 
 class Conflict:
diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py
index 3926a2920..1d4d57974 100644
--- a/src/indra_cogex/sources/processor.py
+++ b/src/indra_cogex/sources/processor.py
@@ -81,7 +81,7 @@ def _dump_nodes(self) -> Path:
             return self.nodes_path
 
         nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id))
-        self._dump_nodes_to_path(nodes, self.nodes_path, sample_path)
+        return self._dump_nodes_to_path(nodes, self.nodes_path, sample_path)
 
     @staticmethod
     def _dump_nodes_to_path(nodes, nodes_path, sample_path=None):

From c3802d2751348ac4b704296b01b95ee4c5fe1d68 Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sat, 15 May 2021 22:51:29 -0400
Subject: [PATCH 29/34] Just test the code (remove lint and docs CI jobs)

---
 .github/workflows/tests.yml | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index bb7d41c21..318e9b33e 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -3,43 +3,6 @@ name: Tests
 on: [ push, pull_request ]
 
 jobs:
-  lint:
-    name: Lint
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: [ 3.6, 3.9 ]
-        tox-env: [ manifest, flake8, pyroma, mypy ]
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: pip install tox
-      - name: Run tox
-        run: tox -e ${{ matrix.tox-env }}
-  docs:
-    name: Documentation
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: [ 3.6, 3.9 ]
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: pip install tox
-      - name: Check RST conformity with doc8
-        run: tox -e doc8
-      # - name: Check docstring coverage
-      #   run: tox -e docstr-coverage
-      - name: Check documentation build with Sphinx
-        run: tox -e docs
   tests:
     name: Tests
     runs-on: ${{ matrix.os }}

From 736550e2aa111abeb0a0c939573a3bcd4585ecec Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Sun, 16 May 2021 16:52:56 -0400
Subject: [PATCH 30/34] Address issue with install

I thought the `__init__.py` wasn't required anymore, but maybe it still is.
---
 src/indra_cogex/__init__.py | 0
 tox.ini                     | 2 --
 2 files changed, 2 deletions(-)
 create mode 100644 src/indra_cogex/__init__.py

diff --git a/src/indra_cogex/__init__.py b/src/indra_cogex/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tox.ini b/tox.ini
index 5a530b75f..b4bcc7215 100644
--- a/tox.ini
+++ b/tox.ini
@@ -28,8 +28,6 @@ passenv =
 deps =
     coverage
     pytest
-extras =
-    pandas
 whitelist_externals =
     /bin/cat
     /bin/cp

From 7d876c0dae73f93738cc2b99c3004a352c32748f Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Sun, 16 May 2021 16:53:07 -0400
Subject: [PATCH 31/34] Fix import

---
 src/indra_cogex/assembly/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py
index 9d6f83306..a81b6489a 100644
--- a/src/indra_cogex/assembly/__init__.py
+++ b/src/indra_cogex/assembly/__init__.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
-from typing import List, Optional
-from indra_cogex.representation import Dict, Node
+from typing import Dict, List, Optional
+
+from indra_cogex.representation import Node
 
 
 class NodeAssembler:

From be3300ec7e92a9c8e85607a9c3c69a0bb008ee25 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Sun, 16 May 2021 16:53:19 -0400
Subject: [PATCH 32/34] Add pyobo as requirement

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index a30697e58..81a072de1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -42,6 +42,7 @@ install_requires =
     more_click
     class-resolver>=0.0.9
     pystow>=0.1.6
+    pyobo
 
 include_package_data = True
 python_requires = >=3.6

From 983d576fb28e2fa355e132b9937bdf07c78b035c Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 16 May 2021 21:33:44 -0400
Subject: [PATCH 33/34] Install INDRA from github

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 81a072de1..e05db7001 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -36,7 +36,7 @@ keywords =
 
 [options]
 install_requires =
-    indra
+    indra @ git+https://github.com/sorgerlab/indra.git
     neo4j
     click
     more_click

From e7bb294570ba7fa46d6c0e524c5e50e36b429d4f Mon Sep 17 00:00:00 2001
From: Ben Gyori <ben.gyori@gmail.com>
Date: Sun, 16 May 2021 21:54:05 -0400
Subject: [PATCH 34/34] Fix ID fixing test

---
 tests/test_indra_db.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/test_indra_db.py b/tests/test_indra_db.py
index 0491be953..9172c1e45 100644
--- a/tests/test_indra_db.py
+++ b/tests/test_indra_db.py
@@ -2,7 +2,8 @@
 
 
 def test_fix_id():
-    assert fix_id("EFO", "EFO:12345") == "12345"
-    assert fix_id("GO", "123") == "GO:0000123"
-    assert fix_id("CHEBI", "123") == "CHEBI:123"
-    assert fix_id("UP", "P12345-6") == "P12345"
+    assert fix_id("EFO", "EFO:12345") == ("EFO", "12345")
+    assert fix_id("GO", "123") == ("GO", "GO:0000123")
+    assert fix_id("CHEBI", "123") == ("CHEBI", "CHEBI:123")
+    assert fix_id("UP", "P12345-6") == ("UP", "P12345")
+    assert fix_id("UP", "SL-123") == ("UPLOC", "SL-123")