diff --git a/.flake8 b/.flake8 index a711387bb..09307d270 100644 --- a/.flake8 +++ b/.flake8 @@ -7,7 +7,7 @@ ignore = S403 # pickle S301 # pickle W503 # line break before binary operator - E203 # conflicts with black + S101 # Don't complain about asserts exclude = .tox, .git, diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bb7d41c21..318e9b33e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,43 +3,6 @@ name: Tests on: [ push, pull_request ] jobs: - lint: - name: Lint - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [ 3.6, 3.9 ] - tox-env: [ manifest, flake8, pyroma, mypy ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox - - name: Run tox - run: tox -e ${{ matrix.tox-env }} - docs: - name: Documentation - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [ 3.6, 3.9 ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox - - name: Check RST conformity with doc8 - run: tox -e doc8 - # - name: Check docstring coverage - # run: tox -e docstr-coverage - - name: Check documentation build with Sphinx - run: tox -e docs tests: name: Tests runs-on: ${{ matrix.os }} diff --git a/setup.cfg b/setup.cfg index a30697e58..e05db7001 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,12 +36,13 @@ keywords = [options] install_requires = - indra + indra @ git+https://github.com/sorgerlab/indra.git neo4j click more_click class-resolver>=0.0.9 pystow>=0.1.6 + pyobo include_package_data = True python_requires = >=3.6 diff --git a/src/indra_cogex/__init__.py b/src/indra_cogex/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py new file mode 100644 index 000000000..a81b6489a --- /dev/null +++ b/src/indra_cogex/assembly/__init__.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import Dict, List, Optional + +from indra_cogex.representation import Node + + +class NodeAssembler: + def __init__(self, nodes: Optional[List[Node]] = None): + self.nodes = nodes if nodes else [] + self.conflicts: List[Conflict] = [] + + def add_nodes(self, nodes: List[Node]): + self.nodes += nodes + + def assemble_nodes(self) -> List[Node]: + nodes_by_id = defaultdict(list) + for node in self.nodes: + nodes_by_id[(node.db_ns, node.db_id)].append(node) + + assembled_nodes = [ + self.get_aggregate_node(db_ns, db_id, node_group) + for (db_ns, db_id), node_group in nodes_by_id.items() + ] + return assembled_nodes + + def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: + labels = set() + data: Dict[str, str] = {} + for node in nodes: + labels |= set(node.labels) + for data_key, data_val in node.data.items(): + previous_val = data.get(data_key) + if previous_val and previous_val != data_val: + self.conflicts.append(Conflict(data_key, previous_val, data_val)) + else: + data[data_key] = data_val + return Node(db_ns, db_id, sorted(labels), data) + + +class Conflict: + def __init__(self, key, val1, val2): + self.key = key + self.val1 = val1 + self.val2 = val2 + + def __repr__(self): + return str(self) + + def __str__(self): + return f"Conflict({self.key}, 
{self.val1}, {self.val2})" diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py index 0f558509f..4d3000413 100644 --- a/src/indra_cogex/representation.py +++ b/src/indra_cogex/representation.py @@ -12,28 +12,41 @@ class Node: def __init__( self, - identifier: str, + db_ns: str, + db_id: str, labels: Collection[str], data: Optional[Mapping[str, Any]] = None, ): """Initialize the node. - :param identifier: The identifier of the node - :param labels: The collection of labels for the relation. - :param data: The optional data dictionary associated with the node. + Parameters + ---------- + db_ns : + The namespace associated with the node. Uses the INDRA standard. + db_id : + The identifier within the namespace associated with the node. + Uses the INDRA standard. + labels : + A collection of labels for the node. + data : + An optional data dictionary associated with the node. """ - self.identifier = identifier + if not db_ns or not db_id: + raise ValueError("Invalid namespace or ID.") + self.db_ns = db_ns + self.db_id = db_id self.labels = labels self.data = data if data else {} def to_json(self): """Serialize the node to JSON.""" data = {k: v for k, v in self.data.items()} - data["id"] = self.identifier + data["db_ns"] = self.db_ns + data["db_id"] = self.db_id return {"labels": self.labels, "data": data} def _get_data_str(self): - pieces = ["id:'%s'" % self.identifier] + pieces = ["id:'%s:%s'" % (self.db_ns, self.db_id)] for k, v in self.data.items(): if isinstance(v, str): value = "'" + v.replace("'", "\\'") + "'" @@ -60,36 +73,44 @@ class Relation: def __init__( self, + source_ns: str, source_id: str, + target_ns: str, target_id: str, - labels: Collection[str], + rel_type: str, data: Optional[Mapping[str, Any]] = None, ): """Initialize the relation. :param source_id: The identifier of the source node :param target_id: The identifier of the target node - :param labels: The collection of labels for the relation. + :param rel_type: The relation's type. :param data: The optional data dictionary associated with the relation. 
""" + self.source_ns = source_ns self.source_id = source_id + self.target_ns = target_ns self.target_id = target_id - self.labels = list(labels) + self.rel_type = rel_type self.data = data if data else {} def to_json(self): """Serialize the relation to JSON.""" return { + "source_ns": self.source_ns, "source_id": self.source_id, + "target_ns": self.target_ns, "target_id": self.target_id, - "labels": self.labels, + "rel_type": self.rel_type, "data": self.data, } def __str__(self): # noqa:D105 data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()]) - labels_str = ":".join(self.labels) - return f"({self.source_id})-[:{labels_str} {data_str}]->" f"({self.target_id})" + return ( + f"({self.source_ns}, {self.source_id})-[:{self.rel_type} {data_str}]->" + f"({self.target_ns}, {self.target_id})" + ) def __repr__(self): # noqa:D105 return str(self) diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py index 0eb6c2dfc..79dae97ce 100644 --- a/src/indra_cogex/sources/bgee/__init__.py +++ b/src/indra_cogex/sources/bgee/__init__.py @@ -32,20 +32,36 @@ def __init__(self, path: Union[None, str, Path] = None): self.expressions = pickle.load(fh) def get_nodes(self): # noqa:D102 - for context_id in self.expressions: + for context in self.expressions: + context_ns, context_id = get_context(context) yield Node( + context_ns, context_id, ["BioEntity"], data={"name": pyobo.get_name_by_curie(context_id)}, ) for hgnc_id in set.union(*[set(v) for v in self.expressions.values()]): yield Node( - f"HGNC:{hgnc_id}", + "HGNC", + hgnc_id, ["BioEntity"], data={"name": pyobo.get_name("hgnc", hgnc_id)}, ) def get_relations(self): # noqa:D102 - for context_id, hgnc_ids in self.expressions.items(): + data = {"source": self.name} + for context, hgnc_ids in self.expressions.items(): + context_ns, context_id = get_context(context) for hgnc_id in hgnc_ids: - yield Relation(f"HGNC:{hgnc_id}", context_id, [self.rel_type]) + yield Relation( + "HGNC", hgnc_id, context_ns, context_id, self.rel_type, data + ) + + +def get_context(context): + context_ns, context_id = context.split(":", maxsplit=1) + if context_ns == "UBERON": + context_id = f"UBERON:{context_id}" + elif context_ns == "CL": + context_id = f"CL:{context_id}" + return context_ns, context_id diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py index bdc4b0014..f29759ae7 100644 --- a/src/indra_cogex/sources/cli.py +++ b/src/indra_cogex/sources/cli.py @@ -6,9 +6,12 @@ from textwrap import dedent import click +import pystow from more_click import verbose_option from . 
import processor_resolver +from .processor import Processor +from ..assembly import NodeAssembler @click.command() @@ -32,6 +35,7 @@ def main(load: bool, load_only: bool, force: bool): """Generate and import Neo4j nodes and edges tables.""" paths = [] + na = NodeAssembler() for processor_cls in processor_resolver: if not processor_cls.importable: continue @@ -44,21 +48,34 @@ def main(load: bool, load_only: bool, force: bool): ): click.secho("Processing...", fg="green") processor = processor_cls() + # FIXME: this is redundant, we get nodes twice + na.add_nodes(list(processor.get_nodes())) processor.dump() paths.append((processor_cls.nodes_path, processor_cls.edges_path)) + # FIXME: This doesn't work unless the processors are also running and + # getting nodes + nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz") + if not load_only: + if force or not nodes_path.is_file(): + # Now create and dump the assembled nodes + assembled_nodes = na.assemble_nodes() + assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id)) + Processor._dump_nodes_to_path(assembled_nodes, nodes_path) + if load or load_only: command = dedent( - """\ + f"""\ neo4j-admin import \\ --database=indra \\ --delimiter='TAB' \\ --skip-duplicate-nodes=true \\ - --skip-bad-relationships=true + --skip-bad-relationships=true \\ + --nodes {nodes_path} """ ).rstrip() - for node_path, edge_path in paths: - command += f"\\\n --nodes {node_path} \\\n --relationships {edge_path}" + for _, edge_path in paths: + command += f"\\\n --relationships {edge_path}" click.secho("Running shell command:") click.secho(command, fg="blue") diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py index f67fe2f1d..44ca831ed 100644 --- a/src/indra_cogex/sources/goa/__init__.py +++ b/src/indra_cogex/sources/goa/__init__.py @@ -42,20 +42,17 @@ def __init__(self): def get_nodes(self): # noqa:D102 for go_node in self.df["GO_ID"].unique(): - yield Node(go_node, ["BioEntity"]) + yield Node("GO", go_node, ["BioEntity"]) for hgnc_id in self.df["HGNC_ID"].unique(): - yield Node(f"HGNC:{hgnc_id}", ["BioEntity"]) + yield Node("HGNC", hgnc_id, ["BioEntity"]) def get_relations(self): # noqa:D102 rel_type = "associated_with" for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]: all_ecs = ",".join(sorted(set(ecs))) - source = f"HGNC:{hgnc_id}" - # Note that we don't add the extra GO: by current convention - target = go_id # Possible properties could be e.g., evidence codes - data = {"evidence_codes:string": all_ecs} - yield Relation(source, target, [rel_type], data) + data = {"evidence_codes:string": all_ecs, "source": self.name} + yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data) def load_goa(url: str) -> pd.DataFrame: diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index db0d4fb01..5771e0e8b 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -2,20 +2,24 @@ """Processor for the INDRA database.""" +import json import logging import pickle from pathlib import Path -from typing import Union +from tqdm import tqdm +from typing import Tuple, Union import humanize import pandas as pd import pystow from indra.ontology.bio import bio_ontology +from indra.databases.identifiers import ensure_prefix_if_needed from indra_cogex.representation import Node, Relation from indra_cogex.sources.processor import Processor logger = 
logging.getLogger(__name__) +tqdm.pandas() # If you don't have the data, get it from: @@ -38,16 +42,12 @@ def __init__(self, path: Union[None, str, Path] = None): elif isinstance(path, str): path = Path(path) with open(path, "rb") as fh: + logger.info("Loading %s" % path) df = pickle.load(fh) logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path) self.df = df + logger.info("Fixing ID and naming issues...") for side in "AB": - self.df[side] = [ - f"{prefix}:{identifier}" - for prefix, identifier in self.df[ - [f"ag{side}_ns", f"ag{side}_id"] - ].values - ] # A lot of the names in the SIF dump are all over self.df[f"ag{side}_name"] = [ bio_ontology.get_name(prefix, identifier) @@ -55,28 +55,77 @@ def __init__(self, path: Union[None, str, Path] = None): [f"ag{side}_ns", f"ag{side}_id"] ].values ] + self.df[f"ag{side}_ns"], self.df[f"ag{side}_id"] = list( + zip( + *[ + fix_id(db_ns, db_id) + for db_ns, db_id in tqdm( + zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"])), + total=len(df), + desc="Fixing IDs", + ) + ] + ) + ) + self.df["source_counts"] = self.df["source_counts"].apply(json.dumps) def get_nodes(self): # noqa:D102 - df = ( - pd.concat([self._get_nodes("A"), self._get_nodes("B")], ignore_index=True) - .drop_duplicates() - .sort_values("curie") - ) - for curie, name in df.values: - yield Node(curie, ["BioEntity"], dict(name=name)) - - def _get_nodes(self, side: str) -> pd.DataFrame: - return self.df[[side, f"ag{side}_name"]].rename( - columns={ - side: "curie", - f"ag{side}_name": "name", - } - ) + df = pd.concat( + [ + self.df[["agA_ns", "agA_id", "agA_name"]].rename( + columns={"agA_ns": "ns", "agA_id": "id", "agA_name": "name"} + ), + self.df[["agB_ns", "agB_id", "agB_name"]].rename( + columns={"agB_ns": "ns", "agB_id": "id", "agB_name": "name"} + ), + ], + ignore_index=True, + ).drop_duplicates() + for db_ns, db_id, name in df.values: + yield Node(db_ns, db_id, ["BioEntity"], dict(name=name)) def get_relations(self): # noqa:D102 - columns = ["A", "B", "stmt_type", "evidence_count", "stmt_hash"] - for source, target, stmt_type, ev_count, stmt_hash in ( + columns = [ + "agA_ns", + "agA_id", + "agB_ns", + "agB_id", + "stmt_type", + "source_counts", + "stmt_hash", + ] + for ( + source_ns, + source_id, + target_ns, + target_id, + stmt_type, + source_counts, + stmt_hash, + ) in ( self.df[columns].drop_duplicates().values ): - data = {"stmt_hash:long": stmt_hash, "evidence_count:long": ev_count} - yield Relation(source, target, [stmt_type], data) + data = {"stmt_hash:long": stmt_hash, "source_counts:string": source_counts} + yield Relation( + source_ns, + source_id, + target_ns, + target_id, + stmt_type, + data, + ) + + +def fix_id(db_ns: str, db_id: str) -> Tuple[str, str]: + """Fix ID issues specific to the SIF dump.""" + if db_ns == "GO": + if db_id.isnumeric(): + db_id = "0" * (7 - len(db_id)) + db_id + if db_ns == "EFO" and db_id.startswith("EFO:"): + db_id = db_id[4:] + if db_ns == "UP" and db_id.startswith("SL"): + db_ns = "UPLOC" + if db_ns == "UP" and "-" in db_id and not db_id.startswith("SL-"): + db_id = db_id.split("-")[0] + db_id = ensure_prefix_if_needed(db_ns, db_id) + return db_ns, db_id diff --git a/src/indra_cogex/sources/indra_ontology/__init__.py b/src/indra_cogex/sources/indra_ontology/__init__.py index 928a7919f..b980dcbbb 100644 --- a/src/indra_cogex/sources/indra_ontology/__init__.py +++ b/src/indra_cogex/sources/indra_ontology/__init__.py @@ -34,17 +34,13 @@ def __init__(self, ontology: Optional[IndraOntology] = None): def get_nodes(self): # 
noqa:D102 for node, data in self.ontology.nodes(data=True): - yield Node(_norm(node), ["BioEntity"], data) + db_ns, db_id = self.ontology.get_ns_id(node) + yield Node(db_ns, db_id, ["BioEntity"], data) def get_relations(self): # noqa:D102 for source, target, data in self.ontology.edges(data=True): + source_ns, source_id = self.ontology.get_ns_id(source) + target_ns, target_id = self.ontology.get_ns_id(target) data = copy.copy(data) edge_type = data.pop("type") - yield Relation(_norm(source), _norm(target), [edge_type], data) - - -def _norm(node: str) -> str: - ns, identifier = node.split(":", 1) - if identifier.startswith(f"{ns}:"): - identifier = identifier[len(ns) + 1 :] - return f"{ns}:{identifier}" + yield Relation(source_ns, source_id, target_ns, target_id, edge_type, data) diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py index 86d798c3c..fb4e59ade 100644 --- a/src/indra_cogex/sources/pathways/__init__.py +++ b/src/indra_cogex/sources/pathways/__init__.py @@ -5,6 +5,9 @@ import logging from typing import ClassVar +from indra.databases import hgnc_client +from indra.databases import uniprot_client +from indra.databases.identifiers import get_ns_id_from_identifiers import pyobo import pyobo.api.utils from pyobo.struct import has_part @@ -30,9 +33,11 @@ class PyoboProcessor(Processor): def get_nodes(self): # noqa:D102 # TODO add license version = pyobo.api.utils.get_version(self.prefix) - for identifier, name in pyobo.get_id_name_mapping("wikipathways").items(): + for identifier, name in pyobo.get_id_name_mapping(self.prefix).items(): + db_ns, db_id = get_ns_id_from_identifiers(self.prefix, identifier) yield Node( - f"{self.prefix}:{identifier}", + db_ns, + db_id, ["BioEntity"], dict(name=name, version=version), ) @@ -40,12 +45,38 @@ def get_nodes(self): # noqa:D102 def get_relations(self): # noqa:D102 df = pyobo.get_filtered_relations_df(self.prefix, self.relation) for identifier, t_prefix, t_identifier in df.values: + pathway_ns, pathway_id = get_ns_id_from_identifiers(self.prefix, identifier) + gene_ns, gene_id = self.get_gene(t_prefix, t_identifier) + if not gene_ns: + continue yield Relation( - f"{self.prefix}:{identifier}", - f"{t_prefix}:{t_identifier}", - [self.relation_label], + pathway_ns, + pathway_id, + gene_ns, + gene_id, + self.relation_label, + dict(source=self.name), ) + def get_gene(self, prefix, identifier): + if prefix == "ncbigene": + hgnc_id = hgnc_client.get_hgnc_from_entrez(identifier) + if hgnc_id: + return "HGNC", hgnc_id + else: + return "EGID", identifier + elif prefix == "uniprot": + # Some of the UniProt IDs are isoforms, for now, we just strip + # these off. We could do something more principled later. 
+ if "-" in identifier: + identifier, _ = identifier.split("-") + hgnc_id = uniprot_client.get_hgnc_id(identifier) + if hgnc_id: + return "HGNC", hgnc_id + else: + return "UP", identifier + return None, None + class WikipathwaysProcessor(PyoboProcessor): """Processor for WikiPathways gene-pathway links.""" @@ -65,7 +96,3 @@ class ReactomeProcessor(PyoboProcessor): relation = has_part relation_label = "haspart" importable = True - - -if __name__ == "__main__": - WikipathwaysProcessor.cli() diff --git a/src/indra_cogex/sources/pathways/__main__.py b/src/indra_cogex/sources/pathways/__main__.py new file mode 100644 index 000000000..2e9b46bff --- /dev/null +++ b/src/indra_cogex/sources/pathways/__main__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +"""Run the pathways processor using ``python -m indra_cogex.sources.pathways``.""" + +from . import ReactomeProcessor, WikipathwaysProcessor + +if __name__ == "__main__": + ReactomeProcessor.cli() + WikipathwaysProcessor.cli() diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 90d64fece..1d4d57974 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -4,8 +4,8 @@ import csv import gzip +import logging from abc import ABC, abstractmethod -from operator import attrgetter from pathlib import Path from typing import ClassVar, Iterable @@ -14,12 +14,16 @@ from more_click import verbose_option from tqdm import tqdm +from indra.databases import identifiers +from indra.statements.validate import assert_valid_db_refs + from indra_cogex.representation import Node, Relation __all__ = [ "Processor", ] +logger = logging.getLogger(__name__) # deal with importing from wherever with # https://stackoverflow.com/questions/36922843/neo4j-3-x-load-csv-absolute-file-path @@ -76,49 +80,37 @@ def _dump_nodes(self) -> Path: if self.nodes_path.is_file(): return self.nodes_path - nodes = sorted(self.get_nodes(), key=attrgetter("identifier")) + nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id)) + return self._dump_nodes_to_path(nodes, self.nodes_path, sample_path) + + @staticmethod + def _dump_nodes_to_path(nodes, nodes_path, sample_path=None): + nodes = list(validate_nodes(nodes)) metadata = sorted(set(key for node in nodes for key in node.data)) node_rows = ( ( - node.identifier, - "|".join(node.labels), + norm_id(node.db_ns, node.db_id), + ";".join(node.labels), *[node.data.get(key, "") for key in metadata], ) for node in tqdm(nodes, desc="Nodes", unit_scale=True) ) - with gzip.open(self.nodes_path, mode="wt") as node_file: + header = "id:ID", ":LABEL", *metadata + with gzip.open(nodes_path, mode="wt") as node_file: node_writer = csv.writer(node_file, delimiter="\t") # type: ignore - with sample_path.open("w") as node_sample_file: - node_sample_writer = csv.writer(node_sample_file, delimiter="\t") - - header = "id:ID", ":LABEL", *metadata - node_sample_writer.writerow(header) - node_writer.writerow(header) - - for _, node_row in zip(range(10), node_rows): - node_sample_writer.writerow(node_row) - node_writer.writerow(node_row) - + node_writer.writerow(header) + if sample_path: + with sample_path.open("w") as node_sample_file: + node_sample_writer = csv.writer(node_sample_file, delimiter="\t") + node_sample_writer.writerow(header) + for _, node_row in zip(range(10), node_rows): + node_sample_writer.writerow(node_row) + node_writer.writerow(node_row) # Write remaining nodes node_writer.writerows(node_rows) - # cypher = dedent(f'''\ - # CREATE CONSTRAINT ON (n:{ntype}) 
ASSERT n.id IS UNIQUE; - # USING PERIODIC COMMIT - # LOAD CSV WITH HEADERS FROM "file://{data_path.as_posix()}" AS row FIELDTERMINATOR '\\t' - # MERGE (n:{ntype} {{ id: row.identifier }}) - # ''') - # if metadata: - # creates = '\n'.join( - # f'n.{key} = row.{key}' - # for key in metadata - # ) - # cypher += f'ON CREATE SET {creates}' - # with cypher_path.open('w') as file: - # print(cypher, file=file) - - return self.nodes_path + return nodes_path def _dump_edges(self) -> Path: sample_path = self.module.join(name="edges_sample.tsv") @@ -126,13 +118,16 @@ def _dump_edges(self) -> Path: return self.edges_path rels = self.get_relations() - rels = sorted(rels, key=lambda r: (r.source_id, r.target_id)) + rels = validate_relations(rels) + rels = sorted( + rels, key=lambda r: (r.source_ns, r.source_id, r.target_ns, r.target_id) + ) metadata = sorted(set(key for rel in rels for key in rel.data)) edge_rows = ( ( - rel.source_id, - rel.target_id, - "|".join(sorted(rel.labels)), + norm_id(rel.source_ns, rel.source_id), + norm_id(rel.target_ns, rel.target_id), + rel.rel_type, *[rel.data.get(key) for key in metadata], ) for rel in tqdm(rels, desc="Edges", unit_scale=True) @@ -153,3 +148,38 @@ def _dump_edges(self) -> Path: # Write remaining edges edge_writer.writerows(edge_rows) return self.edges_path + + +def norm_id(db_ns, db_id): + identifiers_ns = identifiers.get_identifiers_ns(db_ns) + identifiers_id = db_id + if not identifiers_ns: + identifiers_ns = db_ns.lower() + else: + ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get( + "namespace_embedded" + ) + if ns_embedded: + identifiers_id = identifiers_id[len(identifiers_ns) + 1 :] + return f"{identifiers_ns}:{identifiers_id}" + + +def validate_nodes(nodes): + for idx, node in enumerate(nodes): + try: + assert_valid_db_refs({node.db_ns: node.db_id}) + yield node + except Exception as e: + logger.info(f"{idx}: {node} - {e}") + continue + + +def validate_relations(relations): + for idx, rel in enumerate(relations): + try: + assert_valid_db_refs({rel.source_ns: rel.source_id}) + assert_valid_db_refs({rel.target_ns: rel.target_id}) + yield rel + except Exception as e: + logger.info(f"{idx}: {rel} - {e}") + continue diff --git a/tests/test_assembly.py b/tests/test_assembly.py new file mode 100644 index 000000000..e6185c38d --- /dev/null +++ b/tests/test_assembly.py @@ -0,0 +1,40 @@ +from indra_cogex.assembly import NodeAssembler +from indra_cogex.representation import Node + + +def test_add_nodes(): + na = NodeAssembler([Node("x", "y", ["l"])]) + assert len(na.nodes) == 1 + na.add_nodes([Node("y", "z", ["l"])]) + assert len(na.nodes) == 2 + + +def test_merge_properties(): + n1 = Node("ns", "id", ["l"], {"k1": "v1"}) + n2 = Node("ns", "id", ["l"], {"k2": "v2"}) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert ans[0].data == {"k1": "v1", "k2": "v2"} + + +def test_merge_labels(): + n1 = Node("ns", "id", ["l1", "l2"]) + n2 = Node("ns", "id", ["l2", "l3"]) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert set(ans[0].labels) == {"l1", "l2", "l3"} + + +def test_merge_conflict(): + n1 = Node("ns", "id", ["l"], {"k1": "v1"}) + n2 = Node("ns", "id", ["l"], {"k1": "v2"}) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert ans[0].data == {"k1": "v1"} + assert len(na.conflicts) == 1 + assert na.conflicts[0].key == "k1" + assert na.conflicts[0].val1 == "v1" + assert na.conflicts[0].val2 == "v2" diff --git 
a/tests/test_indra_db.py b/tests/test_indra_db.py new file mode 100644 index 000000000..9172c1e45 --- /dev/null +++ b/tests/test_indra_db.py @@ -0,0 +1,9 @@ +from indra_cogex.sources.indra_db import fix_id + + +def test_fix_id(): + assert fix_id("EFO", "EFO:12345") == ("EFO", "12345") + assert fix_id("GO", "123") == ("GO", "GO:0000123") + assert fix_id("CHEBI", "123") == ("CHEBI", "CHEBI:123") + assert fix_id("UP", "P12345-6") == ("UP", "P12345") + assert fix_id("UP", "SL-123") == ("UPLOC", "SL-123") diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 000000000..6abaa4c44 --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,6 @@ +from indra_cogex.sources.processor import norm_id + + +def test_norm_id(): + assert norm_id("UP", "P12345") == "uniprot:P12345" + assert norm_id("CHEBI", "CHEBI:12345") == "chebi:12345" diff --git a/tox.ini b/tox.ini index 7273319ac..b4bcc7215 100644 --- a/tox.ini +++ b/tox.ini @@ -28,8 +28,6 @@ passenv = deps = coverage pytest -extras = - pandas whitelist_externals = /bin/cat /bin/cp @@ -79,7 +77,7 @@ description = Run the flake8 tool with several plugins (bandit, docstrings, impo [testenv:black] deps = black skip_install = true -commands = black src/ tests/ setup.py +commands = black src/indra_cogex tests/ setup.py description = Run the black tool [testenv:mypy]