diff --git a/.flake8 b/.flake8 index a711387bb..09307d270 100644 --- a/.flake8 +++ b/.flake8 @@ -7,7 +7,7 @@ ignore = S403 # pickle S301 # pickle W503 # line break before binary operator - E203 # conflicts with black + S101 # Don't complain about asserts exclude = .tox, .git, diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bb7d41c21..318e9b33e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,43 +3,6 @@ name: Tests on: [ push, pull_request ] jobs: - lint: - name: Lint - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [ 3.6, 3.9 ] - tox-env: [ manifest, flake8, pyroma, mypy ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox - - name: Run tox - run: tox -e ${{ matrix.tox-env }} - docs: - name: Documentation - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [ 3.6, 3.9 ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox - - name: Check RST conformity with doc8 - run: tox -e doc8 - # - name: Check docstring coverage - # run: tox -e docstr-coverage - - name: Check documentation build with Sphinx - run: tox -e docs tests: name: Tests runs-on: ${{ matrix.os }} diff --git a/setup.cfg b/setup.cfg index a30697e58..e05db7001 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,12 +36,13 @@ keywords = [options] install_requires = - indra + indra @ git+https://github.com/sorgerlab/indra.git neo4j click more_click class-resolver>=0.0.9 pystow>=0.1.6 + pyobo include_package_data = True python_requires = >=3.6 diff --git a/src/indra_cogex/__init__.py b/src/indra_cogex/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py new file mode 100644 index 000000000..a81b6489a --- /dev/null +++ b/src/indra_cogex/assembly/__init__.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import Dict, List, Optional + +from indra_cogex.representation import Node + + +class NodeAssembler: + def __init__(self, nodes: Optional[List[Node]] = None): + self.nodes = nodes if nodes else [] + self.conflicts: List[Conflict] = [] + + def add_nodes(self, nodes: List[Node]): + self.nodes += nodes + + def assemble_nodes(self) -> List[Node]: + nodes_by_id = defaultdict(list) + for node in self.nodes: + nodes_by_id[(node.db_ns, node.db_id)].append(node) + + assembled_nodes = [ + self.get_aggregate_node(db_ns, db_id, node_group) + for (db_ns, db_id), node_group in nodes_by_id.items() + ] + return assembled_nodes + + def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: + labels = set() + data: Dict[str, str] = {} + for node in nodes: + labels |= set(node.labels) + for data_key, data_val in node.data.items(): + previous_val = data.get(data_key) + if previous_val and previous_val != data_val: + self.conflicts.append(Conflict(data_key, previous_val, data_val)) + else: + data[data_key] = data_val + return Node(db_ns, db_id, sorted(labels), data) + + +class Conflict: + def __init__(self, key, val1, val2): + self.key = key + self.val1 = val1 + self.val2 = val2 + + def __repr__(self): + return str(self) + + def __str__(self): + return f"Conflict({self.key}, 
{self.val1}, {self.val2})" diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py index 0f558509f..4d3000413 100644 --- a/src/indra_cogex/representation.py +++ b/src/indra_cogex/representation.py @@ -12,28 +12,41 @@ class Node: def __init__( self, - identifier: str, + db_ns: str, + db_id: str, labels: Collection[str], data: Optional[Mapping[str, Any]] = None, ): """Initialize the node. - :param identifier: The identifier of the node - :param labels: The collection of labels for the relation. - :param data: The optional data dictionary associated with the node. + Parameters + ---------- + db_ns : + The namespace associated with the node. Uses the INDRA standard. + db_id : + The identifier within the namespace associated with the node. + Uses the INDRA standard. + labels : + A collection of labels for the node. + data : + An optional data dictionary associated with the node. """ - self.identifier = identifier + if not db_ns or not db_id: + raise ValueError("Invalid namespace or ID.") + self.db_ns = db_ns + self.db_id = db_id self.labels = labels self.data = data if data else {} def to_json(self): """Serialize the node to JSON.""" data = {k: v for k, v in self.data.items()} - data["id"] = self.identifier + data["db_ns"] = self.db_ns + data["db_id"] = self.db_id return {"labels": self.labels, "data": data} def _get_data_str(self): - pieces = ["id:'%s'" % self.identifier] + pieces = ["id:'%s:%s'" % (self.db_ns, self.db_id)] for k, v in self.data.items(): if isinstance(v, str): value = "'" + v.replace("'", "\\'") + "'" @@ -60,36 +73,44 @@ class Relation: def __init__( self, + source_ns: str, source_id: str, + target_ns: str, target_id: str, - labels: Collection[str], + rel_type: str, data: Optional[Mapping[str, Any]] = None, ): """Initialize the relation. :param source_id: The identifier of the source node :param target_id: The identifier of the target node - :param labels: The collection of labels for the relation. + :param rel_type: The relation's type. :param data: The optional data dictionary associated with the relation. 
""" + self.source_ns = source_ns self.source_id = source_id + self.target_ns = target_ns self.target_id = target_id - self.labels = list(labels) + self.rel_type = rel_type self.data = data if data else {} def to_json(self): """Serialize the relation to JSON.""" return { + "source_ns": self.source_ns, "source_id": self.source_id, + "target_ns": self.target_ns, "target_id": self.target_id, - "labels": self.labels, + "rel_type": self.rel_type, "data": self.data, } def __str__(self): # noqa:D105 data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()]) - labels_str = ":".join(self.labels) - return f"({self.source_id})-[:{labels_str} {data_str}]->" f"({self.target_id})" + return ( + f"({self.source_ns}, {self.source_id})-[:{self.rel_type} {data_str}]->" + f"({self.target_ns}, {self.target_id})" + ) def __repr__(self): # noqa:D105 return str(self) diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py index 0eb6c2dfc..79dae97ce 100644 --- a/src/indra_cogex/sources/bgee/__init__.py +++ b/src/indra_cogex/sources/bgee/__init__.py @@ -32,20 +32,36 @@ def __init__(self, path: Union[None, str, Path] = None): self.expressions = pickle.load(fh) def get_nodes(self): # noqa:D102 - for context_id in self.expressions: + for context in self.expressions: + context_ns, context_id = get_context(context) yield Node( + context_ns, context_id, ["BioEntity"], data={"name": pyobo.get_name_by_curie(context_id)}, ) for hgnc_id in set.union(*[set(v) for v in self.expressions.values()]): yield Node( - f"HGNC:{hgnc_id}", + "HGNC", + hgnc_id, ["BioEntity"], data={"name": pyobo.get_name("hgnc", hgnc_id)}, ) def get_relations(self): # noqa:D102 - for context_id, hgnc_ids in self.expressions.items(): + data = {"source": self.name} + for context, hgnc_ids in self.expressions.items(): + context_ns, context_id = get_context(context) for hgnc_id in hgnc_ids: - yield Relation(f"HGNC:{hgnc_id}", context_id, [self.rel_type]) + yield Relation( + "HGNC", hgnc_id, context_ns, context_id, self.rel_type, data + ) + + +def get_context(context): + context_ns, context_id = context.split(":", maxsplit=1) + if context_ns == "UBERON": + context_id = f"UBERON:{context_id}" + elif context_ns == "CL": + context_id = f"CL:{context_id}" + return context_ns, context_id diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py index bdc4b0014..f29759ae7 100644 --- a/src/indra_cogex/sources/cli.py +++ b/src/indra_cogex/sources/cli.py @@ -6,9 +6,12 @@ from textwrap import dedent import click +import pystow from more_click import verbose_option from . 
import processor_resolver +from .processor import Processor +from ..assembly import NodeAssembler @click.command() @@ -32,6 +35,7 @@ def main(load: bool, load_only: bool, force: bool): """Generate and import Neo4j nodes and edges tables.""" paths = [] + na = NodeAssembler() for processor_cls in processor_resolver: if not processor_cls.importable: continue @@ -44,21 +48,34 @@ def main(load: bool, load_only: bool, force: bool): ): click.secho("Processing...", fg="green") processor = processor_cls() + # FIXME: this is redundant, we get nodes twice + na.add_nodes(list(processor.get_nodes())) processor.dump() paths.append((processor_cls.nodes_path, processor_cls.edges_path)) + # FIXME: This doesn't work unless the processors are also running and + # getting nodes + nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz") + if not load_only: + if force or not nodes_path.is_file(): + # Now create and dump the assembled nodes + assembled_nodes = na.assemble_nodes() + assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id)) + Processor._dump_nodes_to_path(assembled_nodes, nodes_path) + if load or load_only: command = dedent( - """\ + f"""\ neo4j-admin import \\ --database=indra \\ --delimiter='TAB' \\ --skip-duplicate-nodes=true \\ - --skip-bad-relationships=true + --skip-bad-relationships=true \\ + --nodes {nodes_path} """ ).rstrip() - for node_path, edge_path in paths: - command += f"\\\n --nodes {node_path} \\\n --relationships {edge_path}" + for _, edge_path in paths: + command += f"\\\n --relationships {edge_path}" click.secho("Running shell command:") click.secho(command, fg="blue") diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py index f67fe2f1d..44ca831ed 100644 --- a/src/indra_cogex/sources/goa/__init__.py +++ b/src/indra_cogex/sources/goa/__init__.py @@ -42,20 +42,17 @@ def __init__(self): def get_nodes(self): # noqa:D102 for go_node in self.df["GO_ID"].unique(): - yield Node(go_node, ["BioEntity"]) + yield Node("GO", go_node, ["BioEntity"]) for hgnc_id in self.df["HGNC_ID"].unique(): - yield Node(f"HGNC:{hgnc_id}", ["BioEntity"]) + yield Node("HGNC", hgnc_id, ["BioEntity"]) def get_relations(self): # noqa:D102 rel_type = "associated_with" for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]: all_ecs = ",".join(sorted(set(ecs))) - source = f"HGNC:{hgnc_id}" - # Note that we don't add the extra GO: by current convention - target = go_id # Possible properties could be e.g., evidence codes - data = {"evidence_codes:string": all_ecs} - yield Relation(source, target, [rel_type], data) + data = {"evidence_codes:string": all_ecs, "source": self.name} + yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data) def load_goa(url: str) -> pd.DataFrame: diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index db0d4fb01..5771e0e8b 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -2,20 +2,24 @@ """Processor for the INDRA database.""" +import json import logging import pickle from pathlib import Path -from typing import Union +from tqdm import tqdm +from typing import Tuple, Union import humanize import pandas as pd import pystow from indra.ontology.bio import bio_ontology +from indra.databases.identifiers import ensure_prefix_if_needed from indra_cogex.representation import Node, Relation from indra_cogex.sources.processor import Processor logger = 
logging.getLogger(__name__) +tqdm.pandas() # If you don't have the data, get it from: @@ -38,16 +42,12 @@ def __init__(self, path: Union[None, str, Path] = None): elif isinstance(path, str): path = Path(path) with open(path, "rb") as fh: + logger.info("Loading %s" % path) df = pickle.load(fh) logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path) self.df = df + logger.info("Fixing ID and naming issues...") for side in "AB": - self.df[side] = [ - f"{prefix}:{identifier}" - for prefix, identifier in self.df[ - [f"ag{side}_ns", f"ag{side}_id"] - ].values - ] # A lot of the names in the SIF dump are all over self.df[f"ag{side}_name"] = [ bio_ontology.get_name(prefix, identifier) @@ -55,28 +55,77 @@ def __init__(self, path: Union[None, str, Path] = None): [f"ag{side}_ns", f"ag{side}_id"] ].values ] + self.df[f"ag{side}_ns"], self.df[f"ag{side}_id"] = list( + zip( + *[ + fix_id(db_ns, db_id) + for db_ns, db_id in tqdm( + zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"])), + total=len(df), + desc="Fixing IDs", + ) + ] + ) + ) + self.df["source_counts"] = self.df["source_counts"].apply(json.dumps) def get_nodes(self): # noqa:D102 - df = ( - pd.concat([self._get_nodes("A"), self._get_nodes("B")], ignore_index=True) - .drop_duplicates() - .sort_values("curie") - ) - for curie, name in df.values: - yield Node(curie, ["BioEntity"], dict(name=name)) - - def _get_nodes(self, side: str) -> pd.DataFrame: - return self.df[[side, f"ag{side}_name"]].rename( - columns={ - side: "curie", - f"ag{side}_name": "name", - } - ) + df = pd.concat( + [ + self.df[["agA_ns", "agA_id", "agA_name"]].rename( + columns={"agA_ns": "ns", "agA_id": "id", "agA_name": "name"} + ), + self.df[["agB_ns", "agB_id", "agB_name"]].rename( + columns={"agB_ns": "ns", "agB_id": "id", "agB_name": "name"} + ), + ], + ignore_index=True, + ).drop_duplicates() + for db_ns, db_id, name in df.values: + yield Node(db_ns, db_id, ["BioEntity"], dict(name=name)) def get_relations(self): # noqa:D102 - columns = ["A", "B", "stmt_type", "evidence_count", "stmt_hash"] - for source, target, stmt_type, ev_count, stmt_hash in ( + columns = [ + "agA_ns", + "agA_id", + "agB_ns", + "agB_id", + "stmt_type", + "source_counts", + "stmt_hash", + ] + for ( + source_ns, + source_id, + target_ns, + target_id, + stmt_type, + source_counts, + stmt_hash, + ) in ( self.df[columns].drop_duplicates().values ): - data = {"stmt_hash:long": stmt_hash, "evidence_count:long": ev_count} - yield Relation(source, target, [stmt_type], data) + data = {"stmt_hash:long": stmt_hash, "source_counts:string": source_counts} + yield Relation( + source_ns, + source_id, + target_ns, + target_id, + stmt_type, + data, + ) + + +def fix_id(db_ns: str, db_id: str) -> Tuple[str, str]: + """Fix ID issues specific to the SIF dump.""" + if db_ns == "GO": + if db_id.isnumeric(): + db_id = "0" * (7 - len(db_id)) + db_id + if db_ns == "EFO" and db_id.startswith("EFO:"): + db_id = db_id[4:] + if db_ns == "UP" and db_id.startswith("SL"): + db_ns = "UPLOC" + if db_ns == "UP" and "-" in db_id and not db_id.startswith("SL-"): + db_id = db_id.split("-")[0] + db_id = ensure_prefix_if_needed(db_ns, db_id) + return db_ns, db_id diff --git a/src/indra_cogex/sources/indra_ontology/__init__.py b/src/indra_cogex/sources/indra_ontology/__init__.py index 928a7919f..b980dcbbb 100644 --- a/src/indra_cogex/sources/indra_ontology/__init__.py +++ b/src/indra_cogex/sources/indra_ontology/__init__.py @@ -34,17 +34,13 @@ def __init__(self, ontology: Optional[IndraOntology] = None): def get_nodes(self): # 
noqa:D102 for node, data in self.ontology.nodes(data=True): - yield Node(_norm(node), ["BioEntity"], data) + db_ns, db_id = self.ontology.get_ns_id(node) + yield Node(db_ns, db_id, ["BioEntity"], data) def get_relations(self): # noqa:D102 for source, target, data in self.ontology.edges(data=True): + source_ns, source_id = self.ontology.get_ns_id(source) + target_ns, target_id = self.ontology.get_ns_id(target) data = copy.copy(data) edge_type = data.pop("type") - yield Relation(_norm(source), _norm(target), [edge_type], data) - - -def _norm(node: str) -> str: - ns, identifier = node.split(":", 1) - if identifier.startswith(f"{ns}:"): - identifier = identifier[len(ns) + 1 :] - return f"{ns}:{identifier}" + yield Relation(source_ns, source_id, target_ns, target_id, edge_type, data) diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py index 86d798c3c..fb4e59ade 100644 --- a/src/indra_cogex/sources/pathways/__init__.py +++ b/src/indra_cogex/sources/pathways/__init__.py @@ -5,6 +5,9 @@ import logging from typing import ClassVar +from indra.databases import hgnc_client +from indra.databases import uniprot_client +from indra.databases.identifiers import get_ns_id_from_identifiers import pyobo import pyobo.api.utils from pyobo.struct import has_part @@ -30,9 +33,11 @@ class PyoboProcessor(Processor): def get_nodes(self): # noqa:D102 # TODO add license version = pyobo.api.utils.get_version(self.prefix) - for identifier, name in pyobo.get_id_name_mapping("wikipathways").items(): + for identifier, name in pyobo.get_id_name_mapping(self.prefix).items(): + db_ns, db_id = get_ns_id_from_identifiers(self.prefix, identifier) yield Node( - f"{self.prefix}:{identifier}", + db_ns, + db_id, ["BioEntity"], dict(name=name, version=version), ) @@ -40,12 +45,38 @@ def get_nodes(self): # noqa:D102 def get_relations(self): # noqa:D102 df = pyobo.get_filtered_relations_df(self.prefix, self.relation) for identifier, t_prefix, t_identifier in df.values: + pathway_ns, pathway_id = get_ns_id_from_identifiers(self.prefix, identifier) + gene_ns, gene_id = self.get_gene(t_prefix, t_identifier) + if not gene_ns: + continue yield Relation( - f"{self.prefix}:{identifier}", - f"{t_prefix}:{t_identifier}", - [self.relation_label], + pathway_ns, + pathway_id, + gene_ns, + gene_id, + self.relation_label, + dict(source=self.name), ) + def get_gene(self, prefix, identifier): + if prefix == "ncbigene": + hgnc_id = hgnc_client.get_hgnc_from_entrez(identifier) + if hgnc_id: + return "HGNC", hgnc_id + else: + return "EGID", identifier + elif prefix == "uniprot": + # Some of the UniProt IDs are isoforms, for now, we just strip + # these off. We could do something more principled later. 
+ if "-" in identifier: + identifier, _ = identifier.split("-") + hgnc_id = uniprot_client.get_hgnc_id(identifier) + if hgnc_id: + return "HGNC", hgnc_id + else: + return "UP", identifier + return None, None + class WikipathwaysProcessor(PyoboProcessor): """Processor for WikiPathways gene-pathway links.""" @@ -65,7 +96,3 @@ class ReactomeProcessor(PyoboProcessor): relation = has_part relation_label = "haspart" importable = True - - -if __name__ == "__main__": - WikipathwaysProcessor.cli() diff --git a/src/indra_cogex/sources/pathways/__main__.py b/src/indra_cogex/sources/pathways/__main__.py new file mode 100644 index 000000000..2e9b46bff --- /dev/null +++ b/src/indra_cogex/sources/pathways/__main__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +"""Run the pathways processor using ``python -m indra_cogex.sources.pathways``.""" + +from . import ReactomeProcessor, WikipathwaysProcessor + +if __name__ == "__main__": + ReactomeProcessor.cli() + WikipathwaysProcessor.cli() diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 90d64fece..1d4d57974 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -4,8 +4,8 @@ import csv import gzip +import logging from abc import ABC, abstractmethod -from operator import attrgetter from pathlib import Path from typing import ClassVar, Iterable @@ -14,12 +14,16 @@ from more_click import verbose_option from tqdm import tqdm +from indra.databases import identifiers +from indra.statements.validate import assert_valid_db_refs + from indra_cogex.representation import Node, Relation __all__ = [ "Processor", ] +logger = logging.getLogger(__name__) # deal with importing from wherever with # https://stackoverflow.com/questions/36922843/neo4j-3-x-load-csv-absolute-file-path @@ -76,49 +80,37 @@ def _dump_nodes(self) -> Path: if self.nodes_path.is_file(): return self.nodes_path - nodes = sorted(self.get_nodes(), key=attrgetter("identifier")) + nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id)) + return self._dump_nodes_to_path(nodes, self.nodes_path, sample_path) + + @staticmethod + def _dump_nodes_to_path(nodes, nodes_path, sample_path=None): + nodes = list(validate_nodes(nodes)) metadata = sorted(set(key for node in nodes for key in node.data)) node_rows = ( ( - node.identifier, - "|".join(node.labels), + norm_id(node.db_ns, node.db_id), + ";".join(node.labels), *[node.data.get(key, "") for key in metadata], ) for node in tqdm(nodes, desc="Nodes", unit_scale=True) ) - with gzip.open(self.nodes_path, mode="wt") as node_file: + header = "id:ID", ":LABEL", *metadata + with gzip.open(nodes_path, mode="wt") as node_file: node_writer = csv.writer(node_file, delimiter="\t") # type: ignore - with sample_path.open("w") as node_sample_file: - node_sample_writer = csv.writer(node_sample_file, delimiter="\t") - - header = "id:ID", ":LABEL", *metadata - node_sample_writer.writerow(header) - node_writer.writerow(header) - - for _, node_row in zip(range(10), node_rows): - node_sample_writer.writerow(node_row) - node_writer.writerow(node_row) - + node_writer.writerow(header) + if sample_path: + with sample_path.open("w") as node_sample_file: + node_sample_writer = csv.writer(node_sample_file, delimiter="\t") + node_sample_writer.writerow(header) + for _, node_row in zip(range(10), node_rows): + node_sample_writer.writerow(node_row) + node_writer.writerow(node_row) # Write remaining nodes node_writer.writerows(node_rows) - # cypher = dedent(f'''\ - # CREATE CONSTRAINT ON (n:{ntype}) 
ASSERT n.id IS UNIQUE; - # USING PERIODIC COMMIT - # LOAD CSV WITH HEADERS FROM "file://{data_path.as_posix()}" AS row FIELDTERMINATOR '\\t' - # MERGE (n:{ntype} {{ id: row.identifier }}) - # ''') - # if metadata: - # creates = '\n'.join( - # f'n.{key} = row.{key}' - # for key in metadata - # ) - # cypher += f'ON CREATE SET {creates}' - # with cypher_path.open('w') as file: - # print(cypher, file=file) - - return self.nodes_path + return nodes_path def _dump_edges(self) -> Path: sample_path = self.module.join(name="edges_sample.tsv") @@ -126,13 +118,16 @@ def _dump_edges(self) -> Path: return self.edges_path rels = self.get_relations() - rels = sorted(rels, key=lambda r: (r.source_id, r.target_id)) + rels = validate_relations(rels) + rels = sorted( + rels, key=lambda r: (r.source_ns, r.source_id, r.target_ns, r.target_id) + ) metadata = sorted(set(key for rel in rels for key in rel.data)) edge_rows = ( ( - rel.source_id, - rel.target_id, - "|".join(sorted(rel.labels)), + norm_id(rel.source_ns, rel.source_id), + norm_id(rel.target_ns, rel.target_id), + rel.rel_type, *[rel.data.get(key) for key in metadata], ) for rel in tqdm(rels, desc="Edges", unit_scale=True) @@ -153,3 +148,38 @@ def _dump_edges(self) -> Path: # Write remaining edges edge_writer.writerows(edge_rows) return self.edges_path + + +def norm_id(db_ns, db_id): + identifiers_ns = identifiers.get_identifiers_ns(db_ns) + identifiers_id = db_id + if not identifiers_ns: + identifiers_ns = db_ns.lower() + else: + ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get( + "namespace_embedded" + ) + if ns_embedded: + identifiers_id = identifiers_id[len(identifiers_ns) + 1 :] + return f"{identifiers_ns}:{identifiers_id}" + + +def validate_nodes(nodes): + for idx, node in enumerate(nodes): + try: + assert_valid_db_refs({node.db_ns: node.db_id}) + yield node + except Exception as e: + logger.info(f"{idx}: {node} - {e}") + continue + + +def validate_relations(relations): + for idx, rel in enumerate(relations): + try: + assert_valid_db_refs({rel.source_ns: rel.source_id}) + assert_valid_db_refs({rel.target_ns: rel.target_id}) + yield rel + except Exception as e: + logger.info(f"{idx}: {rel} - {e}") + continue diff --git a/tests/test_assembly.py b/tests/test_assembly.py new file mode 100644 index 000000000..e6185c38d --- /dev/null +++ b/tests/test_assembly.py @@ -0,0 +1,40 @@ +from indra_cogex.assembly import NodeAssembler +from indra_cogex.representation import Node + + +def test_add_nodes(): + na = NodeAssembler([Node("x", "y", ["l"])]) + assert len(na.nodes) == 1 + na.add_nodes([Node("y", "z", ["l"])]) + assert len(na.nodes) == 2 + + +def test_merge_properties(): + n1 = Node("ns", "id", ["l"], {"k1": "v1"}) + n2 = Node("ns", "id", ["l"], {"k2": "v2"}) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert ans[0].data == {"k1": "v1", "k2": "v2"} + + +def test_merge_labels(): + n1 = Node("ns", "id", ["l1", "l2"]) + n2 = Node("ns", "id", ["l2", "l3"]) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert set(ans[0].labels) == {"l1", "l2", "l3"} + + +def test_merge_conflict(): + n1 = Node("ns", "id", ["l"], {"k1": "v1"}) + n2 = Node("ns", "id", ["l"], {"k1": "v2"}) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert ans[0].data == {"k1": "v1"} + assert len(na.conflicts) == 1 + assert na.conflicts[0].key == "k1" + assert na.conflicts[0].val1 == "v1" + assert na.conflicts[0].val2 == "v2" diff --git 
a/tests/test_indra_db.py b/tests/test_indra_db.py new file mode 100644 index 000000000..9172c1e45 --- /dev/null +++ b/tests/test_indra_db.py @@ -0,0 +1,9 @@ +from indra_cogex.sources.indra_db import fix_id + + +def test_fix_id(): + assert fix_id("EFO", "EFO:12345") == ("EFO", "12345") + assert fix_id("GO", "123") == ("GO", "GO:0000123") + assert fix_id("CHEBI", "123") == ("CHEBI", "CHEBI:123") + assert fix_id("UP", "P12345-6") == ("UP", "P12345") + assert fix_id("UP", "SL-123") == ("UPLOC", "SL-123") diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 000000000..6abaa4c44 --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,6 @@ +from indra_cogex.sources.processor import norm_id + + +def test_norm_id(): + assert norm_id("UP", "P12345") == "uniprot:P12345" + assert norm_id("CHEBI", "CHEBI:12345") == "chebi:12345" diff --git a/tox.ini b/tox.ini index 7273319ac..b4bcc7215 100644 --- a/tox.ini +++ b/tox.ini @@ -28,8 +28,6 @@ passenv = deps = coverage pytest -extras = - pandas whitelist_externals = /bin/cat /bin/cp @@ -79,7 +77,7 @@ description = Run the flake8 tool with several plugins (bandit, docstrings, impo [testenv:black] deps = black skip_install = true -commands = black src/ tests/ setup.py +commands = black src/indra_cogex tests/ setup.py description = Run the black tool [testenv:mypy]