From 0acc17d7dadd3efc09c53634060ed62ef38f5881 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 11:31:20 -0400 Subject: [PATCH 01/34] Implement initial node assembler --- src/indra_cogex/assembly/__init__.py | 50 ++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/indra_cogex/assembly/__init__.py diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py new file mode 100644 index 000000000..dcf9b88a3 --- /dev/null +++ b/src/indra_cogex/assembly/__init__.py @@ -0,0 +1,50 @@ +from collections import defaultdict +from typing import List +from indra_cogex.representation import Node + + +class NodeAssembler: + def __init__(self, nodes: List[Node]): + self.nodes = nodes + self.conflicts = [] + + def add_nodes(self, nodes: List[Node]): + self.nodes += nodes + + def assemble_nodes(self) -> List[Node]: + nodes_by_id = defaultdict(list) + for node in self.nodes: + nodes_by_id[node.identifier].append(node) + + assembled_nodes = [ + self.get_aggregate_node(identifier, node_group) + for identifier, node_group in nodes_by_id.items() + ] + return assembled_nodes + + def get_aggregate_node(self, identifier: str, nodes: List[Node]) -> Node: + labels = set() + data = {} + for node in nodes: + labels |= node.labels + for data_key, data_val in node.data.items(): + previous_val = data.get(data_key) + if previous_val and previous_val != data_val: + self.conflicts.append( + Conflict(f"{data_key}:{previous_val}"), f"{data_key}:{data_val}" + ) + else: + data[data_key] = data_val + return Node(identifier, labels, data) + + +class Conflict: + def __init__(self, first, second): + self.first = first + self.second = second + + def __repr__(self): + return str(self) + + def __str__(self): + return f"Conflict({self.first}, {self.second})" From 1fe8a0c5f92a765b1144ef6ba8be8a0c2188d097 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 11:32:45 -0400 Subject: [PATCH 02/34] Split namespace and ID in representation --- src/indra_cogex/representation.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py index 0f558509f..8e24e0a53 100644 --- a/src/indra_cogex/representation.py +++ b/src/indra_cogex/representation.py @@ -12,28 +12,39 @@ class Node: def __init__( self, - identifier: str, + db_ns: str, + db_id: str, labels: Collection[str], data: Optional[Mapping[str, Any]] = None, ): """Initialize the node. - :param identifier: The identifier of the node - :param labels: The collection of labels for the relation. - :param data: The optional data dictionary associated with the node. + Parameters + ---------- + db_ns : + The namespace associated with the node. Uses the INDRA standard. + db_id : + The identifier within the namespace associated with the node. + Uses the INDRA standard. + labels : + A collection of labels for the node. + data : + An optional data dictionary associated with the node. """ - self.identifier = identifier + self.db_ns = db_ns + self.db_id = db_id self.labels = labels self.data = data if data else {} def to_json(self): """Serialize the node to JSON.""" data = {k: v for k, v in self.data.items()} - data["id"] = self.identifier + data["db_ns"] = self.db_ns + data["db_id"] = self.db_id return {"labels": self.labels, "data": data} def _get_data_str(self): - pieces = ["id:'%s'" % self.identifier] + pieces = ["id:'%s:%s'" % (self.db_ns, self.db_id)] for k, v in self.data.items(): if isinstance(v, str): value = "'" + v.replace("'", "\\'") + "'" @@ -60,7 +71,9 @@ class Relation: def __init__( self, + source_ns: str, source_id: str, + target_ns: str, target_id: str, labels: Collection[str], data: Optional[Mapping[str, Any]] = None, @@ -72,7 +85,9 @@ def __init__( :param labels: The collection of labels for the relation. :param data: The optional data dictionary associated with the relation. """ + self.source_ns = source_ns self.source_id = source_id + self.target_ns = target_ns self.target_id = target_id self.labels = list(labels) self.data = data if data else {} @@ -80,7 +95,9 @@ def __init__( def to_json(self): """Serialize the relation to JSON.""" return { + "source_ns": self.source_ns, "source_id": self.source_id, + "target_ns": self.target_ns, "target_id": self.target_id, "labels": self.labels, "data": self.data, From 9c1d8e6a490147b8fe7faa5cde3d4967da1a9f3c Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 11:48:04 -0400 Subject: [PATCH 03/34] Update pathway source identifiers --- src/indra_cogex/sources/pathways/__init__.py | 33 ++++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py index 86d798c3c..770073a48 100644 --- a/src/indra_cogex/sources/pathways/__init__.py +++ b/src/indra_cogex/sources/pathways/__init__.py @@ -5,6 +5,9 @@ import logging from typing import ClassVar +from indra.databases import hgnc_client +from indra.databases import uniprot_client +from indra.databases.identifiers import get_ns_id_from_identifiers import pyobo import pyobo.api.utils from pyobo.struct import has_part @@ -31,8 +34,10 @@ def get_nodes(self): # noqa:D102 # TODO add license version = pyobo.api.utils.get_version(self.prefix) for identifier, name in pyobo.get_id_name_mapping("wikipathways").items(): + db_ns, db_id = get_ns_id_from_identifiers(self.prefix, identifier) yield Node( - f"{self.prefix}:{identifier}", + db_ns, + db_id, ["BioEntity"], dict(name=name, version=version), ) @@ -40,12 +45,34 @@ def get_nodes(self): # noqa:D102 def get_relations(self): # noqa:D102 df = pyobo.get_filtered_relations_df(self.prefix, self.relation) for identifier, t_prefix, t_identifier in df.values: + pathway_ns, pathway_id = get_ns_id_from_identifiers(self.prefix, identifier) + gene_ns, gene_id = self.get_gene(t_prefix, t_identifier) + if not gene_ns: + continue yield Relation( - f"{self.prefix}:{identifier}", - f"{t_prefix}:{t_identifier}", + pathway_ns, + pathway_id, + gene_ns, + gene_id, [self.relation_label], + dict(source=self.name), ) + def get_gene(self, prefix, identifier): + if prefix == "ncbigene": + hgnc_id = hgnc_client.get_hgnc_from_entrez(identifier) + if hgnc_id: + return "HGNC", hgnc_id + else: + return "EGID", identifier + elif prefix == "uniprot": + hgnc_id = uniprot_client.get_hgnc_id(identifier) + if hgnc_id: + return "HGNC", hgnc_id + else: + return "UP", identifier + return None, None + class WikipathwaysProcessor(PyoboProcessor): """Processor for WikiPathways gene-pathway links.""" From ab08e7d290b66eacf1278fe309f175b5b5858241 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 11:48:43 -0400 Subject: [PATCH 04/34] Update namespace/ID standards in sources --- src/indra_cogex/sources/bgee/__init__.py | 13 +++++++++---- src/indra_cogex/sources/goa/__init__.py | 9 +++------ .../sources/indra_ontology/__init__.py | 16 +++++++--------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py index 0eb6c2dfc..3983d3161 100644 --- a/src/indra_cogex/sources/bgee/__init__.py +++ b/src/indra_cogex/sources/bgee/__init__.py @@ -34,18 +34,23 @@ def __init__(self, path: Union[None, str, Path] = None): def get_nodes(self): # noqa:D102 for context_id in self.expressions: yield Node( - context_id, + *context_id.split(":", maxsplit=1), ["BioEntity"], data={"name": pyobo.get_name_by_curie(context_id)}, ) for hgnc_id in set.union(*[set(v) for v in self.expressions.values()]): yield Node( - f"HGNC:{hgnc_id}", + "HGNC", + hgnc_id, ["BioEntity"], data={"name": pyobo.get_name("hgnc", hgnc_id)}, ) def get_relations(self): # noqa:D102 - for context_id, hgnc_ids in self.expressions.items(): + data = {"source": self.name} + for context, hgnc_ids in self.expressions.items(): + context_ns, context_id = context.split(":", maxsplit=1) for hgnc_id in hgnc_ids: - yield Relation(f"HGNC:{hgnc_id}", context_id, [self.rel_type]) + yield Relation( + "HGNC", hgnc_id, context_ns, context_id, [self.rel_type], data + ) diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py index f67fe2f1d..bff2452ea 100644 --- a/src/indra_cogex/sources/goa/__init__.py +++ b/src/indra_cogex/sources/goa/__init__.py @@ -44,18 +44,15 @@ def get_nodes(self): # noqa:D102 for go_node in self.df["GO_ID"].unique(): yield Node(go_node, ["BioEntity"]) for hgnc_id in self.df["HGNC_ID"].unique(): - yield Node(f"HGNC:{hgnc_id}", ["BioEntity"]) + yield Node("HGNC", hgnc_id, ["BioEntity"]) def get_relations(self): # noqa:D102 rel_type = "associated_with" for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]: all_ecs = ",".join(sorted(set(ecs))) - source = f"HGNC:{hgnc_id}" - # Note that we don't add the extra GO: by current convention - target = go_id # Possible properties could be e.g., evidence codes - data = {"evidence_codes:string": all_ecs} - yield Relation(source, target, [rel_type], data) + data = {"evidence_codes:string": all_ecs, "source": self.name} + yield Relation("HGNC", hgnc_id, "GO", go_id, [rel_type], data) def load_goa(url: str) -> pd.DataFrame: diff --git a/src/indra_cogex/sources/indra_ontology/__init__.py b/src/indra_cogex/sources/indra_ontology/__init__.py index 928a7919f..8f69957f1 100644 --- a/src/indra_cogex/sources/indra_ontology/__init__.py +++ b/src/indra_cogex/sources/indra_ontology/__init__.py @@ -34,17 +34,15 @@ def __init__(self, ontology: Optional[IndraOntology] = None): def get_nodes(self): # noqa:D102 for node, data in self.ontology.nodes(data=True): - yield Node(_norm(node), ["BioEntity"], data) + db_ns, db_id = self.ontology.get_ns_id(node) + yield Node(db_ns, db_id, ["BioEntity"], data) def get_relations(self): # noqa:D102 for source, target, data in self.ontology.edges(data=True): + source_ns, source_id = self.ontology.get_ns_id(source) + target_ns, target_id = self.ontology.get_ns_id(target) data = copy.copy(data) edge_type = data.pop("type") - yield Relation(_norm(source), _norm(target), [edge_type], data) - - -def _norm(node: str) -> str: - ns, identifier = node.split(":", 1) - if identifier.startswith(f"{ns}:"): - identifier = identifier[len(ns) + 1 :] - return f"{ns}:{identifier}" + yield Relation( + source_ns, source_id, target_ns, target_id, [edge_type], data + ) From 2a92bed18a630fb3a9f9a40f4ef3968238a36e53 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 15:02:07 -0400 Subject: [PATCH 05/34] Restructure node generation --- src/indra_cogex/sources/indra_db/__init__.py | 34 ++++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index db0d4fb01..3f52c843c 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -42,12 +42,6 @@ def __init__(self, path: Union[None, str, Path] = None): logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path) self.df = df for side in "AB": - self.df[side] = [ - f"{prefix}:{identifier}" - for prefix, identifier in self.df[ - [f"ag{side}_ns", f"ag{side}_id"] - ].values - ] # A lot of the names in the SIF dump are all over self.df[f"ag{side}_name"] = [ bio_ontology.get_name(prefix, identifier) @@ -57,21 +51,19 @@ def __init__(self, path: Union[None, str, Path] = None): ] def get_nodes(self): # noqa:D102 - df = ( - pd.concat([self._get_nodes("A"), self._get_nodes("B")], ignore_index=True) - .drop_duplicates() - .sort_values("curie") - ) - for curie, name in df.values: - yield Node(curie, ["BioEntity"], dict(name=name)) - - def _get_nodes(self, side: str) -> pd.DataFrame: - return self.df[[side, f"ag{side}_name"]].rename( - columns={ - side: "curie", - f"ag{side}_name": "name", - } - ) + df = pd.concat( + [ + self.df[["agA_ns", "agA_id", "agA_name"]].rename( + {"agA_ns": "ns", "agA_id": "id", "agA_name": "name"} + ), + self.df[["agB_ns", "agB_id", "agB_name"]].rename( + {"agB_ns": "ns", "agB_id": "id", "agB_name": "name"} + ), + ], + ignore_index=True, + ).drop_duplicates() + for db_ns, db_id, name in df.values: + yield Node(db_ns, db_id, ["BioEntity"], dict(name=name)) def get_relations(self): # noqa:D102 columns = ["A", "B", "stmt_type", "evidence_count", "stmt_hash"] From 2e489024634f92e89d387f2ba07fa5515303c64f Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 15:16:07 -0400 Subject: [PATCH 06/34] Restructure relation generation --- src/indra_cogex/sources/indra_db/__init__.py | 31 +++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index 3f52c843c..a943f965d 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -66,9 +66,32 @@ def get_nodes(self): # noqa:D102 yield Node(db_ns, db_id, ["BioEntity"], dict(name=name)) def get_relations(self): # noqa:D102 - columns = ["A", "B", "stmt_type", "evidence_count", "stmt_hash"] - for source, target, stmt_type, ev_count, stmt_hash in ( + columns = [ + "agA_ns", + "agA_id", + "agB_ns", + "agB_id", + "stmt_type", + "source_counts", + "stmt_hash", + ] + for ( + source_ns, + source_id, + target_ns, + target_id, + stmt_type, + source_counts, + stmt_hash, + ) in ( self.df[columns].drop_duplicates().values ): - data = {"stmt_hash:long": stmt_hash, "evidence_count:long": ev_count} - yield Relation(source, target, [stmt_type], data) + data = {"stmt_hash:long": stmt_hash, "evidence_count:str": source_counts} + yield Relation( + source_ns, + source_id, + target_ns, + target_id, + [stmt_type, "Statement"], + data, + ) From e554bcf98e35a350e074cfd6a2ed55bc90c5b88c Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 15:27:28 -0400 Subject: [PATCH 07/34] Reformat processor with new ID structure --- src/indra_cogex/sources/processor.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 90d64fece..4eb743ca9 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -14,6 +14,8 @@ from more_click import verbose_option from tqdm import tqdm +from indra.databases import identifiers + from indra_cogex.representation import Node, Relation __all__ = [ @@ -76,11 +78,11 @@ def _dump_nodes(self) -> Path: if self.nodes_path.is_file(): return self.nodes_path - nodes = sorted(self.get_nodes(), key=attrgetter("identifier")) + nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id)) metadata = sorted(set(key for node in nodes for key in node.data)) node_rows = ( ( - node.identifier, + norm_id(node.db_ns, node.db_id), "|".join(node.labels), *[node.data.get(key, "") for key in metadata], ) @@ -126,12 +128,14 @@ def _dump_edges(self) -> Path: return self.edges_path rels = self.get_relations() - rels = sorted(rels, key=lambda r: (r.source_id, r.target_id)) + rels = sorted( + rels, key=lambda r: (r.source_ns, r.source_id, r.target_ns, r.target_id) + ) metadata = sorted(set(key for rel in rels for key in rel.data)) edge_rows = ( ( - rel.source_id, - rel.target_id, + norm_id(rel.source_ns, rel.source_id), + norm_id(rel.target_ns, rel.target_id), "|".join(sorted(rel.labels)), *[rel.data.get(key) for key in metadata], ) @@ -153,3 +157,14 @@ def _dump_edges(self) -> Path: # Write remaining edges edge_writer.writerows(edge_rows) return self.edges_path + + +def norm_id(db_ns, db_id): + identifiers_ns = identifiers.get_identifiers_ns(db_ns) + identifiers_id = db_id + ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get( + "namespace_embedded" + ) + if ns_embedded: + identifiers_id = identifiers_id[len(identifiers_ns) :] + return f"{identifiers_ns}:{identifiers_id}" From 1eee55aae478188165d4c2f9da525a6a4db5e217 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 15:34:03 -0400 Subject: [PATCH 08/34] Test ID normalization --- src/indra_cogex/sources/processor.py | 2 +- tests/test_processor.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 tests/test_processor.py diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 4eb743ca9..d22408056 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -166,5 +166,5 @@ def norm_id(db_ns, db_id): "namespace_embedded" ) if ns_embedded: - identifiers_id = identifiers_id[len(identifiers_ns) :] + identifiers_id = identifiers_id[len(identifiers_ns) + 1 :] return f"{identifiers_ns}:{identifiers_id}" diff --git a/tests/test_processor.py b/tests/test_processor.py new file mode 100644 index 000000000..6abaa4c44 --- /dev/null +++ b/tests/test_processor.py @@ -0,0 +1,6 @@ +from indra_cogex.sources.processor import norm_id + + +def test_norm_id(): + assert norm_id("UP", "P12345") == "uniprot:P12345" + assert norm_id("CHEBI", "CHEBI:12345") == "chebi:12345" From 5950634a19a9c540db2c33ecdd6f25ef97e1f946 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 15:54:17 -0400 Subject: [PATCH 09/34] Implement node assembly upon import --- src/indra_cogex/sources/cli.py | 16 +++++++++-- src/indra_cogex/sources/processor.py | 41 ++++++++++------------------ 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py index bdc4b0014..8e892d087 100644 --- a/src/indra_cogex/sources/cli.py +++ b/src/indra_cogex/sources/cli.py @@ -6,9 +6,12 @@ from textwrap import dedent import click +import pystow from more_click import verbose_option from . import processor_resolver +from .processor import Processor +from ..assembly import NodeAssembler @click.command() @@ -32,6 +35,7 @@ def main(load: bool, load_only: bool, force: bool): """Generate and import Neo4j nodes and edges tables.""" paths = [] + na = NodeAssembler() for processor_cls in processor_resolver: if not processor_cls.importable: continue @@ -44,9 +48,17 @@ def main(load: bool, load_only: bool, force: bool): ): click.secho("Processing...", fg="green") processor = processor_cls() + na.add_nodes(list(processor.get_nodes())) processor.dump() paths.append((processor_cls.nodes_path, processor_cls.edges_path)) + # Now create and dump the assembled nodes + assembled_nodes = na.assemble_nodes() + assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id)) + metadata = sorted(set(key for node in assembled_nodes for key in node.data)) + nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz") + Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path) + if load or load_only: command = dedent( """\ @@ -57,8 +69,8 @@ def main(load: bool, load_only: bool, force: bool): --skip-bad-relationships=true """ ).rstrip() - for node_path, edge_path in paths: - command += f"\\\n --nodes {node_path} \\\n --relationships {edge_path}" + for _, edge_path in paths: + command += f"\\\n --relationships {edge_path}" click.secho("Running shell command:") click.secho(command, fg="blue") diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index d22408056..6d3dc237b 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -80,6 +80,10 @@ def _dump_nodes(self) -> Path: nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id)) metadata = sorted(set(key for node in nodes for key in node.data)) + self._dump_nodes_to_path(nodes, metadata, self.nodes_path, sample_path) + + @staticmethod + def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None): node_rows = ( ( norm_id(node.db_ns, node.db_id), @@ -89,38 +93,23 @@ def _dump_nodes(self) -> Path: for node in tqdm(nodes, desc="Nodes", unit_scale=True) ) - with gzip.open(self.nodes_path, mode="wt") as node_file: + with gzip.open(nodes_path, mode="wt") as node_file: node_writer = csv.writer(node_file, delimiter="\t") # type: ignore - with sample_path.open("w") as node_sample_file: - node_sample_writer = csv.writer(node_sample_file, delimiter="\t") - - header = "id:ID", ":LABEL", *metadata - node_sample_writer.writerow(header) - node_writer.writerow(header) + if sample_path: + with sample_path.open("w") as node_sample_file: + node_sample_writer = csv.writer(node_sample_file, delimiter="\t") - for _, node_row in zip(range(10), node_rows): - node_sample_writer.writerow(node_row) - node_writer.writerow(node_row) + header = "id:ID", ":LABEL", *metadata + node_sample_writer.writerow(header) + node_writer.writerow(header) + for _, node_row in zip(range(10), node_rows): + node_sample_writer.writerow(node_row) + node_writer.writerow(node_row) # Write remaining nodes node_writer.writerows(node_rows) - # cypher = dedent(f'''\ - # CREATE CONSTRAINT ON (n:{ntype}) ASSERT n.id IS UNIQUE; - # USING PERIODIC COMMIT - # LOAD CSV WITH HEADERS FROM "file://{data_path.as_posix()}" AS row FIELDTERMINATOR '\\t' - # MERGE (n:{ntype} {{ id: row.identifier }}) - # ''') - # if metadata: - # creates = '\n'.join( - # f'n.{key} = row.{key}' - # for key in metadata - # ) - # cypher += f'ON CREATE SET {creates}' - # with cypher_path.open('w') as file: - # print(cypher, file=file) - - return self.nodes_path + return nodes_path def _dump_edges(self) -> Path: sample_path = self.module.join(name="edges_sample.tsv") From 482ddd71b7d6d1e8319712e8b44b22ded8d09ebe Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 15:57:34 -0400 Subject: [PATCH 10/34] Use new representation in assembler --- src/indra_cogex/assembly/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py index dcf9b88a3..0bc63839b 100644 --- a/src/indra_cogex/assembly/__init__.py +++ b/src/indra_cogex/assembly/__init__.py @@ -14,15 +14,15 @@ def add_nodes(self, nodes: List[Node]): def assemble_nodes(self) -> List[Node]: nodes_by_id = defaultdict(list) for node in self.nodes: - nodes_by_id[node.identifier].append(node) + nodes_by_id[(node.db_ns, node.db_id)].append(node) assembled_nodes = [ - self.get_aggregate_node(identifier, node_group) - for identifier, node_group in nodes_by_id.items() + self.get_aggregate_node(db_ns, db_id, node_group) + for (db_ns, db_id), node_group in nodes_by_id.items() ] return assembled_nodes - def get_aggregate_node(self, identifier: str, nodes: List[Node]) -> Node: + def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: labels = set() data = {} for node in nodes: @@ -35,7 +35,7 @@ def get_aggregate_node(self, identifier: str, nodes: List[Node]) -> Node: ) else: data[data_key] = data_val - return Node(identifier, labels, data) + return Node(db_ns, db_id, labels, data) class Conflict: From e36e9668beb276819be54589669d9f63671d44fd Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 16:29:54 -0400 Subject: [PATCH 11/34] Fix Bgee source identifiers --- src/indra_cogex/assembly/__init__.py | 6 +++--- src/indra_cogex/representation.py | 2 ++ src/indra_cogex/sources/bgee/__init__.py | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py index 0bc63839b..cd59194a2 100644 --- a/src/indra_cogex/assembly/__init__.py +++ b/src/indra_cogex/assembly/__init__.py @@ -1,11 +1,11 @@ from collections import defaultdict -from typing import List +from typing import List, Optional from indra_cogex.representation import Node class NodeAssembler: - def __init__(self, nodes: List[Node]): - self.nodes = nodes + def __init__(self, nodes: Optional[List[Node]] = None): + self.nodes = nodes if nodes else [] self.conflicts = [] def add_nodes(self, nodes: List[Node]): diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py index 8e24e0a53..64169eeec 100644 --- a/src/indra_cogex/representation.py +++ b/src/indra_cogex/representation.py @@ -31,6 +31,8 @@ def __init__( data : An optional data dictionary associated with the node. """ + if not db_ns or not db_id: + raise ValueError("Invalid namespace or ID.") self.db_ns = db_ns self.db_id = db_id self.labels = labels diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py index 3983d3161..188cd0e43 100644 --- a/src/indra_cogex/sources/bgee/__init__.py +++ b/src/indra_cogex/sources/bgee/__init__.py @@ -32,9 +32,11 @@ def __init__(self, path: Union[None, str, Path] = None): self.expressions = pickle.load(fh) def get_nodes(self): # noqa:D102 - for context_id in self.expressions: + for context in self.expressions: + context_ns, context_id = get_context(context) yield Node( - *context_id.split(":", maxsplit=1), + context_ns, + context_id, ["BioEntity"], data={"name": pyobo.get_name_by_curie(context_id)}, ) @@ -49,8 +51,15 @@ def get_nodes(self): # noqa:D102 def get_relations(self): # noqa:D102 data = {"source": self.name} for context, hgnc_ids in self.expressions.items(): - context_ns, context_id = context.split(":", maxsplit=1) + context_ns, context_id = get_context(context) for hgnc_id in hgnc_ids: yield Relation( "HGNC", hgnc_id, context_ns, context_id, [self.rel_type], data ) + + +def get_context(context): + context_ns, context_id = context.split(":", maxsplit=1) + if context_ns == "UBERON": + context_id = f"UBERON:{context_id}" + return context_ns, context_id From d6174fbad7d2dda2eb17e26ecc051e3359c9e0b0 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 16:32:59 -0400 Subject: [PATCH 12/34] Fix a corner case for identifiers mapping --- src/indra_cogex/sources/processor.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 6d3dc237b..9107fabf4 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -151,9 +151,12 @@ def _dump_edges(self) -> Path: def norm_id(db_ns, db_id): identifiers_ns = identifiers.get_identifiers_ns(db_ns) identifiers_id = db_id - ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get( - "namespace_embedded" - ) - if ns_embedded: - identifiers_id = identifiers_id[len(identifiers_ns) + 1 :] + if not identifiers_ns: + identifiers_ns = db_ns.lower() + else: + ns_embedded = identifiers.identifiers_registry.get(identifiers_ns, {}).get( + "namespace_embedded" + ) + if ns_embedded: + identifiers_id = identifiers_id[len(identifiers_ns) + 1 :] return f"{identifiers_ns}:{identifiers_id}" From 104e0874870da72bcf131cbe7b86783d732b761f Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 19:20:23 -0400 Subject: [PATCH 13/34] Implement validation and fix Relation str --- src/indra_cogex/representation.py | 5 ++++- src/indra_cogex/sources/bgee/__init__.py | 2 ++ src/indra_cogex/sources/processor.py | 27 +++++++++++++++++++++++- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py index 64169eeec..7b5581d60 100644 --- a/src/indra_cogex/representation.py +++ b/src/indra_cogex/representation.py @@ -108,7 +108,10 @@ def to_json(self): def __str__(self): # noqa:D105 data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()]) labels_str = ":".join(self.labels) - return f"({self.source_id})-[:{labels_str} {data_str}]->" f"({self.target_id})" + return ( + f"({self.source_ns}, {self.source_id})-[:{labels_str} {data_str}]->" + f"({self.target_ns}, {self.target_id})" + ) def __repr__(self): # noqa:D105 return str(self) diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py index 188cd0e43..e773490f9 100644 --- a/src/indra_cogex/sources/bgee/__init__.py +++ b/src/indra_cogex/sources/bgee/__init__.py @@ -62,4 +62,6 @@ def get_context(context): context_ns, context_id = context.split(":", maxsplit=1) if context_ns == "UBERON": context_id = f"UBERON:{context_id}" + elif context_ns == "CL": + context_id = f"CL:{context_id}" return context_ns, context_id diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 9107fabf4..ef5c28ee1 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -4,8 +4,8 @@ import csv import gzip +import logging from abc import ABC, abstractmethod -from operator import attrgetter from pathlib import Path from typing import ClassVar, Iterable @@ -15,6 +15,7 @@ from tqdm import tqdm from indra.databases import identifiers +from indra.statements.validate import assert_valid_db_refs from indra_cogex.representation import Node, Relation @@ -22,6 +23,7 @@ "Processor", ] +logger = logging.getLogger(__name__) # deal with importing from wherever with # https://stackoverflow.com/questions/36922843/neo4j-3-x-load-csv-absolute-file-path @@ -84,6 +86,7 @@ def _dump_nodes(self) -> Path: @staticmethod def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None): + nodes = list(validate_nodes(nodes)) node_rows = ( ( norm_id(node.db_ns, node.db_id), @@ -117,6 +120,7 @@ def _dump_edges(self) -> Path: return self.edges_path rels = self.get_relations() + rels = validate_relations(rels) rels = sorted( rels, key=lambda r: (r.source_ns, r.source_id, r.target_ns, r.target_id) ) @@ -160,3 +164,24 @@ def norm_id(db_ns, db_id): if ns_embedded: identifiers_id = identifiers_id[len(identifiers_ns) + 1 :] return f"{identifiers_ns}:{identifiers_id}" + + +def validate_nodes(nodes): + for idx, node in enumerate(nodes): + try: + assert_valid_db_refs({node.db_ns: node.db_id}) + yield node + except Exception as e: + logger.info(f"{idx}: {node} - {e}") + continue + + +def validate_relations(relations): + for idx, rel in enumerate(relations): + try: + assert_valid_db_refs({rel.source_ns: rel.source_id}) + assert_valid_db_refs({rel.target_ns: rel.target_id}) + yield rel + except Exception as e: + logger.info(f"{idx}: {rel} - {e}") + continue From 877e41d0dfd425625854f3a3fc34c4026c70f542 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 19:23:45 -0400 Subject: [PATCH 14/34] Fix GO node construction --- src/indra_cogex/sources/goa/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py index bff2452ea..a97155d88 100644 --- a/src/indra_cogex/sources/goa/__init__.py +++ b/src/indra_cogex/sources/goa/__init__.py @@ -42,7 +42,7 @@ def __init__(self): def get_nodes(self): # noqa:D102 for go_node in self.df["GO_ID"].unique(): - yield Node(go_node, ["BioEntity"]) + yield Node("GO", go_node, ["BioEntity"]) for hgnc_id in self.df["HGNC_ID"].unique(): yield Node("HGNC", hgnc_id, ["BioEntity"]) From 857d4e40e0b2fa2b362a3d00f7f2fc75de2e5634 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 19:35:31 -0400 Subject: [PATCH 15/34] Fix pathways prefix and add main --- src/indra_cogex/sources/pathways/__init__.py | 6 +----- src/indra_cogex/sources/pathways/__main__.py | 9 +++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) create mode 100644 src/indra_cogex/sources/pathways/__main__.py diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py index 770073a48..5f7c4111d 100644 --- a/src/indra_cogex/sources/pathways/__init__.py +++ b/src/indra_cogex/sources/pathways/__init__.py @@ -33,7 +33,7 @@ class PyoboProcessor(Processor): def get_nodes(self): # noqa:D102 # TODO add license version = pyobo.api.utils.get_version(self.prefix) - for identifier, name in pyobo.get_id_name_mapping("wikipathways").items(): + for identifier, name in pyobo.get_id_name_mapping(self.prefix).items(): db_ns, db_id = get_ns_id_from_identifiers(self.prefix, identifier) yield Node( db_ns, @@ -92,7 +92,3 @@ class ReactomeProcessor(PyoboProcessor): relation = has_part relation_label = "haspart" importable = True - - -if __name__ == "__main__": - WikipathwaysProcessor.cli() diff --git a/src/indra_cogex/sources/pathways/__main__.py b/src/indra_cogex/sources/pathways/__main__.py new file mode 100644 index 000000000..2e9b46bff --- /dev/null +++ b/src/indra_cogex/sources/pathways/__main__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +"""Run the pathways processor using ``python -m indra_cogex.sources.pathways``.""" + +from . import ReactomeProcessor, WikipathwaysProcessor + +if __name__ == "__main__": + ReactomeProcessor.cli() + WikipathwaysProcessor.cli() From 259d75acaaf3a0fe2aebcd8db2e4280cd9ce922e Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 19:40:01 -0400 Subject: [PATCH 16/34] Handle UP isoforms --- src/indra_cogex/sources/pathways/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py index 5f7c4111d..88eab72bf 100644 --- a/src/indra_cogex/sources/pathways/__init__.py +++ b/src/indra_cogex/sources/pathways/__init__.py @@ -66,6 +66,10 @@ def get_gene(self, prefix, identifier): else: return "EGID", identifier elif prefix == "uniprot": + # Some of the UniProt IDs are isoforms, for now, we just strip + # these off. We could do something more principled later. + if "-" in identifier: + identifier, _ = identifier.split("-") hgnc_id = uniprot_client.get_hgnc_id(identifier) if hgnc_id: return "HGNC", hgnc_id From 58b3b9235969de303c1a1d832e363a78c8ab281b Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 21:17:09 -0400 Subject: [PATCH 17/34] Implement ID fixing for SIF dump --- src/indra_cogex/sources/indra_db/__init__.py | 24 +++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index a943f965d..d36dbe2a5 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -2,9 +2,11 @@ """Processor for the INDRA database.""" +import json import logging import pickle from pathlib import Path +from tqdm import tqdm from typing import Union import humanize @@ -12,10 +14,12 @@ import pystow from indra.ontology.bio import bio_ontology +from indra.databases.identifiers import ensure_prefix_if_needed from indra_cogex.representation import Node, Relation from indra_cogex.sources.processor import Processor logger = logging.getLogger(__name__) +tqdm.pandas() # If you don't have the data, get it from: @@ -41,6 +45,7 @@ def __init__(self, path: Union[None, str, Path] = None): df = pickle.load(fh) logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path) self.df = df + logger.info("Fixing ID and naming issues...") for side in "AB": # A lot of the names in the SIF dump are all over self.df[f"ag{side}_name"] = [ @@ -49,15 +54,19 @@ def __init__(self, path: Union[None, str, Path] = None): [f"ag{side}_ns", f"ag{side}_id"] ].values ] + self.df[f"ag{side}_id"] = self.df.progress_apply( + lambda row: fix_id(row[f"ag{side}_ns"], row[f"ag{side}_id"]), axis=1 + ) + self.df["source_counts"] = self.df["source_counts"].apply(json.dumps) def get_nodes(self): # noqa:D102 df = pd.concat( [ self.df[["agA_ns", "agA_id", "agA_name"]].rename( - {"agA_ns": "ns", "agA_id": "id", "agA_name": "name"} + columns={"agA_ns": "ns", "agA_id": "id", "agA_name": "name"} ), self.df[["agB_ns", "agB_id", "agB_name"]].rename( - {"agB_ns": "ns", "agB_id": "id", "agB_name": "name"} + columns={"agB_ns": "ns", "agB_id": "id", "agB_name": "name"} ), ], ignore_index=True, @@ -86,7 +95,7 @@ def get_relations(self): # noqa:D102 ) in ( self.df[columns].drop_duplicates().values ): - data = {"stmt_hash:long": stmt_hash, "evidence_count:str": source_counts} + data = {"stmt_hash:long": stmt_hash, "evidence_count:string": source_counts} yield Relation( source_ns, source_id, @@ -95,3 +104,12 @@ def get_relations(self): # noqa:D102 [stmt_type, "Statement"], data, ) + + +def fix_id(db_ns, db_id): + if db_ns == "GO": + if db_id.isnumeric(): + db_id = "0" * (7 - len(db_id)) + db_id + + db_id = ensure_prefix_if_needed(db_ns, db_id) + return db_id From 86df4c2aaa66fbf3c6cbcd104211c0e9894afb57 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 21:26:32 -0400 Subject: [PATCH 18/34] Improve the import approach, still issues to fix --- src/indra_cogex/sources/cli.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py index 8e892d087..007c96744 100644 --- a/src/indra_cogex/sources/cli.py +++ b/src/indra_cogex/sources/cli.py @@ -48,16 +48,21 @@ def main(load: bool, load_only: bool, force: bool): ): click.secho("Processing...", fg="green") processor = processor_cls() + # FIXME: this is redundant, we get nodes twice na.add_nodes(list(processor.get_nodes())) processor.dump() paths.append((processor_cls.nodes_path, processor_cls.edges_path)) - # Now create and dump the assembled nodes - assembled_nodes = na.assemble_nodes() - assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id)) - metadata = sorted(set(key for node in assembled_nodes for key in node.data)) + # FIXME: This doesn't work unless the processors are also running and + # getting nodes nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz") - Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path) + if not load_only: + if force or not nodes_path.is_file(): + # Now create and dump the assembled nodes + assembled_nodes = na.assemble_nodes() + assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id)) + metadata = sorted(set(key for node in assembled_nodes for key in node.data)) + Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path) if load or load_only: command = dedent( @@ -66,8 +71,10 @@ def main(load: bool, load_only: bool, force: bool): --database=indra \\ --delimiter='TAB' \\ --skip-duplicate-nodes=true \\ - --skip-bad-relationships=true + --skip-bad-relationships=true \\ + --nodes %s """ + % nodes_path ).rstrip() for _, edge_path in paths: command += f"\\\n --relationships {edge_path}" From 1d57332bd5105771e5770edc5d2a1a8bd1686dc5 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 21:28:41 -0400 Subject: [PATCH 19/34] Switch to f-string --- src/indra_cogex/sources/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py index 007c96744..a4767572e 100644 --- a/src/indra_cogex/sources/cli.py +++ b/src/indra_cogex/sources/cli.py @@ -66,15 +66,14 @@ def main(load: bool, load_only: bool, force: bool): if load or load_only: command = dedent( - """\ + f"""\ neo4j-admin import \\ --database=indra \\ --delimiter='TAB' \\ --skip-duplicate-nodes=true \\ --skip-bad-relationships=true \\ - --nodes %s + --nodes {nodes_path} """ - % nodes_path ).rstrip() for _, edge_path in paths: command += f"\\\n --relationships {edge_path}" From 3b7b2cf1d95c63ea9dca2f3b3a7dab636b9672e4 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 21:47:05 -0400 Subject: [PATCH 20/34] Fix assembly code --- src/indra_cogex/assembly/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py index cd59194a2..d5456171c 100644 --- a/src/indra_cogex/assembly/__init__.py +++ b/src/indra_cogex/assembly/__init__.py @@ -26,12 +26,12 @@ def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: labels = set() data = {} for node in nodes: - labels |= node.labels + labels |= set(node.labels) for data_key, data_val in node.data.items(): previous_val = data.get(data_key) if previous_val and previous_val != data_val: self.conflicts.append( - Conflict(f"{data_key}:{previous_val}"), f"{data_key}:{data_val}" + Conflict(f"{data_key}:{previous_val}", f"{data_key}:{data_val}") ) else: data[data_key] = data_val From 434f9a0947c691ab882c6442371c0432b94d22f7 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 21:57:30 -0400 Subject: [PATCH 21/34] Change implementation of conflicts and test --- src/indra_cogex/assembly/__init__.py | 14 +++++----- tests/test_assembly.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 7 deletions(-) create mode 100644 tests/test_assembly.py diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py index d5456171c..165a7fe8c 100644 --- a/src/indra_cogex/assembly/__init__.py +++ b/src/indra_cogex/assembly/__init__.py @@ -30,21 +30,21 @@ def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: for data_key, data_val in node.data.items(): previous_val = data.get(data_key) if previous_val and previous_val != data_val: - self.conflicts.append( - Conflict(f"{data_key}:{previous_val}", f"{data_key}:{data_val}") - ) + self.conflicts.append(Conflict(data_key, previous_val, data_val)) else: data[data_key] = data_val + labels = sorted(labels) return Node(db_ns, db_id, labels, data) class Conflict: - def __init__(self, first, second): - self.first = first - self.second = second + def __init__(self, key, val1, val2): + self.key = key + self.val1 = val1 + self.val2 = val2 def __repr__(self): return str(self) def __str__(self): - return f"Conflict({self.first}, {self.second})" + return f"Conflict({self.key}, {self.val1}, {self.val2})" diff --git a/tests/test_assembly.py b/tests/test_assembly.py new file mode 100644 index 000000000..e6185c38d --- /dev/null +++ b/tests/test_assembly.py @@ -0,0 +1,40 @@ +from indra_cogex.assembly import NodeAssembler +from indra_cogex.representation import Node + + +def test_add_nodes(): + na = NodeAssembler([Node("x", "y", ["l"])]) + assert len(na.nodes) == 1 + na.add_nodes([Node("y", "z", ["l"])]) + assert len(na.nodes) == 2 + + +def test_merge_properties(): + n1 = Node("ns", "id", ["l"], {"k1": "v1"}) + n2 = Node("ns", "id", ["l"], {"k2": "v2"}) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert ans[0].data == {"k1": "v1", "k2": "v2"} + + +def test_merge_labels(): + n1 = Node("ns", "id", ["l1", "l2"]) + n2 = Node("ns", "id", ["l2", "l3"]) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert set(ans[0].labels) == {"l1", "l2", "l3"} + + +def test_merge_conflict(): + n1 = Node("ns", "id", ["l"], {"k1": "v1"}) + n2 = Node("ns", "id", ["l"], {"k1": "v2"}) + na = NodeAssembler([n1, n2]) + ans = na.assemble_nodes() + assert len(ans) == 1 + assert ans[0].data == {"k1": "v1"} + assert len(na.conflicts) == 1 + assert na.conflicts[0].key == "k1" + assert na.conflicts[0].val1 == "v1" + assert na.conflicts[0].val2 == "v2" From b890977d40e2c3b5fb2eaf20c0cfc9e4e4e248a7 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Thu, 13 May 2021 22:11:36 -0400 Subject: [PATCH 22/34] Fix assembled node dumping --- src/indra_cogex/sources/cli.py | 3 +-- src/indra_cogex/sources/processor.py | 12 +++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/indra_cogex/sources/cli.py b/src/indra_cogex/sources/cli.py index a4767572e..f29759ae7 100644 --- a/src/indra_cogex/sources/cli.py +++ b/src/indra_cogex/sources/cli.py @@ -61,8 +61,7 @@ def main(load: bool, load_only: bool, force: bool): # Now create and dump the assembled nodes assembled_nodes = na.assemble_nodes() assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id)) - metadata = sorted(set(key for node in assembled_nodes for key in node.data)) - Processor._dump_nodes_to_path(assembled_nodes, metadata, nodes_path) + Processor._dump_nodes_to_path(assembled_nodes, nodes_path) if load or load_only: command = dedent( diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index ef5c28ee1..8af4aec80 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -81,12 +81,12 @@ def _dump_nodes(self) -> Path: return self.nodes_path nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id)) - metadata = sorted(set(key for node in nodes for key in node.data)) - self._dump_nodes_to_path(nodes, metadata, self.nodes_path, sample_path) + self._dump_nodes_to_path(nodes, self.nodes_path, sample_path) @staticmethod - def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None): + def _dump_nodes_to_path(nodes, nodes_path, sample_path=None): nodes = list(validate_nodes(nodes)) + metadata = sorted(set(key for node in nodes for key in node.data)) node_rows = ( ( norm_id(node.db_ns, node.db_id), @@ -96,16 +96,14 @@ def _dump_nodes_to_path(nodes, metadata, nodes_path, sample_path=None): for node in tqdm(nodes, desc="Nodes", unit_scale=True) ) + header = "id:ID", ":LABEL", *metadata with gzip.open(nodes_path, mode="wt") as node_file: node_writer = csv.writer(node_file, delimiter="\t") # type: ignore + node_writer.writerow(header) if sample_path: with sample_path.open("w") as node_sample_file: node_sample_writer = csv.writer(node_sample_file, delimiter="\t") - - header = "id:ID", ":LABEL", *metadata node_sample_writer.writerow(header) - node_writer.writerow(header) - for _, node_row in zip(range(10), node_rows): node_sample_writer.writerow(node_row) node_writer.writerow(node_row) From a93c41a6faaec376a229ebab8dbce6b06facf97d Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Fri, 14 May 2021 00:40:32 -0400 Subject: [PATCH 23/34] Change labels to rel_type and fix node label separator --- src/indra_cogex/representation.py | 11 +++++------ src/indra_cogex/sources/bgee/__init__.py | 2 +- src/indra_cogex/sources/goa/__init__.py | 2 +- src/indra_cogex/sources/indra_db/__init__.py | 4 ++-- src/indra_cogex/sources/indra_ontology/__init__.py | 4 +--- src/indra_cogex/sources/pathways/__init__.py | 2 +- src/indra_cogex/sources/processor.py | 4 ++-- 7 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py index 7b5581d60..4d3000413 100644 --- a/src/indra_cogex/representation.py +++ b/src/indra_cogex/representation.py @@ -77,21 +77,21 @@ def __init__( source_id: str, target_ns: str, target_id: str, - labels: Collection[str], + rel_type: str, data: Optional[Mapping[str, Any]] = None, ): """Initialize the relation. :param source_id: The identifier of the source node :param target_id: The identifier of the target node - :param labels: The collection of labels for the relation. + :param rel_type: The relation's type. :param data: The optional data dictionary associated with the relation. """ self.source_ns = source_ns self.source_id = source_id self.target_ns = target_ns self.target_id = target_id - self.labels = list(labels) + self.rel_type = rel_type self.data = data if data else {} def to_json(self): @@ -101,15 +101,14 @@ def to_json(self): "source_id": self.source_id, "target_ns": self.target_ns, "target_id": self.target_id, - "labels": self.labels, + "rel_type": self.rel_type, "data": self.data, } def __str__(self): # noqa:D105 data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()]) - labels_str = ":".join(self.labels) return ( - f"({self.source_ns}, {self.source_id})-[:{labels_str} {data_str}]->" + f"({self.source_ns}, {self.source_id})-[:{self.rel_type} {data_str}]->" f"({self.target_ns}, {self.target_id})" ) diff --git a/src/indra_cogex/sources/bgee/__init__.py b/src/indra_cogex/sources/bgee/__init__.py index e773490f9..79dae97ce 100644 --- a/src/indra_cogex/sources/bgee/__init__.py +++ b/src/indra_cogex/sources/bgee/__init__.py @@ -54,7 +54,7 @@ def get_relations(self): # noqa:D102 context_ns, context_id = get_context(context) for hgnc_id in hgnc_ids: yield Relation( - "HGNC", hgnc_id, context_ns, context_id, [self.rel_type], data + "HGNC", hgnc_id, context_ns, context_id, self.rel_type, data ) diff --git a/src/indra_cogex/sources/goa/__init__.py b/src/indra_cogex/sources/goa/__init__.py index a97155d88..44ca831ed 100644 --- a/src/indra_cogex/sources/goa/__init__.py +++ b/src/indra_cogex/sources/goa/__init__.py @@ -52,7 +52,7 @@ def get_relations(self): # noqa:D102 all_ecs = ",".join(sorted(set(ecs))) # Possible properties could be e.g., evidence codes data = {"evidence_codes:string": all_ecs, "source": self.name} - yield Relation("HGNC", hgnc_id, "GO", go_id, [rel_type], data) + yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data) def load_goa(url: str) -> pd.DataFrame: diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index d36dbe2a5..01ebf9a0a 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -95,13 +95,13 @@ def get_relations(self): # noqa:D102 ) in ( self.df[columns].drop_duplicates().values ): - data = {"stmt_hash:long": stmt_hash, "evidence_count:string": source_counts} + data = {"stmt_hash:long": stmt_hash, "source_counts:string": source_counts} yield Relation( source_ns, source_id, target_ns, target_id, - [stmt_type, "Statement"], + stmt_type, data, ) diff --git a/src/indra_cogex/sources/indra_ontology/__init__.py b/src/indra_cogex/sources/indra_ontology/__init__.py index 8f69957f1..b980dcbbb 100644 --- a/src/indra_cogex/sources/indra_ontology/__init__.py +++ b/src/indra_cogex/sources/indra_ontology/__init__.py @@ -43,6 +43,4 @@ def get_relations(self): # noqa:D102 target_ns, target_id = self.ontology.get_ns_id(target) data = copy.copy(data) edge_type = data.pop("type") - yield Relation( - source_ns, source_id, target_ns, target_id, [edge_type], data - ) + yield Relation(source_ns, source_id, target_ns, target_id, edge_type, data) diff --git a/src/indra_cogex/sources/pathways/__init__.py b/src/indra_cogex/sources/pathways/__init__.py index 88eab72bf..fb4e59ade 100644 --- a/src/indra_cogex/sources/pathways/__init__.py +++ b/src/indra_cogex/sources/pathways/__init__.py @@ -54,7 +54,7 @@ def get_relations(self): # noqa:D102 pathway_id, gene_ns, gene_id, - [self.relation_label], + self.relation_label, dict(source=self.name), ) diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 8af4aec80..3926a2920 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -90,7 +90,7 @@ def _dump_nodes_to_path(nodes, nodes_path, sample_path=None): node_rows = ( ( norm_id(node.db_ns, node.db_id), - "|".join(node.labels), + ";".join(node.labels), *[node.data.get(key, "") for key in metadata], ) for node in tqdm(nodes, desc="Nodes", unit_scale=True) @@ -127,7 +127,7 @@ def _dump_edges(self) -> Path: ( norm_id(rel.source_ns, rel.source_id), norm_id(rel.target_ns, rel.target_id), - "|".join(sorted(rel.labels)), + rel.rel_type, *[rel.data.get(key) for key in metadata], ) for rel in tqdm(rels, desc="Edges", unit_scale=True) From 84fa3f4c368fe87d710d41e1f85846f12499ad0d Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Fri, 14 May 2021 10:52:44 -0400 Subject: [PATCH 24/34] Fix more ID issues on import --- src/indra_cogex/sources/indra_db/__init__.py | 9 ++++++++- tests/test_indra_db.py | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 tests/test_indra_db.py diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index 01ebf9a0a..788433cea 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -110,6 +110,13 @@ def fix_id(db_ns, db_id): if db_ns == "GO": if db_id.isnumeric(): db_id = "0" * (7 - len(db_id)) + db_id - + if db_ns == "EFO" and db_id.startswith("EFO:"): + db_id = db_id[4:] + # FIXME: we need to be able to fix namespace as well, not just IDs, + # requires refactoring + # if db_ns == 'UP' and db_id.startswith('SL'): + # db_ns = 'UPLOC' + if db_ns == "UP" and "-" in db_id and not db_id.startswith("SL-"): + db_id = db_id.split("-")[0] db_id = ensure_prefix_if_needed(db_ns, db_id) return db_id diff --git a/tests/test_indra_db.py b/tests/test_indra_db.py new file mode 100644 index 000000000..0491be953 --- /dev/null +++ b/tests/test_indra_db.py @@ -0,0 +1,8 @@ +from indra_cogex.sources.indra_db import fix_id + + +def test_fix_id(): + assert fix_id("EFO", "EFO:12345") == "12345" + assert fix_id("GO", "123") == "GO:0000123" + assert fix_id("CHEBI", "123") == "CHEBI:123" + assert fix_id("UP", "P12345-6") == "P12345" From ea9286c873465a1521d86a94b803be9304e2ca5b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 14 May 2021 17:26:08 -0400 Subject: [PATCH 25/34] Update meta --- .flake8 | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.flake8 b/.flake8 index a711387bb..09307d270 100644 --- a/.flake8 +++ b/.flake8 @@ -7,7 +7,7 @@ ignore = S403 # pickle S301 # pickle W503 # line break before binary operator - E203 # conflicts with black + S101 # Don't complain about asserts exclude = .tox, .git, diff --git a/tox.ini b/tox.ini index 7273319ac..5a530b75f 100644 --- a/tox.ini +++ b/tox.ini @@ -79,7 +79,7 @@ description = Run the flake8 tool with several plugins (bandit, docstrings, impo [testenv:black] deps = black skip_install = true -commands = black src/ tests/ setup.py +commands = black src/indra_cogex tests/ setup.py description = Run the black tool [testenv:mypy] From 5571b78266f65e60e58fc0a239905934d3e6c4a8 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sat, 15 May 2021 22:20:24 -0400 Subject: [PATCH 26/34] Generalize ID fixing to name spaces --- src/indra_cogex/sources/indra_db/__init__.py | 26 +++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index 788433cea..01fd12e1e 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -7,7 +7,7 @@ import pickle from pathlib import Path from tqdm import tqdm -from typing import Union +from typing import Tuple, Union import humanize import pandas as pd @@ -42,6 +42,7 @@ def __init__(self, path: Union[None, str, Path] = None): elif isinstance(path, str): path = Path(path) with open(path, "rb") as fh: + logger.info("Loading %s" % path) df = pickle.load(fh) logger.info("Loaded %s rows from %s", humanize.intword(len(df)), path) self.df = df @@ -54,8 +55,16 @@ def __init__(self, path: Union[None, str, Path] = None): [f"ag{side}_ns", f"ag{side}_id"] ].values ] - self.df[f"ag{side}_id"] = self.df.progress_apply( - lambda row: fix_id(row[f"ag{side}_ns"], row[f"ag{side}_id"]), axis=1 + breakpoint() + self.df[f"ag{side}_ns"], self.df[f"ag{side}_id"] = list( + zip( + *[ + fix_id(db_ns, db_id) + for db_ns, db_id in tqdm( + zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"])) + ) + ] + ) ) self.df["source_counts"] = self.df["source_counts"].apply(json.dumps) @@ -106,17 +115,16 @@ def get_relations(self): # noqa:D102 ) -def fix_id(db_ns, db_id): +def fix_id(db_ns: str, db_id: str) -> Tuple[str, str]: + """Fix ID issues specific to the SIF dump.""" if db_ns == "GO": if db_id.isnumeric(): db_id = "0" * (7 - len(db_id)) + db_id if db_ns == "EFO" and db_id.startswith("EFO:"): db_id = db_id[4:] - # FIXME: we need to be able to fix namespace as well, not just IDs, - # requires refactoring - # if db_ns == 'UP' and db_id.startswith('SL'): - # db_ns = 'UPLOC' + if db_ns == "UP" and db_id.startswith("SL"): + db_ns = "UPLOC" if db_ns == "UP" and "-" in db_id and not db_id.startswith("SL-"): db_id = db_id.split("-")[0] db_id = ensure_prefix_if_needed(db_ns, db_id) - return db_id + return db_ns, db_id From ba185df80d09382e67e0fbe5a8f8a52f3a719a70 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sat, 15 May 2021 22:32:27 -0400 Subject: [PATCH 27/34] Add total and description --- src/indra_cogex/sources/indra_db/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/indra_cogex/sources/indra_db/__init__.py b/src/indra_cogex/sources/indra_db/__init__.py index 01fd12e1e..5771e0e8b 100644 --- a/src/indra_cogex/sources/indra_db/__init__.py +++ b/src/indra_cogex/sources/indra_db/__init__.py @@ -55,13 +55,14 @@ def __init__(self, path: Union[None, str, Path] = None): [f"ag{side}_ns", f"ag{side}_id"] ].values ] - breakpoint() self.df[f"ag{side}_ns"], self.df[f"ag{side}_id"] = list( zip( *[ fix_id(db_ns, db_id) for db_ns, db_id in tqdm( - zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"])) + zip(list(df[f"ag{side}_ns"]), list(df[f"ag{side}_id"])), + total=len(df), + desc="Fixing IDs", ) ] ) From 3538529117aebbfad6f42066d97440206b34948e Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sat, 15 May 2021 22:49:26 -0400 Subject: [PATCH 28/34] Fix mypy issues --- src/indra_cogex/assembly/__init__.py | 9 ++++----- src/indra_cogex/sources/processor.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py index 165a7fe8c..9d6f83306 100644 --- a/src/indra_cogex/assembly/__init__.py +++ b/src/indra_cogex/assembly/__init__.py @@ -1,12 +1,12 @@ from collections import defaultdict from typing import List, Optional -from indra_cogex.representation import Node +from indra_cogex.representation import Dict, Node class NodeAssembler: def __init__(self, nodes: Optional[List[Node]] = None): self.nodes = nodes if nodes else [] - self.conflicts = [] + self.conflicts: List[Conflict] = [] def add_nodes(self, nodes: List[Node]): self.nodes += nodes @@ -24,7 +24,7 @@ def assemble_nodes(self) -> List[Node]: def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: labels = set() - data = {} + data: Dict[str, str] = {} for node in nodes: labels |= set(node.labels) for data_key, data_val in node.data.items(): @@ -33,8 +33,7 @@ def get_aggregate_node(self, db_ns: str, db_id: str, nodes: List[Node]) -> Node: self.conflicts.append(Conflict(data_key, previous_val, data_val)) else: data[data_key] = data_val - labels = sorted(labels) - return Node(db_ns, db_id, labels, data) + return Node(db_ns, db_id, sorted(labels), data) class Conflict: diff --git a/src/indra_cogex/sources/processor.py b/src/indra_cogex/sources/processor.py index 3926a2920..1d4d57974 100644 --- a/src/indra_cogex/sources/processor.py +++ b/src/indra_cogex/sources/processor.py @@ -81,7 +81,7 @@ def _dump_nodes(self) -> Path: return self.nodes_path nodes = sorted(self.get_nodes(), key=lambda x: (x.db_ns, x.db_id)) - self._dump_nodes_to_path(nodes, self.nodes_path, sample_path) + return self._dump_nodes_to_path(nodes, self.nodes_path, sample_path) @staticmethod def _dump_nodes_to_path(nodes, nodes_path, sample_path=None): From c3802d2751348ac4b704296b01b95ee4c5fe1d68 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sat, 15 May 2021 22:51:29 -0400 Subject: [PATCH 29/34] Just test the code --- .github/workflows/tests.yml | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bb7d41c21..318e9b33e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -3,43 +3,6 @@ name: Tests on: [ push, pull_request ] jobs: - lint: - name: Lint - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [ 3.6, 3.9 ] - tox-env: [ manifest, flake8, pyroma, mypy ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox - - name: Run tox - run: tox -e ${{ matrix.tox-env }} - docs: - name: Documentation - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [ 3.6, 3.9 ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: pip install tox - - name: Check RST conformity with doc8 - run: tox -e doc8 - # - name: Check docstring coverage - # run: tox -e docstr-coverage - - name: Check documentation build with Sphinx - run: tox -e docs tests: name: Tests runs-on: ${{ matrix.os }} From 736550e2aa111abeb0a0c939573a3bcd4585ecec Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sun, 16 May 2021 16:52:56 -0400 Subject: [PATCH 30/34] Address issue with install I thought the `__init__.py` wasn't required anymore but maybe it still is --- src/indra_cogex/__init__.py | 0 tox.ini | 2 -- 2 files changed, 2 deletions(-) create mode 100644 src/indra_cogex/__init__.py diff --git a/src/indra_cogex/__init__.py b/src/indra_cogex/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tox.ini b/tox.ini index 5a530b75f..b4bcc7215 100644 --- a/tox.ini +++ b/tox.ini @@ -28,8 +28,6 @@ passenv = deps = coverage pytest -extras = - pandas whitelist_externals = /bin/cat /bin/cp From 7d876c0dae73f93738cc2b99c3004a352c32748f Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sun, 16 May 2021 16:53:07 -0400 Subject: [PATCH 31/34] Fix import --- src/indra_cogex/assembly/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/indra_cogex/assembly/__init__.py b/src/indra_cogex/assembly/__init__.py index 9d6f83306..a81b6489a 100644 --- a/src/indra_cogex/assembly/__init__.py +++ b/src/indra_cogex/assembly/__init__.py @@ -1,6 +1,7 @@ from collections import defaultdict -from typing import List, Optional -from indra_cogex.representation import Dict, Node +from typing import Dict, List, Optional + +from indra_cogex.representation import Node class NodeAssembler: From be3300ec7e92a9c8e85607a9c3c69a0bb008ee25 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sun, 16 May 2021 16:53:19 -0400 Subject: [PATCH 32/34] Add pyobo as requirement --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index a30697e58..81a072de1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ install_requires = more_click class-resolver>=0.0.9 pystow>=0.1.6 + pyobo include_package_data = True python_requires = >=3.6 From 983d576fb28e2fa355e132b9937bdf07c78b035c Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 16 May 2021 21:33:44 -0400 Subject: [PATCH 33/34] Install INDRA from github --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 81a072de1..e05db7001 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,7 @@ keywords = [options] install_requires = - indra + indra @ git+https://github.com/sorgerlab/indra.git neo4j click more_click From e7bb294570ba7fa46d6c0e524c5e50e36b429d4f Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 16 May 2021 21:54:05 -0400 Subject: [PATCH 34/34] Fix ID fixing test --- tests/test_indra_db.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_indra_db.py b/tests/test_indra_db.py index 0491be953..9172c1e45 100644 --- a/tests/test_indra_db.py +++ b/tests/test_indra_db.py @@ -2,7 +2,8 @@ def test_fix_id(): - assert fix_id("EFO", "EFO:12345") == "12345" - assert fix_id("GO", "123") == "GO:0000123" - assert fix_id("CHEBI", "123") == "CHEBI:123" - assert fix_id("UP", "P12345-6") == "P12345" + assert fix_id("EFO", "EFO:12345") == ("EFO", "12345") + assert fix_id("GO", "123") == ("GO", "GO:0000123") + assert fix_id("CHEBI", "123") == ("CHEBI", "CHEBI:123") + assert fix_id("UP", "P12345-6") == ("UP", "P12345") + assert fix_id("UP", "SL-123") == ("UPLOC", "SL-123")