Skip to content

Commit

Permalink
Merge pull request #24 from bgyori/inputs
Browse files Browse the repository at this point in the history
Implement node assembly and standardize identifiers
  • Loading branch information
bgyori authored May 17, 2021
2 parents e9a4e1e + e7bb294 commit 477df79
Show file tree
Hide file tree
Showing 18 changed files with 379 additions and 150 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ignore =
S403 # pickle
S301 # pickle
W503 # line break before binary operator
E203 # conflicts with black
S101 # Don't complain about asserts
exclude =
.tox,
.git,
Expand Down
37 changes: 0 additions & 37 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,6 @@ name: Tests
on: [ push, pull_request ]

jobs:
lint:
name: Lint
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.6, 3.9 ]
tox-env: [ manifest, flake8, pyroma, mypy ]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: pip install tox
- name: Run tox
run: tox -e ${{ matrix.tox-env }}
docs:
name: Documentation
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ 3.6, 3.9 ]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: pip install tox
- name: Check RST conformity with doc8
run: tox -e doc8
# - name: Check docstring coverage
# run: tox -e docstr-coverage
- name: Check documentation build with Sphinx
run: tox -e docs
tests:
name: Tests
runs-on: ${{ matrix.os }}
Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,13 @@ keywords =

[options]
install_requires =
indra
indra @ git+https://github.com/sorgerlab/indra.git
neo4j
click
more_click
class-resolver>=0.0.9
pystow>=0.1.6
pyobo

include_package_data = True
python_requires = >=3.6
Expand Down
Empty file added src/indra_cogex/__init__.py
Empty file.
50 changes: 50 additions & 0 deletions src/indra_cogex/assembly/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from collections import defaultdict
from typing import Dict, List, Optional

from indra_cogex.representation import Node


class NodeAssembler:
    """Assemble and deduplicate nodes that share a (db_ns, db_id) identifier.

    Nodes added to the assembler are grouped by their namespace/identifier
    pair and merged into a single aggregate node each. Conflicting data
    values encountered during merging are recorded in ``self.conflicts``
    rather than silently overwritten.
    """

    def __init__(self, nodes: Optional[List["Node"]] = None):
        """Initialize the assembler.

        Parameters
        ----------
        nodes :
            An optional initial list of nodes.
        """
        # Copy the incoming list: add_nodes extends self.nodes in place,
        # and storing the caller's list directly would mutate it too.
        self.nodes = list(nodes) if nodes else []
        self.conflicts: List["Conflict"] = []

    def add_nodes(self, nodes: List["Node"]):
        """Append a list of nodes to the assembler's node collection."""
        self.nodes += nodes

    def assemble_nodes(self) -> List["Node"]:
        """Merge all added nodes into one aggregate node per identifier.

        Returns
        -------
        :
            A list containing a single aggregate node for each distinct
            (db_ns, db_id) pair seen among the added nodes.
        """
        nodes_by_id = defaultdict(list)
        for node in self.nodes:
            nodes_by_id[(node.db_ns, node.db_id)].append(node)

        assembled_nodes = [
            self.get_aggregate_node(db_ns, db_id, node_group)
            for (db_ns, db_id), node_group in nodes_by_id.items()
        ]
        return assembled_nodes

    def get_aggregate_node(
        self, db_ns: str, db_id: str, nodes: List["Node"]
    ) -> "Node":
        """Merge a group of nodes sharing one identifier into a single node.

        Labels are unioned across the group. Data entries are merged with
        the earlier (truthy) value kept; a later differing value is recorded
        as a Conflict in ``self.conflicts`` instead of replacing it.

        Parameters
        ----------
        db_ns :
            The shared namespace of the node group.
        db_id :
            The shared identifier of the node group.
        nodes :
            The nodes to merge.

        Returns
        -------
        :
            A new node carrying the merged labels and data.
        """
        labels = set()
        data: Dict[str, str] = {}
        for node in nodes:
            labels |= set(node.labels)
            for data_key, data_val in node.data.items():
                previous_val = data.get(data_key)
                if previous_val and previous_val != data_val:
                    self.conflicts.append(Conflict(data_key, previous_val, data_val))
                else:
                    data[data_key] = data_val
        return Node(db_ns, db_id, sorted(labels), data)


class Conflict:
    """Record of two differing values observed for the same data key."""

    def __init__(self, key, val1, val2):
        """Store the conflicting key together with both observed values."""
        self.key = key
        self.val1 = val1
        self.val2 = val2

    def __str__(self):
        # Render as Conflict(key, val1, val2) with plain str conversion.
        parts = ", ".join(str(part) for part in (self.key, self.val1, self.val2))
        return f"Conflict({parts})"

    def __repr__(self):
        return str(self)
47 changes: 34 additions & 13 deletions src/indra_cogex/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,41 @@ class Node:

def __init__(
self,
identifier: str,
db_ns: str,
db_id: str,
labels: Collection[str],
data: Optional[Mapping[str, Any]] = None,
):
"""Initialize the node.
:param identifier: The identifier of the node
:param labels: The collection of labels for the relation.
:param data: The optional data dictionary associated with the node.
Parameters
----------
db_ns :
The namespace associated with the node. Uses the INDRA standard.
db_id :
The identifier within the namespace associated with the node.
Uses the INDRA standard.
labels :
A collection of labels for the node.
data :
An optional data dictionary associated with the node.
"""
self.identifier = identifier
if not db_ns or not db_id:
raise ValueError("Invalid namespace or ID.")
self.db_ns = db_ns
self.db_id = db_id
self.labels = labels
self.data = data if data else {}

def to_json(self):
"""Serialize the node to JSON."""
data = {k: v for k, v in self.data.items()}
data["id"] = self.identifier
data["db_ns"] = self.db_ns
data["db_id"] = self.db_id
return {"labels": self.labels, "data": data}

def _get_data_str(self):
pieces = ["id:'%s'" % self.identifier]
pieces = ["id:'%s:%s'" % (self.db_ns, self.db_id)]
for k, v in self.data.items():
if isinstance(v, str):
value = "'" + v.replace("'", "\\'") + "'"
Expand All @@ -60,36 +73,44 @@ class Relation:

def __init__(
self,
source_ns: str,
source_id: str,
target_ns: str,
target_id: str,
labels: Collection[str],
rel_type: str,
data: Optional[Mapping[str, Any]] = None,
):
"""Initialize the relation.
:param source_id: The identifier of the source node
:param target_id: The identifier of the target node
:param labels: The collection of labels for the relation.
:param rel_type: The relation's type.
:param data: The optional data dictionary associated with the relation.
"""
self.source_ns = source_ns
self.source_id = source_id
self.target_ns = target_ns
self.target_id = target_id
self.labels = list(labels)
self.rel_type = rel_type
self.data = data if data else {}

def to_json(self):
"""Serialize the relation to JSON."""
return {
"source_ns": self.source_ns,
"source_id": self.source_id,
"target_ns": self.target_ns,
"target_id": self.target_id,
"labels": self.labels,
"rel_type": self.rel_type,
"data": self.data,
}

def __str__(self): # noqa:D105
data_str = ", ".join(["%s:'%s'" % (k, v) for k, v in self.data.items()])
labels_str = ":".join(self.labels)
return f"({self.source_id})-[:{labels_str} {data_str}]->" f"({self.target_id})"
return (
f"({self.source_ns}, {self.source_id})-[:{self.rel_type} {data_str}]->"
f"({self.target_ns}, {self.target_id})"
)

def __repr__(self): # noqa:D105
return str(self)
24 changes: 20 additions & 4 deletions src/indra_cogex/sources/bgee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,36 @@ def __init__(self, path: Union[None, str, Path] = None):
self.expressions = pickle.load(fh)

def get_nodes(self): # noqa:D102
for context_id in self.expressions:
for context in self.expressions:
context_ns, context_id = get_context(context)
yield Node(
context_ns,
context_id,
["BioEntity"],
data={"name": pyobo.get_name_by_curie(context_id)},
)
for hgnc_id in set.union(*[set(v) for v in self.expressions.values()]):
yield Node(
f"HGNC:{hgnc_id}",
"HGNC",
hgnc_id,
["BioEntity"],
data={"name": pyobo.get_name("hgnc", hgnc_id)},
)

def get_relations(self): # noqa:D102
for context_id, hgnc_ids in self.expressions.items():
data = {"source": self.name}
for context, hgnc_ids in self.expressions.items():
context_ns, context_id = get_context(context)
for hgnc_id in hgnc_ids:
yield Relation(f"HGNC:{hgnc_id}", context_id, [self.rel_type])
yield Relation(
"HGNC", hgnc_id, context_ns, context_id, self.rel_type, data
)


def get_context(context):
    """Split a context CURIE into a standardized namespace and identifier.

    Parameters
    ----------
    context :
        A CURIE string of the form ``"<namespace>:<identifier>"``. Only the
        first colon is treated as the separator, so identifiers may
        themselves contain colons.

    Returns
    -------
    :
        A tuple of the namespace and the identifier. For the UBERON and CL
        namespaces the identifier keeps its namespace prefix (e.g.
        ``"UBERON:0002048"``); for all other namespaces the bare identifier
        is returned.
    """
    context_ns, context_id = context.split(":", maxsplit=1)
    # UBERON and CL identifiers are kept as prefixed CURIEs — presumably
    # the INDRA identifier convention for these namespaces; the duplicated
    # per-namespace branches are collapsed into one membership test.
    if context_ns in ("UBERON", "CL"):
        context_id = f"{context_ns}:{context_id}"
    return context_ns, context_id
25 changes: 21 additions & 4 deletions src/indra_cogex/sources/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
from textwrap import dedent

import click
import pystow
from more_click import verbose_option

from . import processor_resolver
from .processor import Processor
from ..assembly import NodeAssembler


@click.command()
Expand All @@ -32,6 +35,7 @@
def main(load: bool, load_only: bool, force: bool):
"""Generate and import Neo4j nodes and edges tables."""
paths = []
na = NodeAssembler()
for processor_cls in processor_resolver:
if not processor_cls.importable:
continue
Expand All @@ -44,21 +48,34 @@ def main(load: bool, load_only: bool, force: bool):
):
click.secho("Processing...", fg="green")
processor = processor_cls()
# FIXME: this is redundant, we get nodes twice
na.add_nodes(list(processor.get_nodes()))
processor.dump()
paths.append((processor_cls.nodes_path, processor_cls.edges_path))

# FIXME: This doesn't work unless the processors are also running and
# getting nodes
nodes_path = pystow.module("indra", "cogex", "assembled").join(name="nodes.tsv.gz")
if not load_only:
if force or not nodes_path.is_file():
# Now create and dump the assembled nodes
assembled_nodes = na.assemble_nodes()
assembled_nodes = sorted(assembled_nodes, key=lambda x: (x.db_ns, x.db_id))
Processor._dump_nodes_to_path(assembled_nodes, nodes_path)

if load or load_only:
command = dedent(
"""\
f"""\
neo4j-admin import \\
--database=indra \\
--delimiter='TAB' \\
--skip-duplicate-nodes=true \\
--skip-bad-relationships=true
--skip-bad-relationships=true \\
--nodes {nodes_path}
"""
).rstrip()
for node_path, edge_path in paths:
command += f"\\\n --nodes {node_path} \\\n --relationships {edge_path}"
for _, edge_path in paths:
command += f"\\\n --relationships {edge_path}"

click.secho("Running shell command:")
click.secho(command, fg="blue")
Expand Down
11 changes: 4 additions & 7 deletions src/indra_cogex/sources/goa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,17 @@ def __init__(self):

def get_nodes(self): # noqa:D102
for go_node in self.df["GO_ID"].unique():
yield Node(go_node, ["BioEntity"])
yield Node("GO", go_node, ["BioEntity"])
for hgnc_id in self.df["HGNC_ID"].unique():
yield Node(f"HGNC:{hgnc_id}", ["BioEntity"])
yield Node("HGNC", hgnc_id, ["BioEntity"])

def get_relations(self): # noqa:D102
rel_type = "associated_with"
for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]:
all_ecs = ",".join(sorted(set(ecs)))
source = f"HGNC:{hgnc_id}"
# Note that we don't add the extra GO: by current convention
target = go_id
# Possible properties could be e.g., evidence codes
data = {"evidence_codes:string": all_ecs}
yield Relation(source, target, [rel_type], data)
data = {"evidence_codes:string": all_ecs, "source": self.name}
yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data)


def load_goa(url: str) -> pd.DataFrame:
Expand Down
Loading

0 comments on commit 477df79

Please sign in to comment.