Skip to content

Commit

Permalink
Make standardize function generally reusable
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored and bgyori committed Oct 27, 2021
1 parent ef5eefa commit 0fd3329
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 20 deletions.
16 changes: 15 additions & 1 deletion src/indra_cogex/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

"""Representations for nodes and relations to upload to Neo4j."""

from typing import Any, Collection, Mapping, Optional
from typing import Any, Collection, Mapping, Optional, Tuple

__all__ = ["Node", "Relation"]

from indra.databases import identifiers
from indra.ontology.standardize import get_standard_name, standardize_db_refs
from indra.statements.agent import get_grounding


class Node:
Expand Down Expand Up @@ -118,6 +120,18 @@ def __repr__(self): # noqa:D105
return str(self)


def standardize(
    prefix: str, identifier: str, name: Optional[str] = None
) -> Tuple[str, str, Optional[str]]:
    """Get a standardized prefix, identifier, and name, if possible.

    Parameters
    ----------
    prefix :
        The namespace of the entity. It is used as the key of the
        single-entry ``db_refs`` dictionary given to
        ``standardize_db_refs``.
    identifier :
        The identifier of the entity, used as the value in that dictionary.
    name :
        An optional fallback name. It is returned when no grounding is
        found, or when ``get_standard_name`` gives back a falsy value.

    Returns
    -------
    :
        A ``(prefix, identifier, name)`` triple. If ``get_grounding``
        yields ``None`` for either the namespace or the identifier, the
        inputs are returned unchanged. Otherwise the grounded namespace and
        identifier are returned along with the standard name (or the
        given ``name`` as a fallback). The name element is ``None`` when
        no name was given and no standard name was found.
    """
    db_refs = standardize_db_refs({prefix: identifier})
    db_ns, db_id = get_grounding(db_refs)
    if db_ns is None or db_id is None:
        # No usable grounding: hand back exactly what the caller passed in.
        return prefix, identifier, name
    # Prefer the standard name; keep the caller's name if none is available.
    name = get_standard_name(db_refs) or name
    return db_ns, db_id, name


def norm_id(db_ns, db_id):
identifiers_ns = identifiers.get_identifiers_ns(db_ns)
identifiers_id = db_id
Expand Down
23 changes: 4 additions & 19 deletions src/indra_cogex/sources/chembl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@
"""Processor for ChEMBL."""

import logging
from typing import Iterable, Optional, Tuple
from typing import Iterable, Optional

import bioversions
import chembl_downloader
from tqdm import tqdm

from indra.ontology.standardize import (
get_grounding,
get_standard_name,
standardize_db_refs,
)
from indra_cogex.representation import Node, Relation
from indra_cogex.representation import Node, Relation, standardize
from indra_cogex.sources.processor import Processor

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -53,7 +48,7 @@ def __init__(self, version: Optional[str] = None):
for chembl_id, chembl_name in tqdm(
chemical_df.values, unit_scale=True, desc="caching chemicals"
):
db_ns, db_id, name = self._standardize("CHEMBL", chembl_id, chembl_name)
db_ns, db_id, name = standardize("CHEMBL", chembl_id, chembl_name)
self.chemicals[chembl_id] = Node(
db_ns,
db_id,
Expand All @@ -65,7 +60,7 @@ def __init__(self, version: Optional[str] = None):
for mesh_id in tqdm(
self.df.mesh_id.unique(), unit_scale=True, desc="caching indications"
):
db_ns, db_id, name = self._standardize("MESH", mesh_id)
db_ns, db_id, name = standardize("MESH", mesh_id)
if name is None:
tqdm.write(f"no name found for MESH:{mesh_id}")
self.indications[mesh_id] = Node(
Expand All @@ -75,16 +70,6 @@ def __init__(self, version: Optional[str] = None):
dict(name=name),
)

def _standardize(
    self, prefix: str, identifier: str, name: Optional[str] = None
) -> Tuple[str, str, Optional[str]]:
    """Get a standardized prefix, identifier, and name, if possible.

    Parameters
    ----------
    prefix :
        The namespace of the entity. It is used as the key of the
        single-entry ``db_refs`` dictionary given to
        ``standardize_db_refs``.
    identifier :
        The identifier of the entity, used as the value in that dictionary.
    name :
        An optional fallback name. It is returned when no grounding is
        found, or when ``get_standard_name`` gives back a falsy value.

    Returns
    -------
    :
        A ``(prefix, identifier, name)`` triple. If ``get_grounding``
        yields ``None`` for either the namespace or the identifier, the
        inputs are returned unchanged. The name element is ``None`` when
        no name was given and no standard name was found.
    """
    db_refs = standardize_db_refs({prefix: identifier})
    db_ns, db_id = get_grounding(db_refs)
    if db_ns is None or db_id is None:
        # No usable grounding: hand back exactly what the caller passed in.
        return prefix, identifier, name
    # Prefer the standard name; keep the caller's name if none is available.
    name = get_standard_name(db_refs) or name
    return db_ns, db_id, name

def get_nodes(self) -> Iterable[Node]:
"""Iterate over ChEMBL chemicals and indications"""
yield from self.chemicals.values()
Expand Down

0 comments on commit 0fd3329

Please sign in to comment.