Make standardize function generally reusable

gyorilab · Oct 27, 2021 · 0fd3329 · 0fd3329
1 parent ef5eefa
commit 0fd3329
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 20 deletions.
diff --git a/src/indra_cogex/representation.py b/src/indra_cogex/representation.py
@@ -2,11 +2,13 @@
 
 """Representations for nodes and relations to upload to Neo4j."""
 
-from typing import Any, Collection, Mapping, Optional
+from typing import Any, Collection, Mapping, Optional, Tuple
 
 __all__ = ["Node", "Relation"]
 
 from indra.databases import identifiers
+from indra.ontology.standardize import get_standard_name, standardize_db_refs
+from indra.statements.agent import get_grounding
 
 
 class Node:
@@ -118,6 +120,18 @@ def __repr__(self):  # noqa:D105
         return str(self)
 
 
+def standardize(
+    prefix: str, identifier: str, name: Optional[str] = None
+) -> Tuple[str, str, str]:  # NOTE(review): third element can be None (name defaults to None and is returned as-is below)
+    """Get a standardized prefix, identifier, and name, if possible."""
+    db_refs = standardize_db_refs({prefix: identifier})  # pass the input as a one-entry {prefix: identifier} mapping
+    db_ns, db_id = get_grounding(db_refs)  # result is unpacked as a (namespace, identifier) pair
+    if db_ns is None or db_id is None:  # either part missing: fall back to the caller's inputs unchanged
+        return prefix, identifier, name
+    name = get_standard_name(db_refs) or name  # keep the caller-supplied name when the lookup result is falsy
+    return db_ns, db_id, name
+
+
 def norm_id(db_ns, db_id):
     identifiers_ns = identifiers.get_identifiers_ns(db_ns)
     identifiers_id = db_id

diff --git a/src/indra_cogex/sources/chembl/__init__.py b/src/indra_cogex/sources/chembl/__init__.py
@@ -3,18 +3,13 @@
 """Processor for ChEMBL."""
 
 import logging
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, Optional
 
 import bioversions
 import chembl_downloader
 from tqdm import tqdm
 
-from indra.ontology.standardize import (
-    get_grounding,
-    get_standard_name,
-    standardize_db_refs,
-)
-from indra_cogex.representation import Node, Relation
+from indra_cogex.representation import Node, Relation, standardize
 from indra_cogex.sources.processor import Processor
 
 logger = logging.getLogger(__name__)
@@ -53,7 +48,7 @@ def __init__(self, version: Optional[str] = None):
         for chembl_id, chembl_name in tqdm(
             chemical_df.values, unit_scale=True, desc="caching chemicals"
         ):
-            db_ns, db_id, name = self._standardize("CHEMBL", chembl_id, chembl_name)
+            db_ns, db_id, name = standardize("CHEMBL", chembl_id, chembl_name)
             self.chemicals[chembl_id] = Node(
                 db_ns,
                 db_id,
@@ -65,7 +60,7 @@ def __init__(self, version: Optional[str] = None):
         for mesh_id in tqdm(
             self.df.mesh_id.unique(), unit_scale=True, desc="caching indications"
         ):
-            db_ns, db_id, name = self._standardize("MESH", mesh_id)
+            db_ns, db_id, name = standardize("MESH", mesh_id)
             if name is None:
                 tqdm.write(f"no name found for MESH:{mesh_id}")
             self.indications[mesh_id] = Node(
@@ -75,16 +70,6 @@ def __init__(self, version: Optional[str] = None):
                 dict(name=name),
             )
 
-    def _standardize(
-        self, prefix: str, identifier: str, name: Optional[str] = None
-    ) -> Tuple[str, str, str]:
-        db_refs = standardize_db_refs({prefix: identifier})
-        db_ns, db_id = get_grounding(db_refs)
-        if db_ns is None or db_id is None:
-            return prefix, identifier, name
-        name = get_standard_name(db_refs) or name
-        return db_ns, db_id, name
-
     def get_nodes(self) -> Iterable[Node]:
         """Iterate over ChEMBL chemicals and indications"""
         yield from self.chemicals.values()