Merge branch 'main' into consistent-caching

biopragmatics · Dec 4, 2024 · 37299b8 · 37299b8
2 parents 2257327 + 7d8b1ca
commit 37299b8
Show file tree

Hide file tree

Showing 16 changed files with 1,383 additions and 695 deletions.
diff --git a/src/pyobo/getters.py b/src/pyobo/getters.py
@@ -34,6 +34,7 @@
 )
 from .identifier_utils import ParseError, wrap_norm_prefix
 from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
+from .reader import from_obo_path
 from .struct import Obo
 from .utils.io import get_writer
 from .utils.path import ensure_path, prefix_directory_join
@@ -69,6 +70,7 @@ def get_ontology(
     strict: bool = True,
     version: str | None = None,
     robot_check: bool = True,
+    upgrade: bool = True,
 ) -> Obo:
     """Get the OBO for a given graph.
 
@@ -80,6 +82,9 @@ def get_ontology(
     :param robot_check:
         If set to false, will send the ``--check=false`` command to ROBOT to disregard
         malformed ontology components. Necessary to load some ontologies like VO.
+    :param upgrade:
+        If set to true, will automatically upgrade relationships, such as
+        ``obo:chebi#has_part`` to ``BFO:0000051``.
     :returns: An OBO object
 
     :raises OnlyOWLError: If the OBO foundry only has an OWL document for this resource.
@@ -131,18 +136,7 @@ def get_ontology(
     else:
         raise UnhandledFormatError(f"[{prefix}] unhandled ontology file format: {path.suffix}")
 
-    from .reader import from_obo_path
-
-    obo = from_obo_path(path, prefix=prefix, strict=strict)
-    if version is not None:
-        if obo.data_version is None:
-            logger.warning("[%s] did not have a version, overriding with %s", obo.ontology, version)
-            obo.data_version = version
-        elif obo.data_version != version:
-            logger.warning(
-                "[%s] had version %s, overriding with %s", obo.ontology, obo.data_version, version
-            )
-            obo.data_version = version
+    obo = from_obo_path(path, prefix=prefix, strict=strict, version=version, upgrade=upgrade)
     obo.write_default(force=force_process)
     return obo
 

diff --git a/src/pyobo/identifier_utils.py b/src/pyobo/identifier_utils.py
@@ -6,6 +6,7 @@
 from functools import wraps
 from typing import ClassVar
 
+import bioontologies.upgrade
 import bioregistry
 from curies import Reference, ReferenceTuple
 
@@ -74,7 +75,8 @@ def normalize_curie(
     strict: bool = True,
     ontology_prefix: str | None = None,
     node: Reference | None = None,
-) -> tuple[str, str] | tuple[None, None]:
+    upgrade: bool = True,
+) -> ReferenceTuple | tuple[None, None]:
     """Parse a string that looks like a CURIE.
 
     :param curie: A compact uniform resource identifier (CURIE)
@@ -85,11 +87,12 @@ def normalize_curie(
     - Normalizes the namespace
     - Checks against a blacklist for the entire curie, for the namespace, and for suffixes.
     """
-    # Remap the curie with the full list
-    curie = remap_full(curie)
+    if upgrade:
+        # Remap the curie with the full list
+        curie = remap_full(curie)
 
-    # Remap node's prefix (if necessary)
-    curie = remap_prefix(curie, ontology_prefix=ontology_prefix)
+        # Remap node's prefix (if necessary)
+        curie = remap_prefix(curie, ontology_prefix=ontology_prefix)
 
     if curie_is_blacklisted(curie):
         return None, None
@@ -98,6 +101,9 @@ def normalize_curie(
     if curie_has_blacklisted_suffix(curie):
         return None, None
 
+    if upgrade and (reference_t := bioontologies.upgrade.upgrade(curie)):
+        return reference_t
+
     if curie.startswith("http:") or curie.startswith("https:"):
         if reference := parse_iri(curie):
             return reference.pair
@@ -114,13 +120,10 @@ def normalize_curie(
             logger.debug(f"could not split CURIE on colon: {curie}")
         return None, None
 
-    # remove redundant prefix
-    if identifier.casefold().startswith(f"{prefix.casefold()}:"):
-        identifier = identifier[len(prefix) + 1 :]
-
     norm_node_prefix = bioregistry.normalize_prefix(prefix)
     if norm_node_prefix:
-        return norm_node_prefix, identifier
+        identifier = bioregistry.standardize_identifier(norm_node_prefix, identifier)
+        return ReferenceTuple(norm_node_prefix, identifier)
     elif strict:
         raise MissingPrefixError(curie=curie, ontology_prefix=ontology_prefix, node=node)
     else:

diff --git a/src/pyobo/normalizer.py b/src/pyobo/normalizer.py
@@ -82,20 +82,17 @@ def _iterate_synonyms_to_identifiers(
         id_to_synonyms: Mapping[str, Iterable[str]],
         remove_prefix: str | None = None,
     ) -> Iterable[tuple[str, str]]:
-        if remove_prefix is not None:
-            remove_prefix = f'{remove_prefix.lower().rstrip(":")}:'
-
         # Add name
         for identifier, name in id_to_name.items():
-            if remove_prefix and identifier.lower().startswith(remove_prefix):
-                identifier = identifier[len(remove_prefix) :]
+            if remove_prefix:
+                identifier = bioregistry.standardize_identifier(remove_prefix, identifier)
 
             yield name, identifier
 
         # Add synonyms
         for identifier, synonyms in id_to_synonyms.items():
-            if remove_prefix and identifier.lower().startswith(remove_prefix):
-                identifier = identifier[len(remove_prefix) :]
+            if remove_prefix:
+                identifier = bioregistry.standardize_identifier(remove_prefix, identifier)
 
             for synonym in synonyms:
                 # it might overwrite but this is probably always due to alternate ids