Skip to content

Commit

Permalink
Further consolidate CURIE/URI parsing (#257)
Browse files Browse the repository at this point in the history
- Remove custom parsing logic from relation and property parsing and
replace it with Reference.from_curie_or_uri
- Add bioontologies URI upgrading into Reference.from_curie_or_uri
- Reuse bioregistry's identifier standardization code where possible
  • Loading branch information
cthoyt authored Dec 3, 2024
1 parent f392ffd commit d2af10a
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 26 deletions.
8 changes: 6 additions & 2 deletions src/pyobo/identifier_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from functools import wraps
from typing import ClassVar

import bioontologies.upgrade
import bioregistry
from curies import Reference, ReferenceTuple

Expand Down Expand Up @@ -74,7 +75,7 @@ def normalize_curie(
strict: bool = True,
ontology_prefix: str | None = None,
node: Reference | None = None,
) -> tuple[str, str] | tuple[None, None]:
) -> ReferenceTuple | tuple[None, None]:
"""Parse a string that looks like a CURIE.
:param curie: A compact uniform resource identifier (CURIE)
Expand All @@ -98,6 +99,9 @@ def normalize_curie(
if curie_has_blacklisted_suffix(curie):
return None, None

if reference_t := bioontologies.upgrade.upgrade(curie):
return reference_t

if curie.startswith("http:") or curie.startswith("https:"):
if reference := parse_iri(curie):
return reference.pair
Expand All @@ -120,7 +124,7 @@ def normalize_curie(

norm_node_prefix = bioregistry.normalize_prefix(prefix)
if norm_node_prefix:
return norm_node_prefix, identifier
return ReferenceTuple(norm_node_prefix, identifier)
elif strict:
raise MissingPrefixError(curie=curie, ontology_prefix=ontology_prefix, node=node)
else:
Expand Down
32 changes: 10 additions & 22 deletions src/pyobo/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
from pathlib import Path
from typing import Any

import bioontologies.upgrade
import bioregistry
import networkx as nx
from curies import ReferenceTuple
from more_itertools import pairwise
from tqdm.auto import tqdm

from .constants import DATE_FORMAT, PROVENANCE_PREFIXES
from .identifier_utils import normalize_curie
from .registries import curie_has_blacklisted_prefix, curie_is_blacklisted, remap_prefix
from .struct import (
Obo,
Expand All @@ -42,8 +40,6 @@

logger = logging.getLogger(__name__)

RELATION_REMAPPINGS: Mapping[str, ReferenceTuple] = bioontologies.upgrade.load()


def from_obo_path(
path: str | Path,
Expand Down Expand Up @@ -363,7 +359,9 @@ def iterate_graph_synonym_typedefs(
if reference is not None:
yield SynonymTypeDef(reference=reference)
elif strict:
raise ValueError(f"Could not parse {sid}")
raise ValueError(
f"[{ontology_prefix}] could not parse synonym type definition: {sid}"
)
else:
continue

Expand Down Expand Up @@ -657,15 +655,7 @@ def _get_prop(
if prop.startswith(sw):
identifier = prop.removeprefix(sw)
return default_reference(ontology_prefix, identifier)
if prop.startswith("http"):
# TODO upstream this into an omni-parser for references?
_pref, _id = bioregistry.parse_iri(prop)
if _pref and _id:
return Reference(prefix=_pref, identifier=_id)
else:
logger.warning("[%s] unable to handle property: %s", node.curie, prop)
return None
elif ":" not in prop:
if ":" not in prop:
return default_reference(ontology_prefix, prop)
else:
return Reference.from_curie_or_uri(
Expand Down Expand Up @@ -713,23 +703,21 @@ def iterate_node_relationships(
"""Extract relationships from a :mod:`obonet` node's data."""
for s in data.get("relationship", []):
relation_curie, target_curie = s.split(" ")
relation_prefix: str | None
relation_identifier: str | None
if relation_curie in RELATION_REMAPPINGS:
relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
else:
relation_prefix, relation_identifier = normalize_curie(

if ":" in relation_curie:
relation = Reference.from_curie_or_uri(
relation_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if relation_prefix is not None and relation_identifier is not None:
relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
else:
relation = default_reference(ontology_prefix, relation_curie)
logger.debug(
"unhandled relation: %s. Parsing as default relation: %s",
relation_curie,
relation.curie,
)
if relation is None:
logger.warning("[%s] could not parse relation %s", node.curie, relation_curie)
continue

target = Reference.from_curie_or_uri(
target_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
Expand Down
21 changes: 19 additions & 2 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from obonet import read_obo

from pyobo import Obo, Reference, Term
from pyobo.identifier_utils import UnparsableIRIError
from pyobo.reader import from_obonet, get_first_nonescaped_quote
from pyobo.struct import default_reference
from pyobo.struct.struct import DEFAULT_SYNONYM_TYPE
Expand Down Expand Up @@ -416,13 +417,29 @@ def test_property_unparsable_object(self) -> None:

def test_property_literal_url_unregistered(self) -> None:
"""Test using a full OBO PURL as the property."""
ontology = _read("""\
with self.assertRaises(UnparsableIRIError):
_read(
"""\
ontology: chebi
[Term]
id: CHEBI:1234
property_value: https://example.com/nope/nope CHEBI:5678
""",
strict=True,
)

ontology = _read(
"""\
ontology: chebi
[Term]
id: CHEBI:1234
property_value: https://example.com/nope/nope CHEBI:5678
""")
""",
strict=False,
)

term = self.get_only_term(ontology)
self.assertEqual(0, len(list(term.annotations_literal)))
self.assertEqual(0, len(list(term.annotations_object)))
Expand Down

0 comments on commit d2af10a

Please sign in to comment.