From d3c975219c545271b2aeb36585b0df00b5815f96 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 25 Nov 2024 11:01:49 +0100 Subject: [PATCH] Simplify node iteration --- src/pyobo/reader.py | 42 ++++++++++------------------------- src/pyobo/struct/reference.py | 3 +++ tests/test_get.py | 25 ++++++++++++++++----- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/pyobo/reader.py b/src/pyobo/reader.py index 7beb687e..ddfe10f3 100644 --- a/src/pyobo/reader.py +++ b/src/pyobo/reader.py @@ -113,22 +113,6 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo: f"[{ontology_prefix}] will not accept slash in data version: {data_version}" ) - #: Parsed CURIEs to references (even external ones) - reference_it = ( - Reference( - prefix=prefix, - identifier=bioregistry.standardize_identifier(prefix, identifier), - # if name isn't available, it means its external to this ontology - name=data.get("name"), - ) - for prefix, identifier, data in _iter_obo_graph( - graph=graph, strict=strict, ontology_prefix=ontology_prefix - ) - ) - references: Mapping[ReferenceTuple, Reference] = { - reference.pair: reference for reference in reference_it - } - #: CURIEs to typedefs typedefs: Mapping[ReferenceTuple, TypeDef] = { typedef.pair: typedef @@ -145,18 +129,16 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo: missing_typedefs: set[ReferenceTuple] = set() terms = [] n_alt_ids, n_parents, n_synonyms, n_relations, n_properties, n_xrefs = 0, 0, 0, 0, 0, 0 - for prefix, identifier, data in _iter_obo_graph( + n_references = 0 + for reference, data in _iter_obo_graph( graph=graph, strict=strict, ontology_prefix=ontology_prefix ): - if prefix != ontology_prefix or not data: + if reference.prefix != ontology_prefix or not data: continue - - identifier = bioregistry.standardize_identifier(prefix, identifier) - reference = references[ReferenceTuple(ontology_prefix, identifier)] + n_references += 1 node_xrefs = list( iterate_node_xrefs( - prefix=prefix, data=data, strict=strict, ontology_prefix=ontology_prefix, @@ -245,7 +227,7 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo: terms.append(term) logger.info( - f"[{ontology_prefix}] got {len(references):,} references, {len(typedefs):,} typedefs, {len(terms):,} terms," + f"[{ontology_prefix}] got {n_references:,} references, {len(typedefs):,} typedefs, {len(terms):,} terms," f" {n_alt_ids:,} alt ids, {n_parents:,} parents, {n_synonyms:,} synonyms, {n_xrefs:,} xrefs," f" {n_relations:,} relations, and {n_properties:,} properties", ) @@ -282,13 +264,14 @@ def _iter_obo_graph( *, strict: bool = True, ontology_prefix: str | None = None, -) -> Iterable[tuple[str, str, Mapping[str, Any]]]: +) -> Iterable[tuple[Reference, Mapping[str, Any]]]: """Iterate over the nodes in the graph with the prefix stripped (if it's there).""" for node, data in graph.nodes(data=True): - prefix, identifier = normalize_curie(node, strict=strict, ontology_prefix=ontology_prefix) - if prefix is None or identifier is None: - continue - yield prefix, identifier, data + node = Reference.from_curie_or_uri( + node, strict=strict, ontology_prefix=ontology_prefix, name=data.get("name") + ) + if node: + yield node, data def _get_date(graph, ontology_prefix: str) -> datetime | None: @@ -698,7 +681,6 @@ def iterate_node_relationships( def iterate_node_xrefs( *, - prefix: str, data: Mapping[str, Any], strict: bool = True, ontology_prefix: str | None, @@ -717,7 +699,7 @@ def iterate_node_xrefs( if split_space: _xref_split = xref.split(" ", 1) if _xref_split[1][0] not in {'"', "("}: - logger.debug("[%s] Problem with space in xref %s", prefix, xref) + logger.debug("[%s] Problem with space in xref %s", node.curie, xref) continue xref = _xref_split[0] diff --git a/src/pyobo/struct/reference.py b/src/pyobo/struct/reference.py index d10db650..75bf9a96 100644 --- a/src/pyobo/struct/reference.py +++ b/src/pyobo/struct/reference.py @@ -95,6 +95,9 @@ def from_curie_or_uri( ) if prefix is None or identifier is None: return None + + identifier = bioregistry.standardize_identifier(prefix, identifier) + if name is None and auto: from ..api import get_name diff --git a/tests/test_get.py b/tests/test_get.py index b5127ece..2a49eeee 100644 --- a/tests/test_get.py +++ b/tests/test_get.py @@ -85,7 +85,7 @@ def test_extract_definition(self): ]: with self.subTest(s=s): actual_text, actual_references = _extract_definition( - s, node=Reference(prefix="chebi", identifier="XXX") + s, node=Reference(prefix="chebi", identifier="XXX"), ontology_prefix="chebi" ) self.assertEqual(expected_text, actual_text) self.assertEqual(expected_references, actual_references) @@ -95,7 +95,10 @@ def test_extract_definition_with_escapes(self): expected_text = """The canonical 3' splice site has the sequence "AG".""" s = """"The canonical 3' splice site has the sequence \\"AG\\"." [PMID:1234]""" actual_text, actual_references = _extract_definition( - s, strict=True, node=Reference(prefix="chebi", identifier="XXX") + s, + strict=True, + node=Reference(prefix="chebi", identifier="XXX"), + ontology_prefix="chebi", ) self.assertEqual(expected_text, actual_text) self.assertEqual([Reference(prefix="pubmed", identifier="1234")], actual_references) @@ -159,7 +162,10 @@ def test_extract_synonym(self): ]: with self.subTest(s=text): actual_synonym = _extract_synonym( - text, synoynym_typedefs, node=Reference(prefix="chebi", identifier="XXX") + text, + synoynym_typedefs, + node=Reference(prefix="chebi", identifier="XXX"), + ontology_prefix="chebi", ) self.assertIsInstance(actual_synonym, Synonym) self.assertEqual(expected_synonym, actual_synonym) @@ -175,7 +181,10 @@ def test_get_node_synonyms(self): data = self.graph.nodes["CHEBI:51990"] synonyms = list( iterate_node_synonyms( - data, synoynym_typedefs, node=Reference(prefix="chebi", identifier="XXX") + data, + synoynym_typedefs, + node=Reference(prefix="chebi", identifier="XXX"), + ontology_prefix="chebi", ) ) self.assertEqual(1, len(synonyms)) @@ -215,7 +224,13 @@ def test_get_node_parents(self): def test_get_node_xrefs(self): """Test getting parents from a node in a :mod:`obonet` graph.""" data = self.graph.nodes["CHEBI:51990"] - xrefs = list(iterate_node_xrefs(prefix="chebi", data=data)) + xrefs = list( + iterate_node_xrefs( + data=data, + ontology_prefix="chebi", + node=Reference(prefix="chebi", identifier="51990"), + ) + ) self.assertEqual(7, len(xrefs)) # NOTE the prefixes are remapped by Bioregistry self.assertEqual({"pubmed", "cas", "reaxys"}, {xref.prefix for xref in xrefs})