
Merge branch 'main' into update-sssom-exporter
cthoyt committed Nov 24, 2024
2 parents a259b96 + 86557b0 commit 9ddb1fd
Showing 6 changed files with 103 additions and 35 deletions.
14 changes: 7 additions & 7 deletions src/pyobo/getters.py
@@ -11,6 +11,7 @@
import time
import typing
import urllib.error
import zipfile
from collections import Counter
from collections.abc import Callable, Iterable, Mapping, Sequence
from pathlib import Path
@@ -234,20 +235,15 @@ def _ensure_ontology_path(
"eol": "unable to download, same source as atol",
"hog": "unable to download",
"vhog": "unable to download",
"ccf": "unable to download",
"gorel": "unable to download",
"dinto": "unable to download",
"mo": "unable to download",
"vario": "unable to download/build",
"gainesville.core": "unable to download",
"mamo": "unable to download",
"ato": "can't process",
"emapa": "recently changed with EMAP... not sure what the difference is anymore",
"kegg.genes": "needs fix", # FIXME
"kegg.genome": "needs fix", # FIXME
"kegg.pathway": "needs fix", # FIXME
"ensemblglossary": "uri is wrong",
"biolink": "too much junk",
"epio": "content from fraunhofer is unreliable",
"epso": "content from fraunhofer is unreliable",
"gwascentral.phenotype": "website is down? or API changed?", # FIXME
@@ -352,6 +348,9 @@ def iter_helper_helper(
)
try:
yv = f(prefix, **kwargs) # type:ignore
except (UnhandledFormatError, NoBuildError) as e:
# make sure this comes before the other runtimeerror catch
logger.warning("[%s] %s", prefix, e)
except urllib.error.HTTPError as e:
logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
if strict and not bioregistry.is_deprecated(prefix):
@@ -370,8 +369,6 @@
logger.warning("[drugbank] invalid credentials")
except subprocess.CalledProcessError:
logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
except (UnhandledFormatError, NoBuildError) as e:
logger.warning("[%s] %s", prefix, e)
except ValueError as e:
if _is_xml(e):
# this means that it tried doing parsing on an xml page
@@ -384,6 +381,9 @@
logger.exception(
"[%s] got exception %s while parsing", prefix, e.__class__.__name__
)
except zipfile.BadZipFile as e:
# This can happen if there's an error on UMLS
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
except TypeError as e:
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
if strict:
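The relocation of the UnhandledFormatError/NoBuildError handler above matters because Python checks except clauses top to bottom and takes the first match; if those classes derive from a broader exception caught later (the inline comment refers to "the other runtimeerror catch"), the specific warning would never fire. A minimal sketch of the principle, using a stand-in class rather than the real pyobo exception hierarchy:

import logging

logger = logging.getLogger(__name__)

class NoBuildError(RuntimeError):
    """Stand-in for pyobo's NoBuildError; assumed here to subclass RuntimeError."""

def build(prefix: str) -> None:
    raise NoBuildError("no build instructions available")

try:
    build("example")
except NoBuildError as e:
    # the specific handler must come first, otherwise the broader
    # RuntimeError clause below would swallow it
    logger.warning("[%s] %s", "example", e)
except RuntimeError as e:
    logger.exception("[%s] got exception %s while parsing", "example", e.__class__.__name__)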
24 changes: 18 additions & 6 deletions src/pyobo/reader.py
@@ -165,7 +165,11 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo:
alt_ids = list(iterate_node_alt_ids(data, strict=strict))
n_alt_ids += len(alt_ids)

parents = list(iterate_node_parents(data, node=reference, strict=strict))
parents = list(
iterate_node_parents(
data, node=reference, strict=strict, ontology_prefix=ontology_prefix
)
)
n_parents += len(parents)

synonyms = list(
@@ -377,6 +381,8 @@ def _extract_definition(

def get_first_nonescaped_quote(s: str) -> int | None:
"""Get the first non-escaped quote."""
if not s:
return None
if s[0] == '"':
# special case first position
return 0
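The new guard returns None for empty input instead of raising an IndexError on s[0]. A short usage sketch, assuming the helper can be imported from pyobo.reader as the tests further down exercise it:

from pyobo.reader import get_first_nonescaped_quote

assert get_first_nonescaped_quote("") is None  # new guard: empty string no longer raises IndexError
assert get_first_nonescaped_quote('"abc"') == 0  # quote in the first position
assert get_first_nonescaped_quote('abc "def"') == 4  # index of the first quote (none are escaped here)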
@@ -523,7 +529,7 @@ def _handle_prop(
)
if obj_reference is None:
logger.warning(
"[%s:%s] could not parse object: %s", node.curie, prop_reference.curie, value_type
"[%s - %s] could not parse object: %s", node.curie, prop_reference.curie, value_type
)
return None
# TODO can we drop datatype from this?
@@ -579,10 +585,13 @@ def iterate_node_parents(
*,
node: Reference,
strict: bool = True,
ontology_prefix: str,
) -> Iterable[Reference]:
"""Extract parents from a :mod:`obonet` node's data."""
for parent_curie in data.get("is_a", []):
reference = Reference.from_curie(parent_curie, strict=strict)
reference = Reference.from_curie(
parent_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if reference is None:
logger.warning("[%s] could not parse parent curie: %s", node.curie, parent_curie)
continue
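With ontology_prefix now a required keyword on iterate_node_parents, call sites must pass the prefix of the ontology being parsed so parent CURIEs can be resolved against it (see the updated test in tests/test_get.py below). A minimal call sketch, assuming the usual pyobo import locations for the function and for Reference:

from pyobo.reader import iterate_node_parents
from pyobo.struct import Reference

# minimal obonet-style node data; a real graph node carries much more
data = {"is_a": ["CHEBI:24060", "CHEBI:51992"]}
parents = list(
    iterate_node_parents(
        data,
        node=Reference(prefix="chebi", identifier="51990"),
        ontology_prefix="chebi",
    )
)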
@@ -612,7 +621,9 @@ def iterate_node_relationships(
if relation_curie in RELATION_REMAPPINGS:
relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
else:
relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict)
relation_prefix, relation_identifier = normalize_curie(
relation_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if relation_prefix is not None and relation_identifier is not None:
relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
else:
@@ -623,8 +634,9 @@
relation.curie,
)

# TODO replace with omni-parser from :mod:`curies`
target = Reference.from_curie(target_curie, strict=strict)
target = Reference.from_curie(
target_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if target is None:
logger.warning("[%s] %s could not parse target %s", node.curie, relation, target_curie)
continue
67 changes: 47 additions & 20 deletions src/pyobo/registries/metaregistry.json
@@ -27,7 +27,6 @@
"Source:",
"TermSpec:",
"FormalCharge:",
"snap:Quality",
"depicted:by",
"http:http\\://www.pacificbiosciences.com/pdf/WP_Detecting_DNA_Base_Modifications_Using_SMRT_Sequencing.pdf",
"XX:www.ensembl.org/info/genome/variation/predicted_data.html#consequences",
@@ -37,17 +36,16 @@
"DDB:pf",
"TS:0",
"CTD:curators",
"IEDB:RV",
"Tail:fat",
"Pituitary:gland",
"Compound:eye",
"Lymph:node",
"Lamina:propria",
"Follicular:fluid",
"dph:GOC",
"gOC:dph",
"gOC:dph",
"group:OBI",
"GROUP:OBI",
"ftp://ftp.ncbi.nih.gov/snp/specs/docsum_3.1.xsd",
"https://www.researchgate.net/scientific-contributions/Simon-Reeve-2162827703",
"HPO:PCS",
"HPO:ICE",
"IEDB:BP",
@@ -276,7 +274,6 @@
"BM:",
"BSA:",
"XtroDO:",
"nlx_subcell",
"OGEM:",
"ANISEED:",
"BILS:",
@@ -302,7 +299,6 @@
"LINCS_HMS",
"CCLV",
"Cosmic-CLP:",
"PubChem_Cell_line:CVCL_",
"Rockland:",
"CancerTools:",
"Innoprot:"
@@ -357,22 +353,13 @@
},
"prefix": [
"Image:",
"Category",
"PERSON",
"similar to",
"modelled on",
"SUBMITTER",
"STRUCTURE_ChemicalName_IUPAC",
"STRUCTURE_Formula",
"stedman",
"value-type:",
"binary-data-type:MS\\",
"PECO_GIT",
"OBO_SF2_PECO",
"id-validation-regexp: ",
"id-validation-regex: ",
"search-url: ",
"regexp: ",
"Germplasm:",
"IUPAC:",
"IUPHAR:GPCRListForward?",
@@ -382,18 +369,16 @@
"FBC:",
"RSC:",
"DDB:",
"http:www",
"NCBITaxon_Union",
"PhenoScape:",
"INFOODs:",
"NLCD:",
"TEMP:",
"PO_GIT:"
],
"suffix": [
".jpg",
".svg",
".png"
".png",
".pdf"
]
},
"remappings": {
@@ -408,11 +393,13 @@
"SNOMEDCT274897005": "SNOMEDCT:274897005",
"GIOC:vw": "GOC:vw",
":has_start_point": "has_start_point",
"has_start_point:": "has_start_point",
"dc-creator": "dc:creator",
"PMI:17498297": "PMID:17498297",
"HPO:SKOEHLER": "orcid:0000-0002-5316-1399",
"HPO:skoehler": "orcid:0000-0002-5316-1399",
"SIB:PG": "orcid:0000-0003-1813-6857",
"SIB:PG xsd:string": "orcid:0000-0003-1813-6857",
"UBERON:cjm": "orcid:0000-0002-6601-2165",
"part:of": "BFO:0000050",
"gro:partOf": "BFO:0000050",
@@ -431,6 +418,7 @@
"FOBI_050091": "FOBI:050091",
"has:input": "RO:0002233",
"has:output": "RO:0002234",
"dph:GOC": "orcid:0000-0001-7476-6306",
"Property:P1659": "wikidata:P1659",
"vocab:crossSpeciesExactMatch": "semapv:crossSpeciesExactMatch",
"definition:citation": "obo:efo#definition_citation",
@@ -442,6 +430,9 @@
"BAO_": "BAO:",
"TKG:TKG ": "TKG:",
"KCB:KCB ": "KCB:",
"IEDB:RV": "orcid:0000-0001-8957-7612",
"IEDB:RandiVita xsd:string": "orcid:0000-0001-8957-7612",
"IEDB:RandiVita": "orcid:0000-0001-8957-7612",
"CVCL_": "cellosaurus:CVCL_",
"cancercelllines:CVCL_": "cellosaurus:CVCL_",
"EGA:EGAS": "ega.study:EGAS",
@@ -509,6 +500,7 @@
"OGI.owl:": "ogi:",
"PANTHER:PTHR": "panther.family:PTHR",
"vo/ontorat/PR:": "PR:",
"snap#": "snap:",
"DC:0000": "diseaseclass:0000",
"TS-": "caloha:",
"terms1": "dcterms",
@@ -544,12 +536,47 @@
"enm": {
"Thesaurus:C": "NCIT:C"
},
"srao": {
"topic:": "edam.topic:"
},
"idocovid19": {"UniProtKN:": "uniprot:"},
"ito": {
"format:": "edam.format:",
"topic:": "edam.topic:",
"operation:": "edam.operation:"
},
"ehdaa2": {
"CS": "carnegie.stage:"
},
"sio": {
"ns2:": "skos:"
},
"phipo": {
"created:by": "dcterms:creator",
"created:date": "dcterms:created",
"creation:date": "dcterms:created"
},
"xlmod": {
"specificities:": "obo:xlmod#specificities",
"secondarySpecificities:": "obo:xlmod#secondarySpecificities",
"deadEndFormula:": "obo:xlmod#deadEndFormula",
"baseSpecificities:": "obo:xlmod#baseSpecificities",
"reactionSites:": "obo:xlmod#reactionSites",
"spacerLength:": "obo:xlmod#spacerLength",
"bridgeFormula:": "obo:xlmod#bridgeFormula",
"monoIsotopicMass:": "obo:xlmod#monoIsotopicMass",
"reporterMass:": "obo:xlmod#reporterMass",
"maxAbsorption:": "obo:xlmod#maxAbsorption",
"doubletDeltaMass:": "obo:xlmod#doubletDeltaMass",
"secondaryBaseSpecificities:": "obo:xlmod#secondaryBaseSpecificities",
"hydrophilicPEGchain:": "obo:xlmod#hydrophilicPEGchain",
"waveLengthRange:": "obo:xlmod#waveLengthRange",
"CID_Fragment:": "obo:xlmod#CID_Fragment"
},
"cellosaurus": {
"pgx:CVCL_": "cellosaurus:",
"PubChem_Cell_line:CVCL_": "cellosaurus:"
},
"mcro": {
"format:": "edam.format:",
"topic:": "edam.topic:",
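The per-ontology blocks added above (srao, idocovid19, ito, ehdaa2, sio, phipo, xlmod, cellosaurus) give literal-prefix rewrites that apply only while parsing that ontology, on top of the global remappings. Purely as a hypothetical illustration of how such a table could be consumed, not pyobo's actual implementation, a longest-prefix substitution looks like this:

def remap(curie: str, remappings: dict[str, str]) -> str:
    """Rewrite a raw CURIE with the longest matching literal prefix, if any."""
    for old in sorted(remappings, key=len, reverse=True):
        if curie.startswith(old):
            return remappings[old] + curie[len(old):]
    return curie

sio_remappings = {"ns2:": "skos:"}
assert remap("ns2:prefLabel", sio_remappings) == "skos:prefLabel"
assert remap("SIO:000001", sio_remappings) == "SIO:000001"  # untouched when nothing matches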
2 changes: 1 addition & 1 deletion src/pyobo/sources/msigdb.py
@@ -38,7 +38,7 @@ class MSigDBGetter(Obo):
"""An ontology representation of MMSigDB's gene set nomenclature."""

ontology = bioversions_key = PREFIX
typedefs = [has_participant]
typedefs = [has_participant, *(p for _, p in PROPERTIES)]

def iter_terms(self, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in the ontology."""
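The typedefs line now splices in the TypeDef half of each pair in PROPERTIES via generator unpacking. A toy sketch of that idiom with stand-in values (the real PROPERTIES pairs in the msigdb module are assumed, not shown here):

has_participant = "has_participant"  # stand-in for the real TypeDef object
PROPERTIES = [("category", "has_category"), ("pmid", "has_citation")]  # assumed (key, TypeDef) shape

typedefs = [has_participant, *(p for _, p in PROPERTIES)]
assert typedefs == ["has_participant", "has_category", "has_citation"]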
6 changes: 5 additions & 1 deletion tests/test_get.py
@@ -203,7 +203,11 @@ def test_get_node_properties(self):
def test_get_node_parents(self):
"""Test getting parents from a node in a :mod:`obonet` graph."""
data = self.graph.nodes["CHEBI:51990"]
parents = list(iterate_node_parents(data, node=Reference(prefix="chebi", identifier="XXX")))
parents = list(
iterate_node_parents(
data, node=Reference(prefix="chebi", identifier="XXX"), ontology_prefix="chebi"
)
)
self.assertEqual(2, len(parents))
self.assertEqual({"24060", "51992"}, {parent.identifier for parent in parents})
self.assertEqual({"chebi"}, {parent.prefix for parent in parents})
25 changes: 25 additions & 0 deletions tests/test_reader.py
@@ -30,6 +30,7 @@ class TestUtils(unittest.TestCase):

def test_first_nonescaped_quote(self):
"""Test finding the first non-escaped double quote."""
self.assertIsNone(get_first_nonescaped_quote(""))
self.assertEqual(0, get_first_nonescaped_quote('"'))
self.assertEqual(0, get_first_nonescaped_quote('"abc'))
self.assertEqual(0, get_first_nonescaped_quote('"abc"'))
@@ -726,3 +727,27 @@ def test_synonym_url(self) -> None:
],
synonym.provenance,
)

def test_parent(self) -> None:
"""Test parsing out a parent."""
ontology = _read("""\
ontology: chebi
date: 20:11:2024 18:44
[Term]
id: CHEBI:1234
is_a: CHEBI:5678
""")
term = self.get_only_term(ontology)
self.assertEqual([Reference(prefix="CHEBI", identifier="5678")], term.parents)

ontology = _read("""\
ontology: chebi
date: 20:11:2024 18:44
[Term]
id: CHEBI:1234
is_a: http://purl.obolibrary.org/obo/CHEBI_5678
""")
term = self.get_only_term(ontology)
self.assertEqual([Reference(prefix="CHEBI", identifier="5678")], term.parents)
