
Merge branch 'main' into update-sssom-exporter
cthoyt committed Nov 24, 2024
2 parents a259b96 + 86557b0 commit 9ddb1fd
Showing 6 changed files with 103 additions and 35 deletions.
14 changes: 7 additions & 7 deletions src/pyobo/getters.py
@@ -11,6 +11,7 @@
import time
import typing
import urllib.error
import zipfile
from collections import Counter
from collections.abc import Callable, Iterable, Mapping, Sequence
from pathlib import Path
@@ -234,20 +235,15 @@ def _ensure_ontology_path(
"eol": "unable to download, same source as atol",
"hog": "unable to download",
"vhog": "unable to download",
"ccf": "unable to download",
"gorel": "unable to download",
"dinto": "unable to download",
"mo": "unable to download",
"vario": "unable to download/build",
"gainesville.core": "unable to download",
"mamo": "unable to download",
"ato": "can't process",
"emapa": "recently changed with EMAP... not sure what the difference is anymore",
"kegg.genes": "needs fix", # FIXME
"kegg.genome": "needs fix", # FIXME
"kegg.pathway": "needs fix", # FIXME
"ensemblglossary": "uri is wrong",
"biolink": "too much junk",
"epio": "content from fraunhofer is unreliable",
"epso": "content from fraunhofer is unreliable",
"gwascentral.phenotype": "website is down? or API changed?", # FIXME
@@ -352,6 +348,9 @@ def iter_helper_helper(
)
try:
yv = f(prefix, **kwargs) # type:ignore
except (UnhandledFormatError, NoBuildError) as e:
# make sure this comes before the other runtimeerror catch
logger.warning("[%s] %s", prefix, e)
except urllib.error.HTTPError as e:
logger.warning("[%s] HTTP %s: unable to download %s", prefix, e.getcode(), e.geturl())
if strict and not bioregistry.is_deprecated(prefix):
@@ -370,8 +369,6 @@
logger.warning("[drugbank] invalid credentials")
except subprocess.CalledProcessError:
logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
except (UnhandledFormatError, NoBuildError) as e:
logger.warning("[%s] %s", prefix, e)
except ValueError as e:
if _is_xml(e):
# this means that it tried doing parsing on an xml page
@@ -384,6 +381,9 @@
logger.exception(
"[%s] got exception %s while parsing", prefix, e.__class__.__name__
)
except zipfile.BadZipFile as e:
# This can happen if there's an error on UMLS
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
except TypeError as e:
logger.exception("[%s] got exception %s while parsing", prefix, e.__class__.__name__)
if strict:
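The relocation of the UnhandledFormatError/NoBuildError handler above matters because Python checks except clauses top to bottom and takes the first match; if those classes derive from a broader exception caught later (the inline comment refers to "the other runtimeerror catch"), the specific warning would never fire. A minimal sketch of the principle, using a stand-in class rather than the real pyobo exception hierarchy:

import logging

logger = logging.getLogger(__name__)

class NoBuildError(RuntimeError):
    """Stand-in for pyobo's NoBuildError; assumed here to subclass RuntimeError."""

def build(prefix: str) -> None:
    raise NoBuildError("no build instructions available")

try:
    build("example")
except NoBuildError as e:
    # the specific handler must come first, otherwise the broader
    # RuntimeError clause below would swallow it
    logger.warning("[%s] %s", "example", e)
except RuntimeError as e:
    logger.exception("[%s] got exception %s while parsing", "example", e.__class__.__name__)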
24 changes: 18 additions & 6 deletions src/pyobo/reader.py
@@ -165,7 +165,11 @@ def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> Obo:
alt_ids = list(iterate_node_alt_ids(data, strict=strict))
n_alt_ids += len(alt_ids)

parents = list(iterate_node_parents(data, node=reference, strict=strict))
parents = list(
iterate_node_parents(
data, node=reference, strict=strict, ontology_prefix=ontology_prefix
)
)
n_parents += len(parents)

synonyms = list(
@@ -377,6 +381,8 @@ def _extract_definition(

def get_first_nonescaped_quote(s: str) -> int | None:
"""Get the first non-escaped quote."""
if not s:
return None
if s[0] == '"':
# special case first position
return 0
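The new guard returns None for empty input instead of raising an IndexError on s[0]. A short usage sketch, assuming the helper can be imported from pyobo.reader as the tests further down exercise it:

from pyobo.reader import get_first_nonescaped_quote

assert get_first_nonescaped_quote("") is None  # new guard: empty string no longer raises IndexError
assert get_first_nonescaped_quote('"abc"') == 0  # quote in the first position
assert get_first_nonescaped_quote('abc "def"') == 4  # index of the first quote (none are escaped here)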
@@ -523,7 +529,7 @@ def _handle_prop(
)
if obj_reference is None:
logger.warning(
"[%s:%s] could not parse object: %s", node.curie, prop_reference.curie, value_type
"[%s - %s] could not parse object: %s", node.curie, prop_reference.curie, value_type
)
return None
# TODO can we drop datatype from this?
@@ -579,10 +585,13 @@ def iterate_node_parents(
*,
node: Reference,
strict: bool = True,
ontology_prefix: str,
) -> Iterable[Reference]:
"""Extract parents from a :mod:`obonet` node's data."""
for parent_curie in data.get("is_a", []):
reference = Reference.from_curie(parent_curie, strict=strict)
reference = Reference.from_curie(
parent_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if reference is None:
logger.warning("[%s] could not parse parent curie: %s", node.curie, parent_curie)
continue
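With ontology_prefix now a required keyword on iterate_node_parents, call sites must pass the prefix of the ontology being parsed so parent CURIEs can be resolved against it (see the updated test in tests/test_get.py below). A minimal call sketch, assuming the usual pyobo import locations for the function and for Reference:

from pyobo.reader import iterate_node_parents
from pyobo.struct import Reference

# minimal obonet-style node data; a real graph node carries much more
data = {"is_a": ["CHEBI:24060", "CHEBI:51992"]}
parents = list(
    iterate_node_parents(
        data,
        node=Reference(prefix="chebi", identifier="51990"),
        ontology_prefix="chebi",
    )
)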
@@ -612,7 +621,9 @@ def iterate_node_relationships(
if relation_curie in RELATION_REMAPPINGS:
relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
else:
relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict)
relation_prefix, relation_identifier = normalize_curie(
relation_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if relation_prefix is not None and relation_identifier is not None:
relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
else:
@@ -623,8 +634,9 @@
relation.curie,
)

# TODO replace with omni-parser from :mod:`curies`
target = Reference.from_curie(target_curie, strict=strict)
target = Reference.from_curie(
target_curie, strict=strict, ontology_prefix=ontology_prefix, node=node
)
if target is None:
logger.warning("[%s] %s could not parse target %s", node.curie, relation, target_curie)
continue
67 changes: 47 additions & 20 deletions src/pyobo/registries/metaregistry.json
@@ -27,7 +27,6 @@
"Source:",
"TermSpec:",
"FormalCharge:",
"snap:Quality",
"depicted:by",
"http:http\\://www.pacificbiosciences.com/pdf/WP_Detecting_DNA_Base_Modifications_Using_SMRT_Sequencing.pdf",
"XX:www.ensembl.org/info/genome/variation/predicted_data.html#consequences",
@@ -37,17 +36,16 @@
"DDB:pf",
"TS:0",
"CTD:curators",
"IEDB:RV",
"Tail:fat",
"Pituitary:gland",
"Compound:eye",
"Lymph:node",
"Lamina:propria",
"Follicular:fluid",
"dph:GOC",
"gOC:dph",
"gOC:dph",
"group:OBI",
"GROUP:OBI",
"ftp://ftp.ncbi.nih.gov/snp/specs/docsum_3.1.xsd",
"https://www.researchgate.net/scientific-contributions/Simon-Reeve-2162827703",
"HPO:PCS",
"HPO:ICE",
"IEDB:BP",
@@ -276,7 +274,6 @@
"BM:",
"BSA:",
"XtroDO:",
"nlx_subcell",
"OGEM:",
"ANISEED:",
"BILS:",
@@ -302,7 +299,6 @@
"LINCS_HMS",
"CCLV",
"Cosmic-CLP:",
"PubChem_Cell_line:CVCL_",
"Rockland:",
"CancerTools:",
"Innoprot:"
@@ -357,22 +353,13 @@
},
"prefix": [
"Image:",
"Category",
"PERSON",
"similar to",
"modelled on",
"SUBMITTER",
"STRUCTURE_ChemicalName_IUPAC",
"STRUCTURE_Formula",
"stedman",
"value-type:",
"binary-data-type:MS\\",
"PECO_GIT",
"OBO_SF2_PECO",
"id-validation-regexp: ",
"id-validation-regex: ",
"search-url: ",
"regexp: ",
"Germplasm:",
"IUPAC:",
"IUPHAR:GPCRListForward?",
@@ -382,18 +369,16 @@
"FBC:",
"RSC:",
"DDB:",
"http:www",
"NCBITaxon_Union",
"PhenoScape:",
"INFOODs:",
"NLCD:",
"TEMP:",
"PO_GIT:"
],
"suffix": [
".jpg",
".svg",
".png"
".png",
".pdf"
]
},
"remappings": {
@@ -408,11 +393,13 @@
"SNOMEDCT274897005": "SNOMEDCT:274897005",
"GIOC:vw": "GOC:vw",
":has_start_point": "has_start_point",
"has_start_point:": "has_start_point",
"dc-creator": "dc:creator",
"PMI:17498297": "PMID:17498297",
"HPO:SKOEHLER": "orcid:0000-0002-5316-1399",
"HPO:skoehler": "orcid:0000-0002-5316-1399",
"SIB:PG": "orcid:0000-0003-1813-6857",
"SIB:PG xsd:string": "orcid:0000-0003-1813-6857",
"UBERON:cjm": "orcid:0000-0002-6601-2165",
"part:of": "BFO:0000050",
"gro:partOf": "BFO:0000050",
@@ -431,6 +418,7 @@
"FOBI_050091": "FOBI:050091",
"has:input": "RO:0002233",
"has:output": "RO:0002234",
"dph:GOC": "orcid:0000-0001-7476-6306",
"Property:P1659": "wikidata:P1659",
"vocab:crossSpeciesExactMatch": "semapv:crossSpeciesExactMatch",
"definition:citation": "obo:efo#definition_citation",
@@ -442,6 +430,9 @@
"BAO_": "BAO:",
"TKG:TKG ": "TKG:",
"KCB:KCB ": "KCB:",
"IEDB:RV": "orcid:0000-0001-8957-7612",
"IEDB:RandiVita xsd:string": "orcid:0000-0001-8957-7612",
"IEDB:RandiVita": "orcid:0000-0001-8957-7612",
"CVCL_": "cellosaurus:CVCL_",
"cancercelllines:CVCL_": "cellosaurus:CVCL_",
"EGA:EGAS": "ega.study:EGAS",
@@ -509,6 +500,7 @@
"OGI.owl:": "ogi:",
"PANTHER:PTHR": "panther.family:PTHR",
"vo/ontorat/PR:": "PR:",
"snap#": "snap:",
"DC:0000": "diseaseclass:0000",
"TS-": "caloha:",
"terms1": "dcterms",
@@ -544,12 +536,47 @@
"enm": {
"Thesaurus:C": "NCIT:C"
},
"srao": {
"topic:": "edam.topic:"
},
"idocovid19": {"UniProtKN:": "uniprot:"},
"ito": {
"format:": "edam.format:",
"topic:": "edam.topic:",
"operation:": "edam.operation:"
},
"ehdaa2": {
"CS": "carnegie.stage:"
},
"sio": {
"ns2:": "skos:"
},
"phipo": {
"created:by": "dcterms:creator",
"created:date": "dcterms:created",
"creation:date": "dcterms:created"
},
"xlmod": {
"specificities:": "obo:xlmod#specificities",
"secondarySpecificities:": "obo:xlmod#secondarySpecificities",
"deadEndFormula:": "obo:xlmod#deadEndFormula",
"baseSpecificities:": "obo:xlmod#baseSpecificities",
"reactionSites:": "obo:xlmod#reactionSites",
"spacerLength:": "obo:xlmod#spacerLength",
"bridgeFormula:": "obo:xlmod#bridgeFormula",
"monoIsotopicMass:": "obo:xlmod#monoIsotopicMass",
"reporterMass:": "obo:xlmod#reporterMass",
"maxAbsorption:": "obo:xlmod#maxAbsorption",
"doubletDeltaMass:": "obo:xlmod#doubletDeltaMass",
"secondaryBaseSpecificities:": "obo:xlmod#secondaryBaseSpecificities",
"hydrophilicPEGchain:": "obo:xlmod#hydrophilicPEGchain",
"waveLengthRange:": "obo:xlmod#waveLengthRange",
"CID_Fragment:": "obo:xlmod#CID_Fragment"
},
"cellosaurus": {
"pgx:CVCL_": "cellosaurus:",
"PubChem_Cell_line:CVCL_": "cellosaurus:"
},
"mcro": {
"format:": "edam.format:",
"topic:": "edam.topic:",
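The per-ontology blocks added above (srao, idocovid19, ito, ehdaa2, sio, phipo, xlmod, cellosaurus) give literal-prefix rewrites that apply only while parsing that ontology, on top of the global remappings. Purely as a hypothetical illustration of how such a table could be consumed, not pyobo's actual implementation, a longest-prefix substitution looks like this:

def remap(curie: str, remappings: dict[str, str]) -> str:
    """Rewrite a raw CURIE with the longest matching literal prefix, if any."""
    for old in sorted(remappings, key=len, reverse=True):
        if curie.startswith(old):
            return remappings[old] + curie[len(old):]
    return curie

sio_remappings = {"ns2:": "skos:"}
assert remap("ns2:prefLabel", sio_remappings) == "skos:prefLabel"
assert remap("SIO:000001", sio_remappings) == "SIO:000001"  # untouched when nothing matches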
2 changes: 1 addition & 1 deletion src/pyobo/sources/msigdb.py
@@ -38,7 +38,7 @@ class MSigDBGetter(Obo):
"""An ontology representation of MMSigDB's gene set nomenclature."""

ontology = bioversions_key = PREFIX
typedefs = [has_participant]
typedefs = [has_participant, *(p for _, p in PROPERTIES)]

def iter_terms(self, force: bool = False) -> Iterable[Term]:
"""Iterate over terms in the ontology."""
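The typedefs line now splices in the TypeDef half of each pair in PROPERTIES via generator unpacking. A toy sketch of that idiom with stand-in values (the real PROPERTIES pairs in the msigdb module are assumed, not shown here):

has_participant = "has_participant"  # stand-in for the real TypeDef object
PROPERTIES = [("category", "has_category"), ("pmid", "has_citation")]  # assumed (key, TypeDef) shape

typedefs = [has_participant, *(p for _, p in PROPERTIES)]
assert typedefs == ["has_participant", "has_category", "has_citation"]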
6 changes: 5 additions & 1 deletion tests/test_get.py
@@ -203,7 +203,11 @@ def test_get_node_properties(self):
def test_get_node_parents(self):
"""Test getting parents from a node in a :mod:`obonet` graph."""
data = self.graph.nodes["CHEBI:51990"]
parents = list(iterate_node_parents(data, node=Reference(prefix="chebi", identifier="XXX")))
parents = list(
iterate_node_parents(
data, node=Reference(prefix="chebi", identifier="XXX"), ontology_prefix="chebi"
)
)
self.assertEqual(2, len(parents))
self.assertEqual({"24060", "51992"}, {parent.identifier for parent in parents})
self.assertEqual({"chebi"}, {parent.prefix for parent in parents})
25 changes: 25 additions & 0 deletions tests/test_reader.py
@@ -30,6 +30,7 @@ class TestUtils(unittest.TestCase):

def test_first_nonescaped_quote(self):
"""Test finding the first non-escaped double quote."""
self.assertIsNone(get_first_nonescaped_quote(""))
self.assertEqual(0, get_first_nonescaped_quote('"'))
self.assertEqual(0, get_first_nonescaped_quote('"abc'))
self.assertEqual(0, get_first_nonescaped_quote('"abc"'))
@@ -726,3 +727,27 @@ def test_synonym_url(self) -> None:
],
synonym.provenance,
)

def test_parent(self) -> None:
"""Test parsing out a parent."""
ontology = _read("""\
ontology: chebi
date: 20:11:2024 18:44
[Term]
id: CHEBI:1234
is_a: CHEBI:5678
""")
term = self.get_only_term(ontology)
self.assertEqual([Reference(prefix="CHEBI", identifier="5678")], term.parents)

ontology = _read("""\
ontology: chebi
date: 20:11:2024 18:44
[Term]
id: CHEBI:1234
is_a: http://purl.obolibrary.org/obo/CHEBI_5678
""")
term = self.get_only_term(ontology)
self.assertEqual([Reference(prefix="CHEBI", identifier="5678")], term.parents)
