Cleanup definitions part 1

biopragmatics · Nov 30, 2023 · b1158d5
1 parent b73e986
commit b1158d5
Show file tree

Hide file tree

Showing 14 changed files with 54 additions and 27 deletions.
diff --git a/src/pyobo/sources/cgnc.py b/src/pyobo/sources/cgnc.py
@@ -8,6 +8,7 @@
 import pandas as pd
 
 from pyobo.struct import Obo, Reference, Term, from_species
+from pyobo.struct.typedef import exact_match
 from pyobo.utils.path import ensure_df
 
 __all__ = [
@@ -25,7 +26,7 @@ class CGNCGetter(Obo):
 
     ontology = PREFIX
     dynamic_version = True
-    typedefs = [from_species]
+    typedefs = [from_species, exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

diff --git a/src/pyobo/sources/chembl.py b/src/pyobo/sources/chembl.py
@@ -12,7 +12,7 @@
 import chembl_downloader
 
 from pyobo.struct import Obo, Reference, Term
-from pyobo.struct.typedef import has_inchi, has_smiles
+from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
 
 __all__ = [
     "ChEMBLCompoundGetter",
@@ -45,6 +45,7 @@ class ChEMBLCompoundGetter(Obo):
 
     ontology = "chembl.compound"
     bioversions_key = "chembl"
+    typedefs = [exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

diff --git a/src/pyobo/sources/depmap.py b/src/pyobo/sources/depmap.py
@@ -8,6 +8,7 @@
 import pystow
 
 from pyobo import Obo, Reference, Term
+from pyobo.struct.typedef import exact_match
 
 __all__ = [
     "get_obo",
@@ -23,6 +24,7 @@ class DepMapGetter(Obo):
 
     ontology = bioversions_key = PREFIX
     data_version = VERSION
+    typedefs = [exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

diff --git a/src/pyobo/sources/drugcentral.py b/src/pyobo/sources/drugcentral.py
@@ -12,7 +12,7 @@
 from tqdm.auto import tqdm
 
 from pyobo.struct import Obo, Reference, Synonym, Term
-from pyobo.struct.typedef import has_inchi, has_smiles
+from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
 
 __all__ = [
     "DrugCentralGetter",
@@ -34,6 +34,7 @@ class DrugCentralGetter(Obo):
     """An ontology representation of the DrugCentral database."""
 
     ontology = bioversions_key = PREFIX
+    typedefs = [exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

diff --git a/src/pyobo/sources/hgnc.py b/src/pyobo/sources/hgnc.py
@@ -27,6 +27,7 @@
     orthologous,
     transcribes_to,
 )
+from pyobo.struct.typedef import exact_match
 from pyobo.utils.path import ensure_path, prefix_directory_join
 
 __all__ = [
@@ -212,6 +213,7 @@ class HGNCGetter(Obo):
         transcribes_to,
         orthologous,
         member_of,
+        exact_match,
     ]
     idspaces = IDSPACES
     synonym_typedefs = [

diff --git a/src/pyobo/sources/mgi.py b/src/pyobo/sources/mgi.py
@@ -9,6 +9,8 @@
 import pandas as pd
 from tqdm.auto import tqdm
 
+from pyobo.struct.typedef import exact_match
+
 from ..struct import (
     Obo,
     Reference,
@@ -37,7 +39,7 @@ class MGIGetter(Obo):
 
     ontology = PREFIX
     dynamic_version = True
-    typedefs = [from_species, has_gene_product, transcribes_to]
+    typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

diff --git a/src/pyobo/sources/mirbase_family.py b/src/pyobo/sources/mirbase_family.py
@@ -40,7 +40,9 @@ def get_obo(force: bool = False) -> Obo:
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Get miRBase family terms."""
     df = get_df(version, force=force)
-    for family_id, name, mirna_id, mirna_name in tqdm(df.values, total=len(df.index)):
+    for family_id, name, mirna_id, mirna_name in tqdm(
+        df.values, total=len(df.index), unit_scale=True, desc="miRBase Family"
+    ):
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=family_id, name=name),
         )
@@ -65,4 +67,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
 
 
 if __name__ == "__main__":
-    get_obo().write_default(use_tqdm=True)
+    get_obo().write_default(use_tqdm=True, write_obo=True, force=True)
diff --git a/src/pyobo/sources/npass.py b/src/pyobo/sources/npass.py
@@ -77,7 +77,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
                 logger.debug("multiple cids for %s: %s", identifier, pubchem_compound_ids)
             for pubchem_compound_id in pubchem_compound_ids:
                 term.append_xref(
-                    Reference(prefix="pubchem.compound", identifier=pubchem_compound_id)
+                    Reference(prefix="pubchem.compound", identifier=pubchem_compound_id.strip())
                 )
 
         for synonym in [iupac]:

diff --git a/src/pyobo/sources/ror.py b/src/pyobo/sources/ror.py
@@ -8,7 +8,8 @@
 import zenodo_client
 from tqdm.auto import tqdm
 
-from pyobo.struct import Obo, Reference, SynonymTypeDef, Term, TypeDef
+from pyobo.struct import Obo, Reference, Term, TypeDef
+from pyobo.struct.struct import acronym
 
 PREFIX = "ror"
 ROR_ZENODO_RECORD_ID = "10086202"
@@ -21,8 +22,6 @@
 SUCCESSOR = Reference(prefix="BFO", identifier="0000063")
 PREDECESSOR = Reference(prefix="BFO", identifier="0000062")
 
-ACRONYM = SynonymTypeDef(reference=Reference(prefix="omo", identifier="0003000", name="acronym"))
-
 RMAP = {
     "Related": TypeDef.from_triple("rdfs", "seeAlso"),
     "Child": TypeDef(HAS_PART),
@@ -45,7 +44,7 @@ class RORGetter(Obo):
 
     ontology = bioregistry_key = PREFIX
     typedefs = list(RMAP.values())
-    synonym_typedefs = [ACRONYM]
+    synonym_typedefs = [acronym]
     idspaces = {
         "ror": "https://ror.org/",
         "geonames": "https://www.geonames.org/",
@@ -110,8 +109,8 @@ def iterate_ror_terms(*, force: bool = False) -> Iterable[Term]:
             if synonym.startswith("The "):
                 term.append_synonym(synonym.removeprefix("The "))
 
-        for acronym in record.get("acronyms", []):
-            term.append_synonym(acronym, type=ACRONYM)
+        for acronym_synonym in record.get("acronyms", []):
+            term.append_synonym(acronym_synonym, type=acronym)
 
         for prefix, xref_data in record.get("external_ids", {}).items():
             if prefix == "OrgRef":

diff --git a/src/pyobo/sources/sgd.py b/src/pyobo/sources/sgd.py
@@ -5,7 +5,7 @@
 from typing import Iterable
 from urllib.parse import unquote_plus
 
-from ..struct import Obo, Reference, Synonym, SynonymTypeDef, Term, from_species
+from ..struct import Obo, Reference, Synonym, Term, from_species
 from ..utils.path import ensure_tar_df
 
 __all__ = [
@@ -21,15 +21,12 @@
 )
 INNER_PATH = "S288C_reference_genome_R64-2-1_20150113/saccharomyces_cerevisiae_R64-2-1_20150113.gff"
 
-alias_type = SynonymTypeDef.from_text("alias")
-
 
 class SGDGetter(Obo):
     """An ontology representation of SGD's yeast gene nomenclature."""
 
     bioversions_key = ontology = PREFIX
     typedefs = [from_species]
-    synonym_typedefs = [alias_type]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms for SGD."""
@@ -68,7 +65,7 @@ def get_terms(ontology: Obo, force: bool = False) -> Iterable[Term]:
         aliases = d.get("Alias")
         if aliases:
             for alias in aliases.split(","):
-                synonyms.append(Synonym(name=unquote_plus(alias), type=alias_type))
+                synonyms.append(Synonym(name=unquote_plus(alias)))
 
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=identifier, name=name),

diff --git a/src/pyobo/sources/slm.py b/src/pyobo/sources/slm.py
@@ -7,8 +7,9 @@
 import pandas as pd
 from tqdm.auto import tqdm
 
-from pyobo import Obo, Reference, SynonymTypeDef, Term
-from pyobo.struct.typedef import has_inchi, has_smiles
+from pyobo import Obo, Reference, Term
+from pyobo.struct.struct import abbreviation as abbreviation_typedef
+from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
 from pyobo.utils.path import ensure_df
 
 __all__ = [
@@ -38,14 +39,13 @@
     "PMID",
 ]
 
-abreviation_type = SynonymTypeDef.from_text("abbreviation")
-
 
 class SLMGetter(Obo):
     """An ontology representation of SwissLipid's lipid nomenclature."""
 
     ontology = bioversions_key = PREFIX
-    synonym_typedefs = [abreviation_type]
+    typedefs = [exact_match]
+    synonym_typedefs = [abbreviation_typedef]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
@@ -94,7 +94,7 @@ def iter_terms(version: str, force: bool = False):
         if pd.notna(level):
             term.append_property("level", level)
         if pd.notna(abbreviation):
-            term.append_synonym(abbreviation, type=abreviation_type)
+            term.append_synonym(abbreviation, type=abbreviation_typedef)
         if pd.notna(synonyms):
             for synonym in synonyms.split("|"):
                 term.append_synonym(synonym.strip())

diff --git a/src/pyobo/sources/zfin.py b/src/pyobo/sources/zfin.py
@@ -16,6 +16,7 @@
     has_gene_product,
     orthologous,
 )
+from pyobo.struct.typedef import exact_match
 from pyobo.utils.io import multidict, multisetdict
 from pyobo.utils.path import ensure_df
 
@@ -40,7 +41,7 @@ class ZFINGetter(Obo):
     """An ontology representation of ZFIN's zebrafish database."""
 
     bioversions_key = ontology = PREFIX
-    typedefs = [from_species, has_gene_product, orthologous]
+    typedefs = [from_species, has_gene_product, orthologous, exact_match]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in ZFIN."""

diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py
@@ -158,6 +158,11 @@ def from_text(
 DEFAULT_SYNONYM_TYPE = SynonymTypeDef(
     reference=Reference(prefix="oboInOwl", identifier="SynonymType", name="Synonym"),
 )
+abbreviation = SynonymTypeDef(
+    reference=Reference(prefix="OMO", identifier="0003000", name="abbreviation")
+)
+acronym = SynonymTypeDef(reference=Reference(prefix="omo", identifier="0003012", name="acronym"))
+
 
 ReferenceHint = Union[Reference, "Term", Tuple[str, str], str]
 
@@ -425,8 +430,13 @@ def iterate_obo_lines(self, *, ontology, typedefs) -> Iterable[str]:
             yield f"{parent_tag}: {parent}"
 
         for typedef, references in sorted(self.relationships.items(), key=_sort_relations):
-            if typedef not in typedefs:
-                logger.warning(f"[{ontology}] typedef not defined in OBO: {typedef}")
+            if (not typedefs or typedef not in typedefs) and (
+                ontology,
+                typedef.curie,
+            ) not in _TYPEDEF_WARNINGS:
+                logger.warning(f"[{ontology}] typedef not defined in OBO: {typedef.curie}")
+                _TYPEDEF_WARNINGS.add((ontology, typedef.curie))
+
             typedef_preferred_curie = typedef.preferred_curie
             for reference in sorted(references, key=attrgetter("prefix", "identifier")):
                 s = f"relationship: {typedef_preferred_curie} {reference.preferred_curie}"
@@ -450,6 +460,10 @@ def _escape(s) -> str:
         return s.replace("\n", "\\n").replace('"', '\\"')
 
 
+#: (ontology, typedef CURIE) pairs already warned about, so each missing typedef is logged only once
+_TYPEDEF_WARNINGS: Set[Tuple[str, str]] = set()
+
+
 def _sort_relations(r):
     typedef, _references = r
     return typedef.reference.name or typedef.reference.identifier
@@ -665,6 +679,7 @@ def iterate_obo_lines(self) -> Iterable[str]:
             yield f'property_value: http://purl.org/dc/terms/license "{license_spdx_id}" xsd:string'
         description = bioregistry.get_description(self.ontology)
         if description:
+            description = obo_escape_slim(description.strip())
             yield f'property_value: http://purl.org/dc/elements/1.1/description "{description}" xsd:string'
 
         for root_term in self.root_terms or []:

diff --git a/src/pyobo/struct/typedef.py b/src/pyobo/struct/typedef.py
@@ -40,6 +40,9 @@
     "enables",
     "participates_in",
     "has_participant",
+    "exact_match",
+    "has_dbxref",
+    # Properties
     "has_inchi",
     "has_smiles",
 ]
@@ -266,6 +269,7 @@ def get_reference_tuple(relation: RelationHint) -> Tuple[str, str]:
 has_ontology_root_term = TypeDef.from_triple(
     prefix=IAO_PREFIX, identifier="0000700", name="has ontology root term"
 )
+has_dbxref = TypeDef.from_curie("oboInOwl:hasDbXref", name="has database cross-reference")
 
 editor_note = Reference(prefix=IAO_PREFIX, identifier="0000116", name="editor note")