Skip to content

Commit

Permalink
Add NLM Catalog and fix default prefix on object properties and annot…
Browse files Browse the repository at this point in the history
…ation properties (#263)
  • Loading branch information
cthoyt authored Dec 4, 2024
1 parent a84c6d4 commit 7d8b1ca
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from .mirbase_mature import MiRBaseMatureGetter
from .msigdb import MSigDBGetter
from .ncbigene import NCBIGeneGetter
from .nlm_catalog import NLMCatalogGetter
from .npass import NPASSGetter
from .omim_ps import OMIMPSGetter
from .pathbank import PathBankGetter
Expand Down Expand Up @@ -101,6 +102,7 @@
"MiRBaseGetter",
"MiRBaseMatureGetter",
"NCBIGeneGetter",
"NLMCatalogGetter",
"NPASSGetter",
"OMIMPSGetter",
"PIDGetter",
Expand Down
82 changes: 82 additions & 0 deletions src/pyobo/sources/nlm_catalog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Converter for the NLM Catalog."""

from collections.abc import Iterable
from xml.etree import ElementTree

from pyobo.struct import Obo, Reference, Term, TypeDef, default_reference
from pyobo.utils.path import ensure_df, ensure_path

__all__ = [
"NLMCatalogGetter",
]

# Prefix used for journal references, for the default references made for
# publishers, and for the typedefs declared below
PREFIX = "nlm"
# Pipe-delimited text file; get_terms() unpacks each row into three values
# (journal ID, publisher key, publisher name)
CATALOG_TO_PUBLISHER = "https://ftp.ncbi.nlm.nih.gov/pubmed/xmlprovidernames.txt"
# XML file whose root holds the ``Journal`` elements parsed by get_terms()
JOURNAL_INFO_PATH = "https://ftp.ncbi.nlm.nih.gov/pubmed/jourcache.xml"
# Properties attached to journal terms in _process_journal()
PUBLISHER = TypeDef.default(PREFIX, "has_publisher", name="has publisher")
START_YEAR = TypeDef.default(PREFIX, "has_start_year", name="has start year")
END_YEAR = TypeDef.default(PREFIX, "has_end_year", name="has end year")


# TODO enrich with context from https://ftp.ncbi.nlm.nih.gov/pubmed/J_Entrez.txt and https://ftp.ncbi.nlm.nih.gov/pubmed/J_Medline.txt


class NLMCatalogGetter(Obo):
    """An ontology representation of the NLM Catalog."""

    bioversions_key = ontology = PREFIX
    dynamic_version = True
    typedefs = [PUBLISHER, START_YEAR, END_YEAR]
    idspaces = {
        PREFIX: "https://www.ncbi.nlm.nih.gov/nlmcatalog/",
    }

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over the terms for the NLM Catalog.

        :param force: Passed through to :func:`get_terms`, which uses it
            when loading the source data.
        :yields: The terms produced by :func:`get_terms`.
        """
        # Forward ``force`` explicitly; calling ``get_terms()`` bare would
        # silently ignore the value the caller asked for.
        yield from get_terms(force=force)


def get_terms(force: bool = False) -> Iterable[Term]:
    """Get NLM Catalog terms.

    One term is yielded for each ``Journal`` element found under the root of
    the journal XML file. After that, one term is yielded for each distinct
    publisher reference, in sorted order.

    :param force: Passed to both ``ensure_path`` (journal XML) and
        ``ensure_df`` (journal-to-publisher table) so that the two source
        files are treated consistently.
    :yields: Journal terms, followed by publisher terms.
    """
    path = ensure_path(PREFIX, url=JOURNAL_INFO_PATH, force=force)
    root = ElementTree.parse(path).getroot()

    journal_to_publisher_df = ensure_df(
        PREFIX, url=CATALOG_TO_PUBLISHER, sep="|", force=force, dtype=str
    )
    # Each row is unpacked positionally as (journal ID, publisher key, publisher name)
    journal_id_to_publisher_key: dict[str, Reference] = {
        # TODO change to external prefix later
        journal_id: default_reference(PREFIX, key, name)
        for journal_id, key, name in journal_to_publisher_df.values
    }
    for element in root.findall("Journal"):
        yield _process_journal(element, journal_id_to_publisher_key)
    # De-duplicate the publisher references and emit them in sorted order
    for k in sorted(set(journal_id_to_publisher_key.values())):
        yield Term(reference=k)


def _process_journal(element, journal_id_to_publisher_key: dict[str, Reference]) -> Term:
    """Convert a single ``Journal`` XML element into a term.

    :param element: A ``Journal`` element. The children read here are
        ``NlmUniqueID``, ``Name``, ``Alias``, ``Issn``, ``StartYear``, and
        ``EndYear``.
    :param journal_id_to_publisher_key: A mapping from journal identifier to
        the reference for its publisher.
    :returns: A term for the journal with its aliases as synonyms, ISSNs as
        cross-references, and start year, end year, and publisher annotated
        when available.
    """
    nlm_id = element.findtext("NlmUniqueID")
    name = element.findtext("Name")
    # ActivityFlag is either "0" or "1"
    term = Term(
        reference=Reference(prefix=PREFIX, identifier=nlm_id, name=name),
    )
    for alias_element in element.findall("Alias"):
        # An empty element has ``text`` set to ``None``, which is not a
        # usable synonym, so it is skipped
        if alias_element.text:
            term.append_synonym(alias_element.text)
    for issn_element in element.findall("Issn"):
        # TODO include ISSN type (the ``type`` attribute), this is important
        # to determine a "canonical" one. Until it is used, the attribute is
        # not read, so an element lacking it does not raise a ``KeyError``.
        if issn_element.text:
            term.append_xref(Reference(prefix="issn", identifier=issn_element.text))
    if start_year := element.findtext("StartYear"):
        term.annotate_integer(START_YEAR, start_year)
    if end_year := element.findtext("EndYear"):
        term.annotate_integer(END_YEAR, end_year)
    if publisher_reference := journal_id_to_publisher_key.get(term.identifier):
        term.annotate_object(PUBLISHER, publisher_reference)
    return term


# When run as a script, instantiate the getter and invoke its ``cli()`` method
if __name__ == "__main__":
    NLMCatalogGetter().cli()
8 changes: 6 additions & 2 deletions src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,10 @@ def annotate_boolean(self, prop: ReferenceHint, value: bool) -> Self:
prop, str(value).lower(), Reference(prefix="xsd", identifier="boolean")
)

def annotate_integer(self, prop: ReferenceHint, value: str) -> Self:
    """Append a literal annotation whose datatype is ``xsd:integer``.

    :param prop: The property under which the value is recorded.
    :param value: The integer, given in its string form.
    :returns: The result of ``annotate_literal``.
    """
    integer_datatype = Reference(prefix="xsd", identifier="integer")
    return self.annotate_literal(prop, value, integer_datatype)

def _definition_fp(self) -> str:
definition = obo_escape_slim(self.definition) if self.definition else ""
return f'"{definition}" [{comma_separate_references(self.provenance)}]'
Expand Down Expand Up @@ -570,7 +574,7 @@ def _emit_relations(
for typedef, reference in self.iterate_relations():
_typedef_warn(prefix=ontology_prefix, predicate=typedef, typedefs=typedefs)
predicate_reference = self._reference(typedef, ontology_prefix)
s = f"relationship: {predicate_reference} {reference.preferred_curie}"
s = f"relationship: {predicate_reference} {self._reference(reference, ontology_prefix)}"
if typedef.name or reference.name:
s += " !"
if typedef.name:
Expand All @@ -592,7 +596,7 @@ def _emit_object_properties(
_typedef_warn(prefix=ontology_prefix, predicate=predicate, typedefs=typedefs)
predicate_curie = self._reference(predicate, ontology_prefix)
for value in sorted(values):
yv = f"{predicate_curie} {value.preferred_curie}"
yv = f"{predicate_curie} {self._reference(value, ontology_prefix)}"
if predicate.name and value.name:
yv += f" ! {predicate.name} {value.name}"
yield yv
Expand Down

0 comments on commit 7d8b1ca

Please sign in to comment.