From d2a8a972a565540b1d19e34f47e9c11ff5c8cd98 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 23 Nov 2024 13:20:36 +0100 Subject: [PATCH 1/2] Update msigdb.py --- src/pyobo/sources/msigdb.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/pyobo/sources/msigdb.py b/src/pyobo/sources/msigdb.py index b6cdba7f..17416347 100644 --- a/src/pyobo/sources/msigdb.py +++ b/src/pyobo/sources/msigdb.py @@ -3,10 +3,10 @@ import logging from collections.abc import Iterable -from lxml.etree import ElementTree +from pystow.utils import read_zipfile_xml from tqdm.auto import tqdm -from ..struct import Obo, Reference, Term, has_participant +from ..struct import Obo, Reference, Term, TypeDef, has_participant from ..utils.path import ensure_path logger = logging.getLogger(__name__) @@ -18,6 +18,20 @@ PREFIX = "msigdb" BASE_URL = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release" +CATEGORY_CODE = TypeDef.default(PREFIX, "category_code", name="category code") +SUB_CATEGORY_CODE = TypeDef.default(PREFIX, "sub_category_code", name="sub-category code") +CONTRIBUTOR = TypeDef.default(PREFIX, "contributor", name="contributor") +EXACT_SOURCE = TypeDef.default(PREFIX, "exact_source", name="exact source") +EXTERNAL_DETAILS_URL = TypeDef.default(PREFIX, "external_details_url", name="external details URL") + +PROPERTIES = [ + ("CATEGORY_CODE", CATEGORY_CODE), + ("SUB_CATEGORY_CODE", SUB_CATEGORY_CODE), + ("CONTRIBUTOR", CONTRIBUTOR), + ("EXACT_SOURCE", EXACT_SOURCE), + ("EXTERNAL_DETAILS_URL", EXTERNAL_DETAILS_URL), +] + class MSigDBGetter(Obo): """An ontology representation of MMSigDB's gene set nomenclature.""" @@ -50,9 +64,10 @@ def get_obo(force: bool = False) -> Obo: def iter_terms(version: str, force: bool = False) -> Iterable[Term]: """Get MSigDb terms.""" - xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml" + xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml.zip" path = ensure_path(prefix=PREFIX, url=xml_url, version=version, force=force) - tree = ElementTree.parse(path) + inner_path = f"msigdb_v{version}.Hs.xml" + tree = read_zipfile_xml(path, inner_path=inner_path) for entry in tqdm(tree.getroot(), desc=f"{PREFIX} v{version}", unit_scale=True): attrib = dict(entry.attrib) @@ -79,16 +94,9 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]: provenance=[] if reference is None else [reference], is_obsolete=is_obsolete, ) - for key in [ - "CATEGORY_CODE", - "SUB_CATEGORY_CODE", - "CONTRIBUTOR", - "EXACT_SOURCE", - "EXTERNAL_DETAILS_URL", - ]: - value = attrib[key].strip() - if value: - term.annotate_literal(key.lower(), value) + for key, typedef in PROPERTIES: + if value := attrib[key].strip(): + term.annotate_literal(typedef, value) term.set_species(tax_id) From f392d7ed56c5070c3c4b535abd77f195028ea375 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sat, 23 Nov 2024 14:48:21 +0100 Subject: [PATCH 2/2] Update msigdb.py --- src/pyobo/sources/msigdb.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/src/pyobo/sources/msigdb.py b/src/pyobo/sources/msigdb.py index 17416347..8678d8b3 100644 --- a/src/pyobo/sources/msigdb.py +++ b/src/pyobo/sources/msigdb.py @@ -1,9 +1,10 @@ """Parsers for MSig.""" import logging +import zipfile from collections.abc import Iterable -from pystow.utils import read_zipfile_xml +from lxml import etree from tqdm.auto import tqdm from ..struct import Obo, Reference, Term, TypeDef, has_participant @@ -62,14 +63,33 @@ def get_obo(force: bool = False) -> Obo: KEGG_URL_PREFIX = "http://www.genome.jp/kegg/pathway/hsa/" -def iter_terms(version: str, force: bool = False) -> Iterable[Term]: - """Get MSigDb terms.""" +def _iter_entries(version: str, force: bool = False): xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml.zip" path = ensure_path(prefix=PREFIX, url=xml_url, version=version, force=force) - inner_path = f"msigdb_v{version}.Hs.xml" - tree = read_zipfile_xml(path, inner_path=inner_path) + with zipfile.ZipFile(path, "r") as zf: + with zf.open(f"msigdb_v{version}.Hs.xml") as file: + for _ in range(3): + next(file) + # from here on out, every row except the last is a GENESET + for i, line_bytes in enumerate(file, start=4): + line = line_bytes.decode("utf8").strip() + if not line.startswith(" Iterable[Term]: + """Get MSigDb terms.""" + entries = _iter_entries(version=version, force=force) + for entry in tqdm(entries, desc=f"{PREFIX} v{version}", unit_scale=True): attrib = dict(entry.attrib) tax_id = _SPECIES[attrib["ORGANISM"]] @@ -148,4 +168,4 @@ def _get_definition(attrib) -> str | None: if __name__ == "__main__": - MSigDBGetter.cli() + MSigDBGetter().write_default(force=True, write_obo=True)