Merge branch 'main' into update-sssom-exporter
cthoyt committed Nov 23, 2024
2 parents 0a67a01 + 38bd6ad commit 854f8a3
Showing 17 changed files with 351 additions and 170 deletions.
24 changes: 15 additions & 9 deletions src/pyobo/api/hierarchy.py
Expand Up @@ -10,8 +10,9 @@
from .properties import get_filtered_properties_mapping
from .relations import get_filtered_relations_df
from ..identifier_utils import wrap_norm_prefix
from ..struct import TypeDef, has_member, is_a, part_of
from ..struct import has_member, is_a, part_of
from ..struct.reference import Reference
from ..struct.struct import ReferenceHint, _ensure_ref

__all__ = [
"get_ancestors",
@@ -32,8 +33,8 @@ def get_hierarchy(
*,
include_part_of: bool = True,
include_has_member: bool = False,
extra_relations: Iterable[TypeDef] | None = None,
properties: Iterable[str] | None = None,
extra_relations: Iterable[ReferenceHint] | None = None,
properties: Iterable[ReferenceHint] | None = None,
use_tqdm: bool = False,
force: bool = False,
force_process: bool = False,
@@ -58,12 +59,19 @@ def get_hierarchy(
This function thinly wraps :func:`_get_hierarchy_helper` to make it easier to work with the lru_cache mechanism.
"""
extra_relations_ = tuple(
sorted(_ensure_ref(r, ontology_prefix=prefix) for r in extra_relations or [])
)
properties_ = tuple(
sorted(_ensure_ref(prop, ontology_prefix=prefix) for prop in properties or [])
)

return _get_hierarchy_helper(
prefix=prefix,
include_part_of=include_part_of,
include_has_member=include_has_member,
extra_relations=tuple(sorted(extra_relations or [])),
properties=tuple(sorted(properties or [])),
extra_relations=extra_relations_,
properties=properties_,
use_tqdm=use_tqdm,
force=force,
force_process=force_process,
@@ -77,8 +85,8 @@ def get_hierarchy(
def _get_hierarchy_helper(
prefix: str,
*,
extra_relations: tuple[TypeDef, ...],
properties: tuple[str, ...],
extra_relations: tuple[Reference, ...],
properties: tuple[Reference, ...],
include_part_of: bool,
include_has_member: bool,
use_tqdm: bool,
@@ -140,8 +148,6 @@ def _get_hierarchy_helper(
rv.add_edge(f"{source_ns}:{source_id}", f"{prefix}:{target_id}", relation="part_of")

for relation in extra_relations:
if not isinstance(relation, TypeDef | Reference):
raise TypeError
relation_df = get_filtered_relations_df(
prefix=prefix,
relation=relation,
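The net effect above is that get_hierarchy now accepts plain ReferenceHint values (e.g. CURIE strings or Reference objects) for extra_relations and properties, normalizing each with _ensure_ref(..., ontology_prefix=prefix) before they reach the lru_cache-backed helper. A minimal usage sketch; the prefix and CURIEs are illustrative and not taken from this commit:

from pyobo.api.hierarchy import get_hierarchy

# CURIE strings are accepted alongside Reference/TypeDef objects; each hint
# is normalized with _ensure_ref before the cached helper is called.
hierarchy = get_hierarchy(
    "go",  # hypothetical ontology prefix
    extra_relations=["RO:0002211"],  # hypothetical relation CURIE
    properties=["oboInOwl:hasExactSynonym"],  # hypothetical property CURIE
)
# the helper builds a directed graph whose nodes are CURIE strings
print(hierarchy.number_of_nodes(), hierarchy.number_of_edges())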
33 changes: 20 additions & 13 deletions src/pyobo/api/properties.py
@@ -10,6 +10,7 @@
from ..constants import GetOntologyKwargs, check_should_force
from ..getters import get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..struct.struct import ReferenceHint, _ensure_ref
from ..utils.cache import cached_df, cached_mapping, cached_multidict
from ..utils.io import multidict
from ..utils.path import prefix_cache_join
@@ -49,7 +50,7 @@ def _df_getter() -> pd.DataFrame:

@wrap_norm_prefix
def get_filtered_properties_mapping(
prefix: str, prop: str, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
prefix: str, prop: ReferenceHint, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
) -> Mapping[str, str]:
"""Extract a single property for each term as a dictionary.
@@ -59,22 +60,23 @@ def get_filtered_properties_mapping(
:param force: should the resource be re-downloaded, re-parsed, and re-cached?
:returns: A mapping from identifier to property value
"""
# df = get_properties_df(prefix=prefix, force=force, version=version)
# df = df[df["property"] == prop]
# return dict(df[[f"{prefix}_id", "value"]].values)

prop = _ensure_ref(prop)
version = get_version_from_kwargs(prefix, kwargs)
all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version)
if all_properties_path.is_file():
logger.info("[%s] loading pre-cached properties", prefix)
df = pd.read_csv(all_properties_path, sep="\t")
logger.info("[%s] filtering pre-cached properties", prefix)
df = df.loc[df["property"] == prop, [f"{prefix}_id", "value"]]
df = df.loc[df["property"] == prop.preferred_curie, [f"{prefix}_id", "value"]]
return dict(df.values)

path = prefix_cache_join(prefix, "properties", name=f"{prop}.tsv", version=version)
path = prefix_cache_join(
prefix, "properties", name=f"{prop.preferred_curie}.tsv", version=version
)

@cached_mapping(path=path, header=[f"{prefix}_id", prop], force=check_should_force(kwargs))
@cached_mapping(
path=path, header=[f"{prefix}_id", prop.preferred_curie], force=check_should_force(kwargs)
)
def _mapping_getter() -> Mapping[str, str]:
logger.info("[%s] no cached properties found. getting from OBO loader", prefix)
ontology = get_ontology(prefix, **kwargs)
@@ -85,7 +87,7 @@ def _mapping_getter() -> Mapping[str, str]:

@wrap_norm_prefix
def get_filtered_properties_multimapping(
prefix: str, prop: str, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
prefix: str, prop: ReferenceHint, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
) -> Mapping[str, list[str]]:
"""Extract multiple properties for each term as a dictionary.
@@ -95,18 +97,23 @@ def get_filtered_properties_multimapping(
:param force: should the resource be re-downloaded, re-parsed, and re-cached?
:returns: A mapping from identifier to property values
"""
prop = _ensure_ref(prop)
version = get_version_from_kwargs(prefix, kwargs)
all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version)
if all_properties_path.is_file():
logger.info("[%s] loading pre-cached properties", prefix)
df = pd.read_csv(all_properties_path, sep="\t")
logger.info("[%s] filtering pre-cached properties", prefix)
df = df.loc[df["property"] == prop, [f"{prefix}_id", "value"]]
df = df.loc[df["property"] == prop.preferred_curie, [f"{prefix}_id", "value"]]
return multidict(df.values)

path = prefix_cache_join(prefix, "properties", name=f"{prop}.tsv", version=version)
path = prefix_cache_join(
prefix, "properties", name=f"{prop.preferred_curie}.tsv", version=version
)

@cached_multidict(path=path, header=[f"{prefix}_id", prop], force=check_should_force(kwargs))
@cached_multidict(
path=path, header=[f"{prefix}_id", prop.preferred_curie], force=check_should_force(kwargs)
)
def _mapping_getter() -> Mapping[str, list[str]]:
logger.info("[%s] no cached properties found. getting from OBO loader", prefix)
ontology = get_ontology(prefix, **kwargs)
@@ -116,7 +123,7 @@ def _mapping_getter() -> Mapping[str, list[str]]:


def get_property(
prefix: str, identifier: str, prop: str, **kwargs: Unpack[GetOntologyKwargs]
prefix: str, identifier: str, prop: ReferenceHint, **kwargs: Unpack[GetOntologyKwargs]
) -> str | None:
"""Extract a single property for the given entity.
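Both property getters now take a ReferenceHint for prop, normalize it with _ensure_ref, and key the cache file name and header on prop.preferred_curie instead of the raw input string. A hedged sketch with hypothetical inputs:

from pyobo.api.properties import get_filtered_properties_mapping

# The property may be given as a CURIE string or a Reference; the cached
# file becomes properties/<preferred CURIE>.tsv rather than the raw string.
values = get_filtered_properties_mapping(
    "chebi",  # hypothetical ontology prefix
    "debio:0000001",  # hypothetical property CURIE
)
for identifier, value in list(values.items())[:3]:
    print(identifier, value)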
7 changes: 5 additions & 2 deletions src/pyobo/api/relations.py
@@ -22,7 +22,6 @@
)
from ..getters import get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..struct import TypeDef
from ..struct.reference import Reference
from ..struct.struct import ReferenceHint, _ensure_ref
from ..utils.cache import cached_df
@@ -102,7 +101,11 @@ def _df_getter() -> pd.DataFrame:

@wrap_norm_prefix
def get_id_multirelations_mapping(
prefix: str, typedef: TypeDef, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
prefix: str,
typedef: ReferenceHint,
*,
use_tqdm: bool = False,
**kwargs: Unpack[GetOntologyKwargs],
) -> Mapping[str, list[Reference]]:
"""Get the OBO file and output a synonym dictionary."""
kwargs["version"] = get_version_from_kwargs(prefix, kwargs)
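get_id_multirelations_mapping likewise accepts a ReferenceHint for the typedef instead of requiring a TypeDef instance. A sketch, assuming CURIE strings resolve here the same way they do in the getters above:

from pyobo.api.relations import get_id_multirelations_mapping

# BFO:0000050 is part_of; any resolvable CURIE or Reference should work.
mapping = get_id_multirelations_mapping(
    "go",  # hypothetical ontology prefix
    "BFO:0000050",
)
for identifier, references in list(mapping.items())[:3]:
    print(identifier, [reference.curie for reference in references])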
45 changes: 30 additions & 15 deletions src/pyobo/getters.py
@@ -8,17 +8,20 @@
import logging
import pathlib
import subprocess
import time
import typing
import urllib.error
from collections import Counter
from collections.abc import Callable, Iterable, Mapping, Sequence
from pathlib import Path
from textwrap import indent
from typing import TypeVar

import bioregistry
import click
import pystow.utils
from bioontologies import robot
from tabulate import tabulate
from tqdm.auto import tqdm
from typing_extensions import Unpack

@@ -28,7 +31,7 @@
IterHelperHelperDict,
SlimGetOntologyKwargs,
)
from .identifier_utils import MissingPrefixError, wrap_norm_prefix
from .identifier_utils import ParseError, wrap_norm_prefix
from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
from .struct import Obo
from .utils.io import get_writer
@@ -230,9 +233,12 @@ def _ensure_ontology_path(
"atol": "unable to download",
"eol": "unable to download, same source as atol",
"hog": "unable to download",
"vhog": "unable to download",
"ccf": "unable to download",
"gorel": "unable to download",
"dinto": "unable to download",
"mo": "unable to download",
"vario": "unable to download/build",
"gainesville.core": "unable to download",
"mamo": "unable to download",
"ato": "can't process",
@@ -354,8 +360,8 @@ def iter_helper_helper(
logger.warning("[%s] unable to download - %s", prefix, e.reason)
if strict and not bioregistry.is_deprecated(prefix):
raise
except MissingPrefixError as e:
logger.warning("[%s] missing prefix: %s", prefix, e)
except ParseError as e:
logger.warning("[%s] CURIE/IRI parse error: %s", prefix, e)
if strict and not bioregistry.is_deprecated(prefix):
raise e
except RuntimeError as e:
Expand All @@ -364,7 +370,7 @@ def iter_helper_helper(
logger.warning("[drugbank] invalid credentials")
except subprocess.CalledProcessError:
logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
except UnhandledFormatError as e:
except (UnhandledFormatError, NoBuildError) as e:
logger.warning("[%s] %s", prefix, e)
except ValueError as e:
if _is_xml(e):
@@ -424,6 +430,7 @@ def db_output_helper(
:param strict: Passed to ``f`` by keyword
:returns: A sequence of paths that got created.
"""
start = time.time()
directory = _prep_dir(directory)

c: typing.Counter[str] = Counter()
@@ -436,9 +443,17 @@
db_sample_path = directory.joinpath(f"{db_name}_sample.tsv")
db_summary_path = directory.joinpath(f"{db_name}_summary.tsv")
db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv")
db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
rv: list[tuple[str, pathlib.Path]] = [
("Metadata", db_metadata_path),
("Data", db_path),
("Sample", db_sample_path),
("Summary", db_summary_path),
]

logger.info("writing %s to %s", db_name, db_path)
logger.info("writing %s sample to %s", db_name, db_sample_path)
sample_rows = []
with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
writer = get_writer(gzipped_file)

@@ -456,6 +471,7 @@
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
writer.writerow(row)
sample_writer.writerow(row)
sample_rows.append(row)

# continue just in the gzipped one
for row in it:
@@ -464,7 +480,6 @@
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
writer.writerow(row)

logger.info(f"writing {db_name} summary to {db_summary_path}")
with open(db_summary_path, "w") as file:
writer = get_writer(file)
writer.writerows(c.most_common())
@@ -474,8 +489,8 @@
with open(db_summary_detailed_path, "w") as file:
writer = get_writer(file)
writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
rv.append(("Summary (Detailed)", db_summary_detailed_path))

db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
with open(db_metadata_path, "w") as file:
json.dump(
{
@@ -488,12 +503,12 @@
indent=2,
)

rv: list[pathlib.Path] = [
db_metadata_path,
db_path,
db_sample_path,
db_summary_path,
]
if summary_detailed:
rv.append(db_summary_detailed_path)
return rv
elapsed = time.time() - start
click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
click.secho(indent(tabulate(rv), " "), fg="green")

click.secho("\nSample rows:\n", fg="green")
click.secho(indent(tabulate(sample_rows, headers=columns), " "), fg="green")
click.echo()

return [path for _, path in rv]
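Alongside the (label, path) bookkeeping, db_output_helper now times the run, keeps the sample rows it wrote, prints a green tabulated report instead of per-file log lines, and returns only the paths. A self-contained sketch of that reporting pattern, using hypothetical file names and rows:

import time
from textwrap import indent

import click
from tabulate import tabulate

start = time.time()
# ... the writing loops above would run here ...

# hypothetical stand-ins for what db_output_helper accumulates while writing
rv = [("Metadata", "names_metadata.json"), ("Data", "names.tsv.gz")]
columns = ["prefix", "identifier", "name"]
sample_rows = [["chebi", "15377", "water"]]

elapsed = time.time() - start
click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
click.secho(indent(tabulate(rv), "  "), fg="green")
click.secho("\nSample rows:\n", fg="green")
click.secho(indent(tabulate(sample_rows, headers=columns), "  "), fg="green")
click.echo()

paths = [path for _, path in rv]  # the helper now returns just the paths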