Merge branch 'main' into update-sssom-exporter
cthoyt committed Nov 23, 2024
2 parents 0a67a01 + 38bd6ad commit 854f8a3
Showing 17 changed files with 351 additions and 170 deletions.
24 changes: 15 additions & 9 deletions src/pyobo/api/hierarchy.py
Expand Up @@ -10,8 +10,9 @@
from .properties import get_filtered_properties_mapping
from .relations import get_filtered_relations_df
from ..identifier_utils import wrap_norm_prefix
from ..struct import TypeDef, has_member, is_a, part_of
from ..struct import has_member, is_a, part_of
from ..struct.reference import Reference
from ..struct.struct import ReferenceHint, _ensure_ref

__all__ = [
"get_ancestors",
@@ -32,8 +33,8 @@ def get_hierarchy(
*,
include_part_of: bool = True,
include_has_member: bool = False,
extra_relations: Iterable[TypeDef] | None = None,
properties: Iterable[str] | None = None,
extra_relations: Iterable[ReferenceHint] | None = None,
properties: Iterable[ReferenceHint] | None = None,
use_tqdm: bool = False,
force: bool = False,
force_process: bool = False,
@@ -58,12 +59,19 @@ def get_hierarchy(
This function thinly wraps :func:`_get_hierarchy_helper` to make it easier to work with the lru_cache mechanism.
"""
extra_relations_ = tuple(
sorted(_ensure_ref(r, ontology_prefix=prefix) for r in extra_relations or [])
)
properties_ = tuple(
sorted(_ensure_ref(prop, ontology_prefix=prefix) for prop in properties or [])
)

return _get_hierarchy_helper(
prefix=prefix,
include_part_of=include_part_of,
include_has_member=include_has_member,
extra_relations=tuple(sorted(extra_relations or [])),
properties=tuple(sorted(properties or [])),
extra_relations=extra_relations_,
properties=properties_,
use_tqdm=use_tqdm,
force=force,
force_process=force_process,
@@ -77,8 +85,8 @@ def get_hierarchy(
def _get_hierarchy_helper(
prefix: str,
*,
extra_relations: tuple[TypeDef, ...],
properties: tuple[str, ...],
extra_relations: tuple[Reference, ...],
properties: tuple[Reference, ...],
include_part_of: bool,
include_has_member: bool,
use_tqdm: bool,
@@ -140,8 +148,6 @@ def _get_hierarchy_helper(
rv.add_edge(f"{source_ns}:{source_id}", f"{prefix}:{target_id}", relation="part_of")

for relation in extra_relations:
if not isinstance(relation, TypeDef | Reference):
raise TypeError
relation_df = get_filtered_relations_df(
prefix=prefix,
relation=relation,
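The net effect above is that get_hierarchy now accepts plain ReferenceHint values (e.g. CURIE strings or Reference objects) for extra_relations and properties, normalizing each with _ensure_ref(..., ontology_prefix=prefix) before they reach the lru_cache-backed helper. A minimal usage sketch; the prefix and CURIEs are illustrative and not taken from this commit:

from pyobo.api.hierarchy import get_hierarchy

# CURIE strings are accepted alongside Reference/TypeDef objects; each hint
# is normalized with _ensure_ref before the cached helper is called.
hierarchy = get_hierarchy(
    "go",  # hypothetical ontology prefix
    extra_relations=["RO:0002211"],  # hypothetical relation CURIE
    properties=["oboInOwl:hasExactSynonym"],  # hypothetical property CURIE
)
# the helper builds a directed graph whose nodes are CURIE strings
print(hierarchy.number_of_nodes(), hierarchy.number_of_edges())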
33 changes: 20 additions & 13 deletions src/pyobo/api/properties.py
@@ -10,6 +10,7 @@
from ..constants import GetOntologyKwargs, check_should_force
from ..getters import get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..struct.struct import ReferenceHint, _ensure_ref
from ..utils.cache import cached_df, cached_mapping, cached_multidict
from ..utils.io import multidict
from ..utils.path import prefix_cache_join
@@ -49,7 +50,7 @@ def _df_getter() -> pd.DataFrame:

@wrap_norm_prefix
def get_filtered_properties_mapping(
prefix: str, prop: str, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
prefix: str, prop: ReferenceHint, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
) -> Mapping[str, str]:
"""Extract a single property for each term as a dictionary.
@@ -59,22 +60,23 @@ def get_filtered_properties_mapping(
:param force: should the resource be re-downloaded, re-parsed, and re-cached?
:returns: A mapping from identifier to property value
"""
# df = get_properties_df(prefix=prefix, force=force, version=version)
# df = df[df["property"] == prop]
# return dict(df[[f"{prefix}_id", "value"]].values)

prop = _ensure_ref(prop)
version = get_version_from_kwargs(prefix, kwargs)
all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version)
if all_properties_path.is_file():
logger.info("[%s] loading pre-cached properties", prefix)
df = pd.read_csv(all_properties_path, sep="\t")
logger.info("[%s] filtering pre-cached properties", prefix)
df = df.loc[df["property"] == prop, [f"{prefix}_id", "value"]]
df = df.loc[df["property"] == prop.preferred_curie, [f"{prefix}_id", "value"]]
return dict(df.values)

path = prefix_cache_join(prefix, "properties", name=f"{prop}.tsv", version=version)
path = prefix_cache_join(
prefix, "properties", name=f"{prop.preferred_curie}.tsv", version=version
)

@cached_mapping(path=path, header=[f"{prefix}_id", prop], force=check_should_force(kwargs))
@cached_mapping(
path=path, header=[f"{prefix}_id", prop.preferred_curie], force=check_should_force(kwargs)
)
def _mapping_getter() -> Mapping[str, str]:
logger.info("[%s] no cached properties found. getting from OBO loader", prefix)
ontology = get_ontology(prefix, **kwargs)
@@ -85,7 +87,7 @@ def _mapping_getter() -> Mapping[str, str]:

@wrap_norm_prefix
def get_filtered_properties_multimapping(
prefix: str, prop: str, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
prefix: str, prop: ReferenceHint, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
) -> Mapping[str, list[str]]:
"""Extract multiple properties for each term as a dictionary.
@@ -95,18 +97,23 @@ def get_filtered_properties_multimapping(
:param force: should the resource be re-downloaded, re-parsed, and re-cached?
:returns: A mapping from identifier to property values
"""
prop = _ensure_ref(prop)
version = get_version_from_kwargs(prefix, kwargs)
all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version)
if all_properties_path.is_file():
logger.info("[%s] loading pre-cached properties", prefix)
df = pd.read_csv(all_properties_path, sep="\t")
logger.info("[%s] filtering pre-cached properties", prefix)
df = df.loc[df["property"] == prop, [f"{prefix}_id", "value"]]
df = df.loc[df["property"] == prop.preferred_curie, [f"{prefix}_id", "value"]]
return multidict(df.values)

path = prefix_cache_join(prefix, "properties", name=f"{prop}.tsv", version=version)
path = prefix_cache_join(
prefix, "properties", name=f"{prop.preferred_curie}.tsv", version=version
)

@cached_multidict(path=path, header=[f"{prefix}_id", prop], force=check_should_force(kwargs))
@cached_multidict(
path=path, header=[f"{prefix}_id", prop.preferred_curie], force=check_should_force(kwargs)
)
def _mapping_getter() -> Mapping[str, list[str]]:
logger.info("[%s] no cached properties found. getting from OBO loader", prefix)
ontology = get_ontology(prefix, **kwargs)
@@ -116,7 +123,7 @@ def _mapping_getter() -> Mapping[str, list[str]]:


def get_property(
prefix: str, identifier: str, prop: str, **kwargs: Unpack[GetOntologyKwargs]
prefix: str, identifier: str, prop: ReferenceHint, **kwargs: Unpack[GetOntologyKwargs]
) -> str | None:
"""Extract a single property for the given entity.
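Both property getters now take a ReferenceHint for prop, normalize it with _ensure_ref, and key the cache file name and header on prop.preferred_curie instead of the raw input string. A hedged sketch with hypothetical inputs:

from pyobo.api.properties import get_filtered_properties_mapping

# The property may be given as a CURIE string or a Reference; the cached
# file becomes properties/<preferred CURIE>.tsv rather than the raw string.
values = get_filtered_properties_mapping(
    "chebi",  # hypothetical ontology prefix
    "debio:0000001",  # hypothetical property CURIE
)
for identifier, value in list(values.items())[:3]:
    print(identifier, value)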
7 changes: 5 additions & 2 deletions src/pyobo/api/relations.py
@@ -22,7 +22,6 @@
)
from ..getters import get_ontology
from ..identifier_utils import wrap_norm_prefix
from ..struct import TypeDef
from ..struct.reference import Reference
from ..struct.struct import ReferenceHint, _ensure_ref
from ..utils.cache import cached_df
@@ -102,7 +101,11 @@ def _df_getter() -> pd.DataFrame:

@wrap_norm_prefix
def get_id_multirelations_mapping(
prefix: str, typedef: TypeDef, *, use_tqdm: bool = False, **kwargs: Unpack[GetOntologyKwargs]
prefix: str,
typedef: ReferenceHint,
*,
use_tqdm: bool = False,
**kwargs: Unpack[GetOntologyKwargs],
) -> Mapping[str, list[Reference]]:
"""Get the OBO file and output a synonym dictionary."""
kwargs["version"] = get_version_from_kwargs(prefix, kwargs)
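get_id_multirelations_mapping likewise accepts a ReferenceHint for the typedef instead of requiring a TypeDef instance. A sketch, assuming CURIE strings resolve here the same way they do in the getters above:

from pyobo.api.relations import get_id_multirelations_mapping

# BFO:0000050 is part_of; any resolvable CURIE or Reference should work.
mapping = get_id_multirelations_mapping(
    "go",  # hypothetical ontology prefix
    "BFO:0000050",
)
for identifier, references in list(mapping.items())[:3]:
    print(identifier, [reference.curie for reference in references])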
45 changes: 30 additions & 15 deletions src/pyobo/getters.py
@@ -8,17 +8,20 @@
import logging
import pathlib
import subprocess
import time
import typing
import urllib.error
from collections import Counter
from collections.abc import Callable, Iterable, Mapping, Sequence
from pathlib import Path
from textwrap import indent
from typing import TypeVar

import bioregistry
import click
import pystow.utils
from bioontologies import robot
from tabulate import tabulate
from tqdm.auto import tqdm
from typing_extensions import Unpack

@@ -28,7 +31,7 @@
IterHelperHelperDict,
SlimGetOntologyKwargs,
)
from .identifier_utils import MissingPrefixError, wrap_norm_prefix
from .identifier_utils import ParseError, wrap_norm_prefix
from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
from .struct import Obo
from .utils.io import get_writer
@@ -230,9 +233,12 @@ def _ensure_ontology_path(
"atol": "unable to download",
"eol": "unable to download, same source as atol",
"hog": "unable to download",
"vhog": "unable to download",
"ccf": "unable to download",
"gorel": "unable to download",
"dinto": "unable to download",
"mo": "unable to download",
"vario": "unable to download/build",
"gainesville.core": "unable to download",
"mamo": "unable to download",
"ato": "can't process",
@@ -354,8 +360,8 @@ def iter_helper_helper(
logger.warning("[%s] unable to download - %s", prefix, e.reason)
if strict and not bioregistry.is_deprecated(prefix):
raise
except MissingPrefixError as e:
logger.warning("[%s] missing prefix: %s", prefix, e)
except ParseError as e:
logger.warning("[%s] CURIE/IRI parse error: %s", prefix, e)
if strict and not bioregistry.is_deprecated(prefix):
raise e
except RuntimeError as e:
Expand All @@ -364,7 +370,7 @@ def iter_helper_helper(
logger.warning("[drugbank] invalid credentials")
except subprocess.CalledProcessError:
logger.warning("[%s] ROBOT was unable to convert OWL to OBO", prefix)
except UnhandledFormatError as e:
except (UnhandledFormatError, NoBuildError) as e:
logger.warning("[%s] %s", prefix, e)
except ValueError as e:
if _is_xml(e):
@@ -424,6 +430,7 @@ def db_output_helper(
:param strict: Passed to ``f`` by keyword
:returns: A sequence of paths that got created.
"""
start = time.time()
directory = _prep_dir(directory)

c: typing.Counter[str] = Counter()
@@ -436,9 +443,17 @@
db_sample_path = directory.joinpath(f"{db_name}_sample.tsv")
db_summary_path = directory.joinpath(f"{db_name}_summary.tsv")
db_summary_detailed_path = directory.joinpath(f"{db_name}_summary_detailed.tsv")
db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
rv: list[tuple[str, pathlib.Path]] = [
("Metadata", db_metadata_path),
("Data", db_path),
("Sample", db_sample_path),
("Summary", db_summary_path),
]

logger.info("writing %s to %s", db_name, db_path)
logger.info("writing %s sample to %s", db_name, db_sample_path)
sample_rows = []
with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
writer = get_writer(gzipped_file)

@@ -456,6 +471,7 @@
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
writer.writerow(row)
sample_writer.writerow(row)
sample_rows.append(row)

# continue just in the gzipped one
for row in it:
@@ -464,7 +480,6 @@
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
writer.writerow(row)

logger.info(f"writing {db_name} summary to {db_summary_path}")
with open(db_summary_path, "w") as file:
writer = get_writer(file)
writer.writerows(c.most_common())
@@ -474,8 +489,8 @@
with open(db_summary_detailed_path, "w") as file:
writer = get_writer(file)
writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
rv.append(("Summary (Detailed)", db_summary_detailed_path))

db_metadata_path = directory.joinpath(f"{db_name}_metadata.json")
with open(db_metadata_path, "w") as file:
json.dump(
{
@@ -488,12 +503,12 @@
indent=2,
)

rv: list[pathlib.Path] = [
db_metadata_path,
db_path,
db_sample_path,
db_summary_path,
]
if summary_detailed:
rv.append(db_summary_detailed_path)
return rv
elapsed = time.time() - start
click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
click.secho(indent(tabulate(rv), " "), fg="green")

click.secho("\nSample rows:\n", fg="green")
click.secho(indent(tabulate(sample_rows, headers=columns), " "), fg="green")
click.echo()

return [path for _, path in rv]
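Alongside the (label, path) bookkeeping, db_output_helper now times the run, keeps the sample rows it wrote, prints a green tabulated report instead of per-file log lines, and returns only the paths. A self-contained sketch of that reporting pattern, using hypothetical file names and rows:

import time
from textwrap import indent

import click
from tabulate import tabulate

start = time.time()
# ... the writing loops above would run here ...

# hypothetical stand-ins for what db_output_helper accumulates while writing
rv = [("Metadata", "names_metadata.json"), ("Data", "names.tsv.gz")]
columns = ["prefix", "identifier", "name"]
sample_rows = [["chebi", "15377", "water"]]

elapsed = time.time() - start
click.secho(f"\nWrote the following files in {elapsed:.1f} seconds\n", fg="green")
click.secho(indent(tabulate(rv), "  "), fg="green")
click.secho("\nSample rows:\n", fg="green")
click.secho(indent(tabulate(sample_rows, headers=columns), "  "), fg="green")
click.echo()

paths = [path for _, path in rv]  # the helper now returns just the paths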