Skip to content

Commit

Permalink
More options for automatic sizing of chunks (#615)
Browse files Browse the repository at this point in the history
* ndauto

* typing

* typing

* fix typing

* check it works

* dunno
  • Loading branch information
alimanfoo authored Sep 20, 2024
1 parent 628f198 commit 4eccc6f
Show file tree
Hide file tree
Showing 5 changed files with 1,816 additions and 367 deletions.
18 changes: 13 additions & 5 deletions malariagen_data/anoph/base_params.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""General parameters common to many functions in the public API."""

from typing import Final, List, Mapping, Optional, Sequence, Tuple, Union, Callable
from typing import Final, List, Mapping, Optional, Sequence, Tuple, Union

from typing_extensions import Annotated, TypeAlias

Expand All @@ -9,6 +9,7 @@
region_param_type,
single_contig_param_type,
single_region_param_type,
chunks_param_type,
)

contig: TypeAlias = Annotated[
Expand Down Expand Up @@ -226,15 +227,22 @@ def validate_sample_selection_params(
inline_array_default: inline_array = True

chunks: TypeAlias = Annotated[
Union[str, Tuple[int, ...], Callable[[Tuple[int, ...]], Tuple[int, ...]]],
chunks_param_type,
"""
If 'auto' let dask decide chunk size. If 'native' use native zarr
chunks. Also, can be a target size, e.g., '200 MiB', or a tuple of
integers.
chunks. If 'ndauto' let dask decide chunk size but only for arrays with
more than one dimension. If 'ndauto0' as 'ndauto' but only vary the first
chunk dimension. If 'ndauto1' as 'ndauto' but only vary the second chunk
dimension. If 'ndauto01' as 'ndauto' but only vary the first and second
chunk dimensions. Also, can be a target size, e.g., '200 MiB', or a tuple of
integers, or a callable which accepts the native chunks as a single argument
and returns a valid dask chunks value.
""",
]

chunks_default: chunks = "native"
# The "ndauto0" value auto-sizes only the first chunk dimension of arrays with
# more than one dimension; the remaining dimensions, and all one-dimensional
# arrays, keep their native zarr chunks.
chunks_default: chunks = "ndauto0"

gff_attributes: TypeAlias = Annotated[
Optional[Union[Sequence[str], str]],
Expand Down
49 changes: 45 additions & 4 deletions malariagen_data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,24 @@ class SiteClass(Enum):
INTRON_LAST = 10


zarr_chunks_type: TypeAlias = Tuple[int, ...]

dask_chunks_type: TypeAlias = Union[
int,
str,
Tuple[Union[int, str], ...],
]

chunks_param_type: TypeAlias = Union[
dask_chunks_type,
Callable[[zarr_chunks_type], dask_chunks_type],
]


def da_from_zarr(
z: zarr.core.Array,
inline_array: bool,
chunks: Union[
str, Tuple[int, ...], Callable[[Tuple[int, ...]], Tuple[int, ...]]
] = "auto",
chunks: chunks_param_type,
) -> da.Array:
"""Utility function for turning a zarr array into a dask array.
Expand All @@ -180,12 +192,41 @@ def da_from_zarr(
"""
if callable(chunks):
dask_chunks: Union[Tuple[int, ...], str] = chunks(z.chunks)
dask_chunks: dask_chunks_type = chunks(z.chunks)
elif chunks == "native" or z.dtype == object:
# N.B., dask does not support "auto" chunks for arrays with object dtype
dask_chunks = z.chunks

# Auto-size chunks but only for arrays with more than one dimension.
elif chunks == "ndauto":
if len(z.chunks) > 1:
# Auto-size all dimensions.
dask_chunks = "auto"
else:
dask_chunks = z.chunks
elif chunks == "ndauto0":
if len(z.chunks) > 1:
# Auto-size first dimension.
dask_chunks = ("auto",) + z.chunks[1:]
else:
dask_chunks = z.chunks
elif chunks == "ndauto1":
if len(z.chunks) > 1:
# Auto-size second dimension.
dask_chunks = (z.chunks[0], "auto") + z.chunks[2:]
else:
dask_chunks = z.chunks
elif chunks == "ndauto01":
if len(z.chunks) > 1:
# Auto-size first and second dimensions.
dask_chunks = ("auto", "auto") + z.chunks[2:]
else:
dask_chunks = z.chunks

else:
# Pass through argument as-is.
dask_chunks = chunks

kwargs = dict(
chunks=dask_chunks, fancy=False, lock=False, inline_array=inline_array
)
Expand Down
Loading

0 comments on commit 4eccc6f

Please sign in to comment.