Skip to content

Commit

Permalink
Merge branch 'master' into 618-bad-random-value
Browse files Browse the repository at this point in the history
  • Loading branch information
jonbrenas authored Sep 23, 2024
2 parents b8cd642 + 57c6d8a commit edebc2b
Show file tree
Hide file tree
Showing 88 changed files with 854 additions and 12,328 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/legacy_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
uses: actions/cache/restore@v3
with:
path: gcs_cache
key: gcs_cache_tests_20240324
key: gcs_cache_tests_20240922

- name: Run full test suite
run: poetry run pytest --durations=20 --ignore=tests/anoph -v tests
Expand All @@ -63,4 +63,4 @@ jobs:
if: always()
with:
path: gcs_cache
key: gcs_cache_tests_20240324
key: gcs_cache_tests_20240922
4 changes: 2 additions & 2 deletions .github/workflows/notebooks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
uses: actions/cache/restore@v3
with:
path: gcs_cache
key: gcs_cache_notebooks_20240324
key: gcs_cache_notebooks_20240922

- name: Run notebooks
run: poetry run jupyter nbconvert --execute notebooks/*.ipynb --inplace
Expand All @@ -59,4 +59,4 @@ jobs:
if: always()
with:
path: gcs_cache
key: gcs_cache_notebooks_20240324
key: gcs_cache_notebooks_20240922
2 changes: 1 addition & 1 deletion malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class Af1(AnophelesDataResource):
in a directory named "gcs_cache":
>>> af1 = malariagen_data.Af1(
... "simplecache::gs://vo_afun_release",
... "simplecache::gs://vo_afun_release_master_us_central1",
... simplecache=dict(cache_storage="gcs_cache"),
... )
Expand Down
2 changes: 1 addition & 1 deletion malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ class Ag3(AnophelesDataResource):
in a directory named "gcs_cache":
>>> ag3 = malariagen_data.Ag3(
... "simplecache::gs://vo_agam_release",
... "simplecache::gs://vo_agam_release_master_us_central1",
... simplecache=dict(cache_storage="gcs_cache"),
... )
Expand Down
30 changes: 19 additions & 11 deletions malariagen_data/anoph/base_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,20 +229,28 @@ def validate_sample_selection_params(
chunks: TypeAlias = Annotated[
chunks_param_type,
"""
If 'auto' let dask decide chunk size. If 'native' use native zarr
chunks. If 'ndauto' let dask decide chunk size but only for arrays with
more than one dimension. If 'ndauto0' as 'ndauto' but only vary the first
chunk dimension. If 'ndauto1' as 'ndauto' but only vary the second chunk
dimension. If 'ndauto01' as 'ndauto' but only vary the first and second
chunk dimensions. Also, can be a target size, e.g., '200 MiB', or a tuple of
integers, or a callable which accepts the native chunks as a single argument
and returns a valid dask chunks value.
Define how input data being read from zarr should be divided into chunks
for a dask computation. If 'native', use underlying zarr chunks. If a string
specifying a target memory size, e.g., '300 MiB', resize chunks in arrays
with more than one dimension to match this size. If 'auto', let dask decide
chunk size. If 'ndauto', let dask decide chunk size but only for arrays with
more than one dimension. If 'ndauto0', as 'ndauto' but only vary the first
chunk dimension. If 'ndauto1', as 'ndauto' but only vary the second chunk
dimension. If 'ndauto01', as 'ndauto' but only vary the first and second
chunk dimensions. Also, can be a tuple of integers, or a callable which
accepts the native chunks as a single argument and returns a valid dask
chunks value.
""",
]

# The "ndauto0" value means auto-size chunks for arrays with more than one dimension,
# allowing the first chunk dimension to be varied.
chunks_default: chunks = "ndauto0"
# Match the native zarr chunk sizes by default. N.B., some functions may
# choose a different default, especially if they need to retrieve larger
# amounts of data.
native_chunks: chunks = "native"

# Alternative default chunk size, suitable for functions which need to
# scan a large amount of data.
large_chunks: chunks = "300MiB"

gff_attributes: TypeAlias = Annotated[
Optional[Union[Sequence[str], str]],
Expand Down
6 changes: 3 additions & 3 deletions malariagen_data/anoph/cnv_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def cnv_hmm(
sample_query: Optional[base_params.sample_query] = None,
max_coverage_variance: cnv_params.max_coverage_variance = cnv_params.max_coverage_variance_default,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.native_chunks,
) -> xr.Dataset:
debug = self._log.debug

Expand Down Expand Up @@ -381,7 +381,7 @@ def cnv_coverage_calls(
sample_set: base_params.sample_set,
analysis: cnv_params.coverage_calls_analysis,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.native_chunks,
) -> xr.Dataset:
debug = self._log.debug

Expand Down Expand Up @@ -537,7 +537,7 @@ def cnv_discordant_read_calls(
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.native_chunks,
) -> xr.Dataset:
debug = self._log.debug

Expand Down
36 changes: 35 additions & 1 deletion malariagen_data/anoph/dipclust.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def plot_diplotype_clustering(
color_discrete_map: plotly_params.color_discrete_map = None,
category_orders: plotly_params.category_order = None,
legend_sizing: plotly_params.legend_sizing = "constant",
chunks: base_params.chunks = base_params.native_chunks,
inline_array: base_params.inline_array = base_params.inline_array_default,
) -> Optional[dict]:
import sys

Expand All @@ -98,6 +100,8 @@ def plot_diplotype_clustering(
cohort_size=cohort_size,
distance_metric=distance_metric,
random_seed=random_seed,
chunks=chunks,
inline_array=inline_array,
)

# Align sample metadata with genotypes.
Expand Down Expand Up @@ -196,6 +200,8 @@ def diplotype_pairwise_distances(
cohort_size: Optional[base_params.cohort_size] = None,
distance_metric: dipclust_params.distance_metric = dipclust_params.distance_metric_default,
random_seed: base_params.random_seed = 42,
chunks: base_params.chunks = base_params.native_chunks,
inline_array: base_params.inline_array = base_params.inline_array_default,
) -> Tuple[np.ndarray, np.ndarray, int]:
# Change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data.
Expand All @@ -220,7 +226,9 @@ def diplotype_pairwise_distances(
results = self.results_cache_get(name=name, params=params)

except CacheMiss:
results = self._diplotype_pairwise_distances(**params)
results = self._diplotype_pairwise_distances(
chunks=chunks, inline_array=inline_array, **params
)
self.results_cache_set(name=name, params=params, results=results)

# Unpack results.
Expand All @@ -241,6 +249,8 @@ def _diplotype_pairwise_distances(
cohort_size,
distance_metric,
random_seed,
chunks,
inline_array,
):
if distance_metric == "cityblock":
metric = multiallelic_diplotype_mean_cityblock
Expand All @@ -256,6 +266,8 @@ def _diplotype_pairwise_distances(
site_class=site_class,
cohort_size=cohort_size,
random_seed=random_seed,
chunks=chunks,
inline_array=inline_array,
)

with self._dask_progress(desc="Load genotypes for distance calculation"):
Expand Down Expand Up @@ -302,6 +314,8 @@ def _dipclust_het_bar_trace(
cohort_size: Optional[base_params.cohort_size],
random_seed: base_params.random_seed,
color_continuous_scale: Optional[plotly_params.color_continuous_scale],
chunks: base_params.chunks = base_params.native_chunks,
inline_array: base_params.inline_array = base_params.inline_array_default,
):
ds_snps = self.snp_calls(
region=region,
Expand All @@ -310,6 +324,8 @@ def _dipclust_het_bar_trace(
cohort_size=cohort_size,
site_mask=site_mask,
random_seed=random_seed,
chunks=chunks,
inline_array=inline_array,
)

# Strictly speaking we are loading the genotypes for the second time here,
Expand Down Expand Up @@ -361,6 +377,8 @@ def _dipclust_cnv_bar_trace(
sample_query: Optional[base_params.sample_query],
max_coverage_variance: Optional[cnv_params.max_coverage_variance],
colorscale: Optional[plotly_params.color_continuous_scale],
chunks: base_params.chunks = base_params.native_chunks,
inline_array: base_params.inline_array = base_params.inline_array_default,
):
try:
# TODO The gene_cnv() method still needs to get migrated to the
Expand All @@ -372,6 +390,8 @@ def _dipclust_cnv_bar_trace(
sample_sets=sample_sets,
sample_query=sample_query,
max_coverage_variance=max_coverage_variance,
chunks=chunks,
inline_array=inline_array,
)

except ValueError:
Expand Down Expand Up @@ -422,6 +442,8 @@ def _dipclust_snp_trace(
dendro_sample_id_order: np.ndarray,
snp_filter_min_maf: float,
snp_colorscale: Optional[plotly_params.color_continuous_scale],
chunks: base_params.chunks = base_params.native_chunks,
inline_array: base_params.inline_array = base_params.inline_array_default,
):
# load genotype allele counts at SNP variants for each sample
df_snps = self.snp_genotype_allele_counts(
Expand All @@ -430,6 +452,8 @@ def _dipclust_snp_trace(
sample_query=sample_query,
sample_sets=sample_sets,
site_mask=site_mask,
chunks=chunks,
inline_array=inline_array,
)
df_snps = df_snps.set_index("label")

Expand Down Expand Up @@ -557,6 +581,8 @@ def plot_diplotype_clustering_advanced(
color_discrete_map: plotly_params.color_discrete_map = None,
category_orders: plotly_params.category_order = None,
legend_sizing: plotly_params.legend_sizing = "constant",
chunks: base_params.chunks = base_params.native_chunks,
inline_array: base_params.inline_array = base_params.inline_array_default,
):
if cohort_size and snp_transcript:
cohort_size = None
Expand Down Expand Up @@ -592,6 +618,8 @@ def plot_diplotype_clustering_advanced(
category_orders=category_orders,
legend_sizing=legend_sizing,
random_seed=random_seed,
chunks=chunks,
inline_array=inline_array,
)

fig_dendro = res["figure"]
Expand All @@ -611,6 +639,8 @@ def plot_diplotype_clustering_advanced(
site_mask=site_mask,
color_continuous_scale=heterozygosity_colorscale,
random_seed=random_seed,
chunks=chunks,
inline_array=inline_array,
)
figures.append(het_trace)
subplot_heights.append(heterozygosity_height)
Expand All @@ -623,6 +653,8 @@ def plot_diplotype_clustering_advanced(
sample_query=sample_query,
max_coverage_variance=cnv_max_coverage_variance,
colorscale=cnv_colorscale,
chunks=chunks,
inline_array=inline_array,
)
# N.B., sometimes no CNV data may be available, so check to
# see if the trace is not None.
Expand All @@ -640,6 +672,8 @@ def plot_diplotype_clustering_advanced(
dendro_sample_id_order=dendro_sample_id_order,
snp_filter_min_maf=snp_filter_min_maf,
snp_colorscale=snp_colorscale,
chunks=chunks,
inline_array=inline_array,
)

if snp_trace:
Expand Down
2 changes: 1 addition & 1 deletion malariagen_data/anoph/fst.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def fst_gwss(
] = fst_params.max_cohort_size_default,
random_seed: base_params.random_seed = 42,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.large_chunks,
clip_min: fst_params.clip_min = 0.0,
) -> Tuple[np.ndarray, np.ndarray]:
# Change this name if you ever change the behaviour of this function, to
Expand Down
10 changes: 5 additions & 5 deletions malariagen_data/anoph/g123.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def g123_gwss(
] = g123_params.max_cohort_size_default,
random_seed: base_params.random_seed = 42,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.large_chunks,
) -> Tuple[np.ndarray, np.ndarray]:
# Change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data.
Expand Down Expand Up @@ -264,7 +264,7 @@ def g123_calibration(
window_sizes: g123_params.window_sizes = g123_params.window_sizes_default,
random_seed: base_params.random_seed = 42,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.large_chunks,
) -> Mapping[str, np.ndarray]:
# Change this name if you ever change the behaviour of this function, to
# invalidate any previously cached data.
Expand Down Expand Up @@ -323,7 +323,7 @@ def plot_g123_gwss_track(
x_range: Optional[gplt_params.x_range] = None,
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.large_chunks,
) -> gplt_params.figure:
# compute G123
x, g123 = self.g123_gwss(
Expand Down Expand Up @@ -424,7 +424,7 @@ def plot_g123_gwss(
show: gplt_params.show = True,
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.large_chunks,
) -> gplt_params.figure:
# gwss track
fig1 = self.plot_g123_gwss_track(
Expand Down Expand Up @@ -497,7 +497,7 @@ def plot_g123_calibration(
title: Optional[gplt_params.title] = None,
show: gplt_params.show = True,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.large_chunks,
) -> gplt_params.figure:
# get g123 values
calibration_runs = self.g123_calibration(
Expand Down
2 changes: 1 addition & 1 deletion malariagen_data/anoph/genome_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def genome_sequence(
self,
region: base_params.region,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
chunks: base_params.chunks = base_params.native_chunks,
) -> da.Array:
# Parse the region parameter into a Region object.
resolved_region: Region = parse_single_region(self, region)
Expand Down
Loading

0 comments on commit edebc2b

Please sign in to comment.