Merge branch 'master' into 618-bad-random-value

malariagen · Sep 23, 2024 · edebc2b · edebc2b
2 parents b8cd642 + 57c6d8a
commit edebc2b
Show file tree

Hide file tree

Showing 88 changed files with 854 additions and 12,328 deletions.
diff --git a/.github/workflows/legacy_tests.yml b/.github/workflows/legacy_tests.yml
@@ -53,7 +53,7 @@ jobs:
         uses: actions/cache/restore@v3
         with:
           path: gcs_cache
-          key: gcs_cache_tests_20240324
+          key: gcs_cache_tests_20240922
 
       - name: Run full test suite
         run: poetry run pytest --durations=20 --ignore=tests/anoph -v tests
@@ -63,4 +63,4 @@ jobs:
         if: always()
         with:
           path: gcs_cache
-          key: gcs_cache_tests_20240324
+          key: gcs_cache_tests_20240922
diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
@@ -49,7 +49,7 @@ jobs:
         uses: actions/cache/restore@v3
         with:
           path: gcs_cache
-          key: gcs_cache_notebooks_20240324
+          key: gcs_cache_notebooks_20240922
 
       - name: Run notebooks
         run: poetry run jupyter nbconvert --execute notebooks/*.ipynb --inplace
@@ -59,4 +59,4 @@ jobs:
         if: always()
         with:
           path: gcs_cache
-          key: gcs_cache_notebooks_20240324
+          key: gcs_cache_notebooks_20240922
diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
@@ -61,7 +61,7 @@ class Af1(AnophelesDataResource):
     in a directory named "gcs_cache":
 
         >>> af1 = malariagen_data.Af1(
-        ...     "simplecache::gs://vo_afun_release",
+        ...     "simplecache::gs://vo_afun_release_master_us_central1",
         ...     simplecache=dict(cache_storage="gcs_cache"),
         ... )
 

diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
@@ -119,7 +119,7 @@ class Ag3(AnophelesDataResource):
     in a directory named "gcs_cache":
 
         >>> ag3 = malariagen_data.Ag3(
-        ...     "simplecache::gs://vo_agam_release",
+        ...     "simplecache::gs://vo_agam_release_master_us_central1",
         ...     simplecache=dict(cache_storage="gcs_cache"),
         ... )
 

diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
@@ -229,20 +229,28 @@ def validate_sample_selection_params(
 chunks: TypeAlias = Annotated[
     chunks_param_type,
     """
-    If 'auto' let dask decide chunk size. If 'native' use native zarr
-    chunks. If 'ndauto' let dask decide chunk size but only for arrays with
-    more than one dimension. If 'ndauto0' as 'ndauto' but only vary the first
-    chunk dimension. If 'ndauto1' as 'ndauto' but only vary the second chunk
-    dimension. If 'ndauto01' as 'ndauto' but only vary the first and second
-    chunk dimensions. Also, can be a target size, e.g., '200 MiB', or a tuple of
-    integers, or a callable which accepts the native chunks as a single argument
-    and returns a valid dask chunks value.
+    Define how input data being read from zarr should be divided into chunks
+    for a dask computation. If 'native', use underlying zarr chunks. If a string
+    specifying a target memory size, e.g., '300 MiB', resize chunks in arrays
+    with more than one dimension to match this size. If 'auto', let dask decide
+    chunk size. If 'ndauto', let dask decide chunk size but only for arrays with
+    more than one dimension. If 'ndauto0', as 'ndauto' but only vary the first
+    chunk dimension. If 'ndauto1', as 'ndauto' but only vary the second chunk
+    dimension. If 'ndauto01', as 'ndauto' but only vary the first and second
+    chunk dimensions. Also, can be a tuple of integers, or a callable which
+    accepts the native chunks as a single argument and returns a valid dask
+    chunks value.
     """,
 ]
 
-# The "ndauto0" value means auto-size chunks for arrays with more than one dimension,
-# allowing the first chunk dimension to be varied.
-chunks_default: chunks = "ndauto0"
+# Match the native zarr chunk sizes by default. N.B., some functions may
+# choose a different default, especially if they need to retrieve larger
+# amounts of data.
+native_chunks: chunks = "native"
+
+# Alternative default chunk size, suitable for functions which need to
+# scan a large amount of data.
+large_chunks: chunks = "300MiB"
 
 gff_attributes: TypeAlias = Annotated[
     Optional[Union[Sequence[str], str]],

diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
@@ -179,7 +179,7 @@ def cnv_hmm(
         sample_query: Optional[base_params.sample_query] = None,
         max_coverage_variance: cnv_params.max_coverage_variance = cnv_params.max_coverage_variance_default,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.native_chunks,
     ) -> xr.Dataset:
         debug = self._log.debug
 
@@ -381,7 +381,7 @@ def cnv_coverage_calls(
         sample_set: base_params.sample_set,
         analysis: cnv_params.coverage_calls_analysis,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.native_chunks,
     ) -> xr.Dataset:
         debug = self._log.debug
 
@@ -537,7 +537,7 @@ def cnv_discordant_read_calls(
         sample_sets: Optional[base_params.sample_sets] = None,
         sample_query: Optional[base_params.sample_query] = None,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.native_chunks,
     ) -> xr.Dataset:
         debug = self._log.debug
 

diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py
@@ -73,6 +73,8 @@ def plot_diplotype_clustering(
         color_discrete_map: plotly_params.color_discrete_map = None,
         category_orders: plotly_params.category_order = None,
         legend_sizing: plotly_params.legend_sizing = "constant",
+        chunks: base_params.chunks = base_params.native_chunks,
+        inline_array: base_params.inline_array = base_params.inline_array_default,
     ) -> Optional[dict]:
         import sys
 
@@ -98,6 +100,8 @@ def plot_diplotype_clustering(
             cohort_size=cohort_size,
             distance_metric=distance_metric,
             random_seed=random_seed,
+            chunks=chunks,
+            inline_array=inline_array,
         )
 
         # Align sample metadata with genotypes.
@@ -196,6 +200,8 @@ def diplotype_pairwise_distances(
         cohort_size: Optional[base_params.cohort_size] = None,
         distance_metric: dipclust_params.distance_metric = dipclust_params.distance_metric_default,
         random_seed: base_params.random_seed = 42,
+        chunks: base_params.chunks = base_params.native_chunks,
+        inline_array: base_params.inline_array = base_params.inline_array_default,
     ) -> Tuple[np.ndarray, np.ndarray, int]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
@@ -220,7 +226,9 @@ def diplotype_pairwise_distances(
             results = self.results_cache_get(name=name, params=params)
 
         except CacheMiss:
-            results = self._diplotype_pairwise_distances(**params)
+            results = self._diplotype_pairwise_distances(
+                chunks=chunks, inline_array=inline_array, **params
+            )
             self.results_cache_set(name=name, params=params, results=results)
 
         # Unpack results.
@@ -241,6 +249,8 @@ def _diplotype_pairwise_distances(
         cohort_size,
         distance_metric,
         random_seed,
+        chunks,
+        inline_array,
     ):
         if distance_metric == "cityblock":
             metric = multiallelic_diplotype_mean_cityblock
@@ -256,6 +266,8 @@ def _diplotype_pairwise_distances(
             site_class=site_class,
             cohort_size=cohort_size,
             random_seed=random_seed,
+            chunks=chunks,
+            inline_array=inline_array,
         )
 
         with self._dask_progress(desc="Load genotypes for distance calculation"):
@@ -302,6 +314,8 @@ def _dipclust_het_bar_trace(
         cohort_size: Optional[base_params.cohort_size],
         random_seed: base_params.random_seed,
         color_continuous_scale: Optional[plotly_params.color_continuous_scale],
+        chunks: base_params.chunks = base_params.native_chunks,
+        inline_array: base_params.inline_array = base_params.inline_array_default,
     ):
         ds_snps = self.snp_calls(
             region=region,
@@ -310,6 +324,8 @@ def _dipclust_het_bar_trace(
             cohort_size=cohort_size,
             site_mask=site_mask,
             random_seed=random_seed,
+            chunks=chunks,
+            inline_array=inline_array,
         )
 
         # Strictly speaking we are loading the genotypes for the second time here,
@@ -361,6 +377,8 @@ def _dipclust_cnv_bar_trace(
         sample_query: Optional[base_params.sample_query],
         max_coverage_variance: Optional[cnv_params.max_coverage_variance],
         colorscale: Optional[plotly_params.color_continuous_scale],
+        chunks: base_params.chunks = base_params.native_chunks,
+        inline_array: base_params.inline_array = base_params.inline_array_default,
     ):
         try:
             # TODO The gene_cnv() method still needs to get migrated to the
@@ -372,6 +390,8 @@ def _dipclust_cnv_bar_trace(
                 sample_sets=sample_sets,
                 sample_query=sample_query,
                 max_coverage_variance=max_coverage_variance,
+                chunks=chunks,
+                inline_array=inline_array,
             )
 
         except ValueError:
@@ -422,6 +442,8 @@ def _dipclust_snp_trace(
         dendro_sample_id_order: np.ndarray,
         snp_filter_min_maf: float,
         snp_colorscale: Optional[plotly_params.color_continuous_scale],
+        chunks: base_params.chunks = base_params.native_chunks,
+        inline_array: base_params.inline_array = base_params.inline_array_default,
     ):
         # load genotype allele counts at SNP variants for each sample
         df_snps = self.snp_genotype_allele_counts(
@@ -430,6 +452,8 @@ def _dipclust_snp_trace(
             sample_query=sample_query,
             sample_sets=sample_sets,
             site_mask=site_mask,
+            chunks=chunks,
+            inline_array=inline_array,
         )
         df_snps = df_snps.set_index("label")
 
@@ -557,6 +581,8 @@ def plot_diplotype_clustering_advanced(
         color_discrete_map: plotly_params.color_discrete_map = None,
         category_orders: plotly_params.category_order = None,
         legend_sizing: plotly_params.legend_sizing = "constant",
+        chunks: base_params.chunks = base_params.native_chunks,
+        inline_array: base_params.inline_array = base_params.inline_array_default,
     ):
         if cohort_size and snp_transcript:
             cohort_size = None
@@ -592,6 +618,8 @@ def plot_diplotype_clustering_advanced(
             category_orders=category_orders,
             legend_sizing=legend_sizing,
             random_seed=random_seed,
+            chunks=chunks,
+            inline_array=inline_array,
         )
 
         fig_dendro = res["figure"]
@@ -611,6 +639,8 @@ def plot_diplotype_clustering_advanced(
                 site_mask=site_mask,
                 color_continuous_scale=heterozygosity_colorscale,
                 random_seed=random_seed,
+                chunks=chunks,
+                inline_array=inline_array,
             )
             figures.append(het_trace)
             subplot_heights.append(heterozygosity_height)
@@ -623,6 +653,8 @@ def plot_diplotype_clustering_advanced(
                 sample_query=sample_query,
                 max_coverage_variance=cnv_max_coverage_variance,
                 colorscale=cnv_colorscale,
+                chunks=chunks,
+                inline_array=inline_array,
             )
             # N.B., sometimes no CNV data may be available, so check to
             # see if the trace is not None.
@@ -640,6 +672,8 @@ def plot_diplotype_clustering_advanced(
                 dendro_sample_id_order=dendro_sample_id_order,
                 snp_filter_min_maf=snp_filter_min_maf,
                 snp_colorscale=snp_colorscale,
+                chunks=chunks,
+                inline_array=inline_array,
             )
 
             if snp_trace:

diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py
@@ -115,7 +115,7 @@ def fst_gwss(
         ] = fst_params.max_cohort_size_default,
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.large_chunks,
         clip_min: fst_params.clip_min = 0.0,
     ) -> Tuple[np.ndarray, np.ndarray]:
         # Change this name if you ever change the behaviour of this function, to

diff --git a/malariagen_data/anoph/g123.py b/malariagen_data/anoph/g123.py
@@ -161,7 +161,7 @@ def g123_gwss(
         ] = g123_params.max_cohort_size_default,
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.large_chunks,
     ) -> Tuple[np.ndarray, np.ndarray]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
@@ -264,7 +264,7 @@ def g123_calibration(
         window_sizes: g123_params.window_sizes = g123_params.window_sizes_default,
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.large_chunks,
     ) -> Mapping[str, np.ndarray]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
@@ -323,7 +323,7 @@ def plot_g123_gwss_track(
         x_range: Optional[gplt_params.x_range] = None,
         output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.large_chunks,
     ) -> gplt_params.figure:
         # compute G123
         x, g123 = self.g123_gwss(
@@ -424,7 +424,7 @@ def plot_g123_gwss(
         show: gplt_params.show = True,
         output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.large_chunks,
     ) -> gplt_params.figure:
         # gwss track
         fig1 = self.plot_g123_gwss_track(
@@ -497,7 +497,7 @@ def plot_g123_calibration(
         title: Optional[gplt_params.title] = None,
         show: gplt_params.show = True,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.large_chunks,
     ) -> gplt_params.figure:
         # get g123 values
         calibration_runs = self.g123_calibration(

diff --git a/malariagen_data/anoph/genome_sequence.py b/malariagen_data/anoph/genome_sequence.py
@@ -109,7 +109,7 @@ def genome_sequence(
         self,
         region: base_params.region,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-        chunks: base_params.chunks = base_params.chunks_default,
+        chunks: base_params.chunks = base_params.native_chunks,
     ) -> da.Array:
         # Parse the region parameter into a Region object.
         resolved_region: Region = parse_single_region(self, region)