From 4a47a79b70c8e4f33e4ba96f1354cb24d071d256 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:31:39 +0100 Subject: [PATCH 01/12] Add defaults for min_minor_ac, max_missing_an --- malariagen_data/anoph/base_params.py | 4 ++++ malariagen_data/anoph/pca.py | 8 ++++++-- malariagen_data/anoph/snp_data.py | 16 ++++++++++++---- malariagen_data/anopheles.py | 16 ++++++++++++---- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py index 712cc4dcc..28044cea5 100644 --- a/malariagen_data/anoph/base_params.py +++ b/malariagen_data/anoph/base_params.py @@ -270,6 +270,8 @@ def validate_sample_selection_params( """, ] +min_minor_ac_default: min_minor_ac = 2 + max_missing_an: TypeAlias = Annotated[ int, """ @@ -279,6 +281,8 @@ def validate_sample_selection_params( """, ] +max_missing_an_default: max_missing_an = 0 + snp_query: TypeAlias = Annotated[ str, """ diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py index 6d819ba4e..15df8cd12 100644 --- a/malariagen_data/anoph/pca.py +++ b/malariagen_data/anoph/pca.py @@ -62,8 +62,12 @@ def pca( sample_indices: Optional[base_params.sample_indices] = None, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, site_class: Optional[base_params.site_class] = None, - min_minor_ac: Optional[base_params.min_minor_ac] = None, - max_missing_an: Optional[base_params.max_missing_an] = None, + min_minor_ac: Optional[ + base_params.min_minor_ac + ] = base_params.min_minor_ac_default, + max_missing_an: Optional[ + base_params.max_missing_an + ] = base_params.max_missing_an_default, cohort_size: Optional[base_params.cohort_size] = None, min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py index 8e16c3552..c3dbac725 100644 --- 
a/malariagen_data/anoph/snp_data.py +++ b/malariagen_data/anoph/snp_data.py @@ -1565,8 +1565,12 @@ def biallelic_snp_calls( min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, random_seed: base_params.random_seed = 42, - min_minor_ac: Optional[base_params.min_minor_ac] = None, - max_missing_an: Optional[base_params.max_missing_an] = None, + min_minor_ac: Optional[ + base_params.min_minor_ac + ] = base_params.min_minor_ac_default, + max_missing_an: Optional[ + base_params.max_missing_an + ] = base_params.max_missing_an_default, n_snps: Optional[base_params.n_snps] = None, thin_offset: base_params.thin_offset = 0, ) -> xr.Dataset: @@ -1709,8 +1713,12 @@ def biallelic_diplotypes( min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, random_seed: base_params.random_seed = 42, - min_minor_ac: Optional[base_params.min_minor_ac] = None, - max_missing_an: Optional[base_params.max_missing_an] = None, + min_minor_ac: Optional[ + base_params.min_minor_ac + ] = base_params.min_minor_ac_default, + max_missing_an: Optional[ + base_params.max_missing_an + ] = base_params.max_missing_an_default, n_snps: Optional[base_params.n_snps] = None, thin_offset: base_params.thin_offset = 0, inline_array: base_params.inline_array = base_params.inline_array_default, diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 615215858..eb84403dd 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -2978,8 +2978,12 @@ def biallelic_diplotype_pairwise_distances( sample_indices: Optional[base_params.sample_indices] = None, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, site_class: Optional[base_params.site_class] = None, - min_minor_ac: Optional[base_params.min_minor_ac] = None, - max_missing_an: Optional[base_params.max_missing_an] = None, + min_minor_ac: Optional[ + 
base_params.min_minor_ac + ] = base_params.min_minor_ac_default, + max_missing_an: Optional[ + base_params.max_missing_an + ] = base_params.max_missing_an_default, cohort_size: Optional[base_params.cohort_size] = None, min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, @@ -3158,8 +3162,12 @@ def plot_njt( sample_indices: Optional[base_params.sample_indices] = None, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, site_class: Optional[base_params.site_class] = None, - min_minor_ac: Optional[base_params.min_minor_ac] = None, - max_missing_an: Optional[base_params.max_missing_an] = None, + min_minor_ac: Optional[ + base_params.min_minor_ac + ] = base_params.min_minor_ac_default, + max_missing_an: Optional[ + base_params.max_missing_an + ] = base_params.max_missing_an_default, cohort_size: Optional[base_params.cohort_size] = None, min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, From 1108fd59f965e0d75b2cf93c384ecefa3f8b801e Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:05:43 +0100 Subject: [PATCH 02/12] Use all SNPs in test_pca_plotting() when more samples than SNPs --- tests/anoph/test_pca.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py index 554644f98..563df3184 100644 --- a/tests/anoph/test_pca.py +++ b/tests/anoph/test_pca.py @@ -88,9 +88,14 @@ def test_pca_plotting(fixture, api: AnophelesPca): # PCA parameters. n_samples = ds.sizes["samples"] n_snps_available = ds.sizes["variants"] - n_snps = random.randint(n_samples, n_snps_available) n_components = random.randint(3, n_samples) + # If there are more samples than SNPs available, use all SNPs. 
+ if n_samples > n_snps_available: + n_snps = n_snps_available + else: + n_snps = random.randint(n_samples, n_snps_available) + # Run the PCA. pca_df, pca_evr = api.pca( n_snps=n_snps, From 94ee025a143800c5c6ed2a96367948df942ca867 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:36:02 +0100 Subject: [PATCH 03/12] Remove min_minor_ac_default, max_missing_an_default from base_params.py --- malariagen_data/anoph/base_params.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py index 28044cea5..712cc4dcc 100644 --- a/malariagen_data/anoph/base_params.py +++ b/malariagen_data/anoph/base_params.py @@ -270,8 +270,6 @@ def validate_sample_selection_params( """, ] -min_minor_ac_default: min_minor_ac = 2 - max_missing_an: TypeAlias = Annotated[ int, """ @@ -281,8 +279,6 @@ def validate_sample_selection_params( """, ] -max_missing_an_default: max_missing_an = 0 - snp_query: TypeAlias = Annotated[ str, """ From 71fa3b9989c923143fa747252aedf7dc98a3612c Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:38:00 +0100 Subject: [PATCH 04/12] Add min_minor_ac_default, max_missing_an_default to pca(). Bump cache name. 
--- malariagen_data/anoph/pca.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py index 15df8cd12..a5a28adeb 100644 --- a/malariagen_data/anoph/pca.py +++ b/malariagen_data/anoph/pca.py @@ -64,10 +64,10 @@ def pca( site_class: Optional[base_params.site_class] = None, min_minor_ac: Optional[ base_params.min_minor_ac - ] = base_params.min_minor_ac_default, + ] = pca_params.min_minor_ac_default, max_missing_an: Optional[ base_params.max_missing_an - ] = base_params.max_missing_an_default, + ] = pca_params.max_missing_an_default, cohort_size: Optional[base_params.cohort_size] = None, min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, @@ -77,7 +77,7 @@ def pca( ) -> Tuple[pca_params.df_pca, pca_params.evr]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. - name = "pca_v2" + name = "pca_v3" # Normalize params for consistent hash value. ( From 7f5fe5f479d7d3d69f8d5351fa92d503c50e9d68 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:39:14 +0100 Subject: [PATCH 05/12] Add min_minor_ac_default, max_missing_an_default to pca_params.py --- malariagen_data/anoph/pca_params.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/malariagen_data/anoph/pca_params.py b/malariagen_data/anoph/pca_params.py index e74a243af..ef959509d 100644 --- a/malariagen_data/anoph/pca_params.py +++ b/malariagen_data/anoph/pca_params.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from typing_extensions import Annotated, TypeAlias +from . 
import base_params n_components: TypeAlias = Annotated[ int, @@ -23,3 +24,7 @@ np.ndarray, "An array of explained variance ratios, one per component.", ] + +min_minor_ac_default: base_params.min_minor_ac = 2 + +max_missing_an_default: base_params.max_missing_an = 0 From 1808a7c7f5df04affb3618d6df4c5fb6772a8f5a Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:40:35 +0100 Subject: [PATCH 06/12] Remove min_minor_ac_default, max_missing_an_default from AnophelesSnpData --- malariagen_data/anoph/snp_data.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py index c3dbac725..8e16c3552 100644 --- a/malariagen_data/anoph/snp_data.py +++ b/malariagen_data/anoph/snp_data.py @@ -1565,12 +1565,8 @@ def biallelic_snp_calls( min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, random_seed: base_params.random_seed = 42, - min_minor_ac: Optional[ - base_params.min_minor_ac - ] = base_params.min_minor_ac_default, - max_missing_an: Optional[ - base_params.max_missing_an - ] = base_params.max_missing_an_default, + min_minor_ac: Optional[base_params.min_minor_ac] = None, + max_missing_an: Optional[base_params.max_missing_an] = None, n_snps: Optional[base_params.n_snps] = None, thin_offset: base_params.thin_offset = 0, ) -> xr.Dataset: @@ -1713,12 +1709,8 @@ def biallelic_diplotypes( min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, random_seed: base_params.random_seed = 42, - min_minor_ac: Optional[ - base_params.min_minor_ac - ] = base_params.min_minor_ac_default, - max_missing_an: Optional[ - base_params.max_missing_an - ] = base_params.max_missing_an_default, + min_minor_ac: Optional[base_params.min_minor_ac] = None, + max_missing_an: Optional[base_params.max_missing_an] = None, 
n_snps: Optional[base_params.n_snps] = None, thin_offset: base_params.thin_offset = 0, inline_array: base_params.inline_array = base_params.inline_array_default, From d2ae0776d5004adb88a1a584954c6c703eac1abc Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:43:09 +0100 Subject: [PATCH 07/12] Remove min_minor_ac_default, max_missing_an_default from biallelic_diplotype_pairwise_distances. Use pca_params.max_missing_an_default for plot_njt(). --- malariagen_data/anopheles.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index eb84403dd..324cc395e 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -35,6 +35,7 @@ hapnet_params, het_params, ihs_params, + pca_params, plotly_params, xpehh_params, ) @@ -2978,12 +2979,8 @@ def biallelic_diplotype_pairwise_distances( sample_indices: Optional[base_params.sample_indices] = None, site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, site_class: Optional[base_params.site_class] = None, - min_minor_ac: Optional[ - base_params.min_minor_ac - ] = base_params.min_minor_ac_default, - max_missing_an: Optional[ - base_params.max_missing_an - ] = base_params.max_missing_an_default, + min_minor_ac: Optional[base_params.min_minor_ac] = None, + max_missing_an: Optional[base_params.max_missing_an] = None, cohort_size: Optional[base_params.cohort_size] = None, min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, @@ -3164,10 +3161,10 @@ def plot_njt( site_class: Optional[base_params.site_class] = None, min_minor_ac: Optional[ base_params.min_minor_ac - ] = base_params.min_minor_ac_default, + ] = pca_params.min_minor_ac_default, max_missing_an: Optional[ base_params.max_missing_an - ] = base_params.max_missing_an_default, + ] = pca_params.max_missing_an_default, cohort_size: 
Optional[base_params.cohort_size] = None, min_cohort_size: Optional[base_params.min_cohort_size] = None, max_cohort_size: Optional[base_params.max_cohort_size] = None, From b109bcc73a889fade0a743c5535a69f1387ed13b Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:46:37 +0100 Subject: [PATCH 08/12] Simplify random n_snps, n_components in test_pca_plotting() --- tests/anoph/test_pca.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py index 563df3184..eb30df6d6 100644 --- a/tests/anoph/test_pca.py +++ b/tests/anoph/test_pca.py @@ -88,13 +88,8 @@ def test_pca_plotting(fixture, api: AnophelesPca): # PCA parameters. n_samples = ds.sizes["samples"] n_snps_available = ds.sizes["variants"] - n_components = random.randint(3, n_samples) - - # If there are more samples than SNPs available, use all SNPs. - if n_samples > n_snps_available: - n_snps = n_snps_available - else: - n_snps = random.randint(n_samples, n_snps_available) + n_snps = random.randint(1, n_snps_available) + n_components = random.randint(2, min(n_samples, n_snps)) # Run the PCA. 
pca_df, pca_evr = api.pca( From a3c03e275e1ff258ea17736c4de75c322cfe64f6 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:51:38 +0100 Subject: [PATCH 09/12] debug WIP: set min_minor_ac, max_missing_an to 0 for test_pca_plotting() --- tests/anoph/test_pca.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py index eb30df6d6..235c065c7 100644 --- a/tests/anoph/test_pca.py +++ b/tests/anoph/test_pca.py @@ -95,6 +95,8 @@ def test_pca_plotting(fixture, api: AnophelesPca): pca_df, pca_evr = api.pca( n_snps=n_snps, n_components=n_components, + min_minor_ac=0, # FIXME + max_missing_an=0, # FIXME **data_params, ) From fa831ccae114a7bfcc2e51f8bdec836345165b54 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:17:59 +0100 Subject: [PATCH 10/12] Tighten logic to apply conditions when either max_missing_an or min_minor_ac is not None, rather than truthy, to include falsy 0 --- malariagen_data/anoph/snp_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py index 8e16c3552..7b404f3a8 100644 --- a/malariagen_data/anoph/snp_data.py +++ b/malariagen_data/anoph/snp_data.py @@ -1655,7 +1655,7 @@ def biallelic_snp_calls( ds_out = xr.Dataset(coords=coords, data_vars=data_vars, attrs=ds.attrs) # Apply conditions. - if max_missing_an or min_minor_ac: + if max_missing_an is not None or min_minor_ac is not None: loc_out = np.ones(ds_out.sizes["variants"], dtype=bool) # Apply missingness condition. 
From e32e3653d5cc449b83e33f0d4369c8852abb33e4 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:20:44 +0100 Subject: [PATCH 11/12] Get biallelic SNPs according to PCA defaults during test_pca_plotting() --- tests/anoph/test_pca.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py index 235c065c7..66fc24755 100644 --- a/tests/anoph/test_pca.py +++ b/tests/anoph/test_pca.py @@ -9,6 +9,7 @@ from malariagen_data import af1 as _af1 from malariagen_data import ag3 as _ag3 from malariagen_data.anoph.pca import AnophelesPca +from malariagen_data.anoph import pca_params @pytest.fixture @@ -83,7 +84,11 @@ def test_pca_plotting(fixture, api: AnophelesPca): sample_sets=random.sample(all_sample_sets, 2), site_mask=random.choice((None,) + api.site_mask_ids), ) - ds = api.biallelic_snp_calls(**data_params) + ds = api.biallelic_snp_calls( + min_minor_ac=pca_params.min_minor_ac_default, + max_missing_an=pca_params.max_missing_an_default, + **data_params, + ) # PCA parameters. 
n_samples = ds.sizes["samples"] @@ -95,8 +100,6 @@ def test_pca_plotting(fixture, api: AnophelesPca): pca_df, pca_evr = api.pca( n_snps=n_snps, n_components=n_components, - min_minor_ac=0, # FIXME - max_missing_an=0, # FIXME **data_params, ) From a85afea5e3d1033302fc72727e66b1e3ce8fc353 Mon Sep 17 00:00:00 2001 From: _ <4256466+leehart@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:40:36 +0100 Subject: [PATCH 12/12] Use minimum of 3 components for test_pca_plotting for plot_pca_coords_3d() --- tests/anoph/test_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py index 66fc24755..8d452fcce 100644 --- a/tests/anoph/test_pca.py +++ b/tests/anoph/test_pca.py @@ -94,7 +94,8 @@ def test_pca_plotting(fixture, api: AnophelesPca): n_samples = ds.sizes["samples"] n_snps_available = ds.sizes["variants"] n_snps = random.randint(1, n_snps_available) - n_components = random.randint(2, min(n_samples, n_snps)) + # PC3 required for plot_pca_coords_3d() + n_components = random.randint(3, min(n_samples, n_snps)) # Run the PCA. pca_df, pca_evr = api.pca(