Not sure what is going on

malariagen · Aug 2, 2024 · commit ab45528
2 parents: df13732 + 8c9389d
Show file tree

Hide file tree

Showing 11 changed files with 60 additions and 23 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -53,9 +53,9 @@ For accessing data in Google Cloud Storage (GCS) you will also need to authentic
 If you are using ``malariagen_data`` from within Google Colab, authentication will be automatically
 initiated, please allow access when requested.
 
-If you are using ``malariagen_data`` from any location other than Google Colab, you will need to [set up application
-default credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc). Generally
-the best way to do this will be to [install the Google Cloud CLI](https://cloud.google.com/sdk/docs/install)
+If you are using ``malariagen_data`` from any location other than Google Colab, you will need to `set up application
+default credentials <https://cloud.google.com/docs/authentication/provide-credentials-adc>`_. Generally
+the best way to do this will be to `install the Google Cloud CLI <https://cloud.google.com/sdk/docs/install>`_
 and then run the following command::
 
    gcloud auth application-default login

diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py
@@ -595,7 +595,7 @@ def plot_diplotype_clustering_advanced(
         )
 
         fig_dendro = res["figure"]
-        n_snps = res["n_snps"]
+        n_snps_cluster = res["n_snps"]
         dendro_sample_id_order = res["dendro_sample_id_order"]
 
         figures = [fig_dendro]
@@ -631,7 +631,7 @@ def plot_diplotype_clustering_advanced(
                 subplot_heights.append(cnv_row_height * n_cnv_genes)
 
         if snp_transcript:
-            snp_trace, n_snps = self._dipclust_snp_trace(
+            snp_trace, n_snps_transcript = self._dipclust_snp_trace(
                 transcript=snp_transcript,
                 sample_sets=sample_sets,
                 sample_query=sample_query,
@@ -644,7 +644,7 @@ def plot_diplotype_clustering_advanced(
 
             if snp_trace:
                 figures.append(snp_trace)
-                subplot_heights.append(snp_row_height * n_snps)
+                subplot_heights.append(snp_row_height * n_snps_transcript)
             else:
                 print(
                     f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot."
@@ -661,7 +661,7 @@ def plot_diplotype_clustering_advanced(
             sample_sets=sample_sets,
             sample_query=sample_query,
             region=region,
-            n_snps=n_snps,
+            n_snps=n_snps_cluster,
         )
 
         fig["layout"]["yaxis"]["title"] = f"Distance ({distance_metric})"

diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py
@@ -480,7 +480,7 @@ def pairwise_average_fst(
     def plot_pairwise_average_fst(
         self,
         fst_df: fst_params.df_pairwise_fst,
-        annotate_se: bool = False,
+        annotation: fst_params.annotation = None,
         zmin: Optional[plotly_params.zmin] = 0.0,
         zmax: Optional[plotly_params.zmax] = None,
         text_auto: plotly_params.text_auto = ".3f",
@@ -500,9 +500,13 @@ def plot_pairwise_average_fst(
             index = fst_df.iloc[index_key]["cohort1"]
             col = fst_df.iloc[index_key]["cohort2"]
             fst = fst_df.iloc[index_key]["fst"]
-            if annotate_se is True:
+            fig_df[index][col] = fst
+            if annotation == "standard error":
                 se = fst_df.iloc[index_key]["se"]
-                fig_df.loc[index, col] = se
+                fig_df[col][index] = se
+            elif annotation == "Z score":
+                zs = fst_df.iloc[index_key]["fst"] / fst_df.iloc[index_key]["se"]
+                fig_df[col][index] = zs
             else:
                 fig_df.loc[index, col] = fst
 

diff --git a/malariagen_data/anoph/fst_params.py b/malariagen_data/anoph/fst_params.py
@@ -1,6 +1,6 @@
 """Parameter definitions for Fst functions."""
 
-from typing import Optional
+from typing import Optional, Literal
 
 import pandas as pd
 from typing_extensions import Annotated, TypeAlias
@@ -22,3 +22,12 @@
     A dataframe of pairwise Fst and standard error values.
     """,
 ]
+
+annotation: TypeAlias = Annotated[
+    Optional[Literal["standard error", "Z score"]],
+    """
+    How to annotate the upper-right corner of the plot. The default (None) shows Fst. Other options are
+    'standard error', which shows the standard error of Fst, and 'Z score', which shows Fst divided by
+    its standard error, i.e., the Z score for the null hypothesis that the two cohorts are the same.
+    """,
+]
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
@@ -62,8 +62,12 @@ def pca(
         sample_indices: Optional[base_params.sample_indices] = None,
         site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
         site_class: Optional[base_params.site_class] = None,
-        min_minor_ac: Optional[base_params.min_minor_ac] = None,
-        max_missing_an: Optional[base_params.max_missing_an] = None,
+        min_minor_ac: Optional[
+            base_params.min_minor_ac
+        ] = pca_params.min_minor_ac_default,
+        max_missing_an: Optional[
+            base_params.max_missing_an
+        ] = pca_params.max_missing_an_default,
         cohort_size: Optional[base_params.cohort_size] = None,
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
@@ -73,7 +77,7 @@ def pca(
     ) -> Tuple[pca_params.df_pca, pca_params.evr]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
-        name = "pca_v2"
+        name = "pca_v3"
 
         # Normalize params for consistent hash value.
         (

diff --git a/malariagen_data/anoph/pca_params.py b/malariagen_data/anoph/pca_params.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 from typing_extensions import Annotated, TypeAlias
+from . import base_params
 
 n_components: TypeAlias = Annotated[
     int,
@@ -23,3 +24,7 @@
     np.ndarray,
     "An array of explained variance ratios, one per component.",
 ]
+
+min_minor_ac_default: base_params.min_minor_ac = 2
+
+max_missing_an_default: base_params.max_missing_an = 0
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
@@ -1655,7 +1655,7 @@ def biallelic_snp_calls(
             ds_out = xr.Dataset(coords=coords, data_vars=data_vars, attrs=ds.attrs)
 
             # Apply conditions.
-            if max_missing_an or min_minor_ac:
+            if max_missing_an is not None or min_minor_ac is not None:
                 loc_out = np.ones(ds_out.sizes["variants"], dtype=bool)
 
                 # Apply missingness condition.

diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -35,6 +35,7 @@
     hapnet_params,
     het_params,
     ihs_params,
+    pca_params,
     plotly_params,
     xpehh_params,
 )
@@ -3158,8 +3159,12 @@ def plot_njt(
         sample_indices: Optional[base_params.sample_indices] = None,
         site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
         site_class: Optional[base_params.site_class] = None,
-        min_minor_ac: Optional[base_params.min_minor_ac] = None,
-        max_missing_an: Optional[base_params.max_missing_an] = None,
+        min_minor_ac: Optional[
+            base_params.min_minor_ac
+        ] = pca_params.min_minor_ac_default,
+        max_missing_an: Optional[
+            base_params.max_missing_an
+        ] = pca_params.max_missing_an_default,
         cohort_size: Optional[base_params.cohort_size] = None,
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,

diff --git a/notebooks/plot_pairwise_average_fst.ipynb b/notebooks/plot_pairwise_average_fst.ipynb
@@ -94,7 +94,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotate_se=True)"
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotation=\"standard error\")"
    ]
   },
   {
@@ -104,7 +104,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotate_se=False)"
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotation=\"Z score\")"
    ]
   },
   {

diff --git a/tests/anoph/test_fst.py b/tests/anoph/test_fst.py
@@ -204,7 +204,11 @@ def check_pairwise_average_fst(api: AnophelesFstAnalysis, fst_params):
     if len(fst_df) > 0:
         fig = api.plot_pairwise_average_fst(fst_df, show=False)
         assert isinstance(fig, go.Figure)
-        fig = api.plot_pairwise_average_fst(fst_df, annotate_se=True, show=False)
+        fig = api.plot_pairwise_average_fst(
+            fst_df, annotation="standard error", show=False
+        )
+        assert isinstance(fig, go.Figure)
+        fig = api.plot_pairwise_average_fst(fst_df, annotation="Z score", show=False)
         assert isinstance(fig, go.Figure)
 
 

diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py
@@ -9,6 +9,7 @@
 from malariagen_data import af1 as _af1
 from malariagen_data import ag3 as _ag3
 from malariagen_data.anoph.pca import AnophelesPca
+from malariagen_data.anoph import pca_params
 
 
 @pytest.fixture
@@ -83,13 +84,18 @@ def test_pca_plotting(fixture, api: AnophelesPca):
         sample_sets=random.sample(all_sample_sets, 2),
         site_mask=random.choice((None,) + api.site_mask_ids),
     )
-    ds = api.biallelic_snp_calls(**data_params)
+    ds = api.biallelic_snp_calls(
+        min_minor_ac=pca_params.min_minor_ac_default,
+        max_missing_an=pca_params.max_missing_an_default,
+        **data_params,
+    )
 
     # PCA parameters.
     n_samples = ds.sizes["samples"]
     n_snps_available = ds.sizes["variants"]
-    n_snps = random.randint(n_samples, n_snps_available)
-    n_components = random.randint(3, n_samples)
+    n_snps = random.randint(1, n_snps_available)
+    # PC3 required for plot_pca_coords_3d()
+    n_components = random.randint(3, min(n_samples, n_snps))
 
     # Run the PCA.
     pca_df, pca_evr = api.pca(