Skip to content

Commit

Permalink
Merge branch '536-z-score-fst' of github.com:malariagen/malariagen-data-python into 536-z-score-fst
Browse files Browse the repository at this point in the history
  • Loading branch information
jonbrenas committed Jun 20, 2024
2 parents 1ebea03 + e1fd423 commit b0822d1
Show file tree
Hide file tree
Showing 58 changed files with 3,697 additions and 1,339 deletions.
8 changes: 8 additions & 0 deletions docs/source/Af1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,14 @@ Haplotype clustering and network analysis
plot_haplotype_network
haplotype_pairwise_distances

Diplotype clustering
--------------------
.. autosummary::
:toctree: generated/

plot_diplotype_clustering
plot_diplotype_clustering_advanced

Fst analysis
------------
.. autosummary::
Expand Down
8 changes: 8 additions & 0 deletions docs/source/Ag3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ Haplotype clustering and network analysis
plot_haplotype_network
haplotype_pairwise_distances

Diplotype clustering
--------------------
.. autosummary::
:toctree: generated/

plot_diplotype_clustering
plot_diplotype_clustering_advanced

Fst analysis
------------
.. autosummary::
Expand Down
17 changes: 10 additions & 7 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
MAJOR_VERSION_NUMBER = 1
MAJOR_VERSION_PATH = "v1.0"
CONFIG_PATH = "v1.0-config.json"
GCS_URL = "gs://vo_afun_release/"
GCS_DEFAULT_URL = "gs://vo_afun_release/"
GCS_REGION_URLS = {
"us-central1": "gs://vo_afun_release_master_us_central1",
}
XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1"
IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"

Expand All @@ -23,10 +26,9 @@ class Af1(AnophelesDataResource):
Parameters
----------
url : str
Base path to data. Give "gs://vo_afun_release/" to use Google Cloud
Storage, or a local path on your file system if data have been
downloaded.
url : str, optional
Base path to data. Defaults to use Google Cloud Storage, or can
be a local path on your file system if data have been downloaded.
site_filters_analysis : str, optional
Site filters analysis version.
bokeh_output_notebook : bool, optional
Expand Down Expand Up @@ -75,7 +77,7 @@ class Af1(AnophelesDataResource):

def __init__(
self,
url=GCS_URL,
url=None,
bokeh_output_notebook=True,
results_cache=None,
log=sys.stdout,
Expand Down Expand Up @@ -109,7 +111,8 @@ def __init__(
show_progress=show_progress,
check_location=check_location,
pre=pre,
gcs_url=GCS_URL,
gcs_default_url=GCS_DEFAULT_URL,
gcs_region_urls=GCS_REGION_URLS,
major_version_number=MAJOR_VERSION_NUMBER,
major_version_path=MAJOR_VERSION_PATH,
gff_gene_type="protein_coding_gene",
Expand Down
19 changes: 11 additions & 8 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys

import dask
import pandas as pd
import pandas as pd # type: ignore
import plotly.express as px # type: ignore

import malariagen_data
Expand All @@ -13,7 +13,10 @@
MAJOR_VERSION_NUMBER = 3
MAJOR_VERSION_PATH = "v3"
CONFIG_PATH = "v3-config.json"
GCS_URL = "gs://vo_agam_release/"
GCS_DEFAULT_URL = "gs://vo_agam_release/"
GCS_REGION_URLS = {
"us-central1": "gs://vo_agam_release_master_us_central1",
}
XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
VIRTUAL_CONTIGS = {
Expand Down Expand Up @@ -77,10 +80,9 @@ class Ag3(AnophelesDataResource):
Parameters
----------
url : str
Base path to data. Give "gs://vo_agam_release/" to use Google Cloud
Storage, or a local path on your file system if data have been
downloaded.
url : str, optional
Base path to data. Defaults to use Google Cloud Storage, or can
be a local path on your file system if data have been downloaded.
cohorts_analysis : str, optional
Cohort analysis version.
aim_analysis : str, optional
Expand Down Expand Up @@ -133,7 +135,7 @@ class Ag3(AnophelesDataResource):

def __init__(
self,
url=GCS_URL,
url=None,
bokeh_output_notebook=True,
results_cache=None,
log=sys.stdout,
Expand Down Expand Up @@ -175,7 +177,8 @@ def __init__(
show_progress=show_progress,
check_location=check_location,
pre=pre,
gcs_url=GCS_URL,
gcs_default_url=GCS_DEFAULT_URL,
gcs_region_urls=GCS_REGION_URLS,
major_version_number=MAJOR_VERSION_NUMBER,
major_version_path=MAJOR_VERSION_PATH,
gff_gene_type="gene",
Expand Down
63 changes: 45 additions & 18 deletions malariagen_data/anoph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,21 @@
)
from textwrap import dedent
import bokeh.io
import ipinfo # type: ignore
import numpy as np
import pandas as pd
import pandas as pd # type: ignore
import zarr # type: ignore
from numpydoc_decorator import doc # type: ignore
from tqdm.auto import tqdm as tqdm_auto
from tqdm.dask import TqdmCallback
from tqdm.auto import tqdm as tqdm_auto # type: ignore
from tqdm.dask import TqdmCallback # type: ignore
from yaspin import yaspin # type: ignore

from ..util import (
CacheMiss,
LoggingHelper,
check_colab_location,
check_types,
get_gcp_region,
hash_params,
init_filesystem,
)
Expand All @@ -42,9 +44,10 @@ def __init__(
url: str,
config_path: str,
pre: bool,
gcs_url: Optional[str], # only used for colab location check
major_version_number: int,
major_version_path: str,
gcs_default_url: Optional[str] = None,
gcs_region_urls: Mapping[str, str] = {},
bokeh_output_notebook: bool = False,
log: Optional[Union[str, IO]] = None,
debug: bool = False,
Expand All @@ -54,10 +57,10 @@ def __init__(
results_cache: Optional[str] = None,
tqdm_class=None,
):
self._url = url
self._config_path = config_path
self._pre = pre
self._gcs_url = gcs_url
self._gcs_default_url = gcs_default_url
self._gcs_region_urls = gcs_region_urls
self._major_version_number = major_version_number
self._major_version_path = major_version_path
self._debug = debug
Expand All @@ -69,6 +72,34 @@ def __init__(
# Set up logging.
self._log = LoggingHelper(name=__name__, out=log, debug=debug)

# Check client location.
self._client_details = None
if check_location:
try:
self._client_details = ipinfo.getHandler().getDetails()
except OSError:
pass

# Determine cloud location details.
self._gcp_region = get_gcp_region(self._client_details)

# Check colab location.
check_colab_location(self._gcp_region)

# Determine storage URL.
if url:
# User has explicitly provided a URL to use.
self._url = url
elif self._gcp_region in self._gcs_region_urls:
# Choose URL in the same GCP region.
self._url = self._gcs_region_urls[self._gcp_region]
elif self._gcs_default_url:
# Fall back to default URL if available.
self._url = self._gcs_default_url
else:
raise ValueError("A value for the `url` parameter must be provided.")
del url

# Set up fsspec filesystem. N.B., we use fsspec here to allow for
# accessing different types of storage - fsspec will automatically
# detect which type of storage to use based on the URL provided.
Expand All @@ -77,7 +108,7 @@ def __init__(
if storage_options is None:
storage_options = dict()
try:
self._fs, self._base_path = init_filesystem(url, **storage_options)
self._fs, self._base_path = init_filesystem(self._url, **storage_options)
except Exception as exc: # pragma: no cover
raise IOError(
"An error occurred establishing a connection to the storage system. Please see the nested exception for more details."
Expand All @@ -88,7 +119,9 @@ def __init__(
with self.open_file(self._config_path) as f:
self._config = json.load(f)
except Exception as exc: # pragma: no cover
if isinstance(exc, OSError) and "forbidden" in str(exc):
if (isinstance(exc, OSError) and "forbidden" in str(exc).lower()) or (
getattr(exc, "status", None) == 403
):
# This seems to be the best way to detect the case where the
# current user is trying to access GCS but has not been granted
# permissions. Reraise with a helpful message.
Expand All @@ -100,7 +133,7 @@ def __init__(
at the following link: https://forms.gle/d1NV3aL3EoVQGSHYA
If you are still experiencing problems accessing data, please email
data@malariagen.net for support.
support@malariagen.net for assistance.
"""
)
) from exc
Expand All @@ -113,14 +146,6 @@ def __init__(
if bokeh_output_notebook: # pragma: no cover
bokeh.io.output_notebook(hide_banner=True)

# Check colab location is in the US.
if check_location and self._gcs_url is not None: # pragma: no cover
self._client_details = check_colab_location(
gcs_url=self._gcs_url, url=self._url
)
else:
self._client_details = None

# Set up cache attributes.
self._cache_releases: Optional[Tuple[str, ...]] = None
self._cache_sample_sets: Dict[str, pd.DataFrame] = dict()
Expand Down Expand Up @@ -305,8 +330,10 @@ def client_location(self) -> str:
details = self._client_details
if details is not None:
region = details.region
country = details.country
country = details.country_name
location = f"{region}, {country}"
if self._gcp_region:
location += f" (Google Cloud {self._gcp_region})"
else:
location = "unknown"
return location
Expand Down
8 changes: 8 additions & 0 deletions malariagen_data/anoph/base_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,3 +278,11 @@ def validate_sample_selection_params(
missing calls.
""",
]

snp_query: TypeAlias = Annotated[
str,
"""
A pandas query string to be evaluated against the SNP data,
to select SNPs to be included
""",
]
18 changes: 18 additions & 0 deletions malariagen_data/anoph/clustering_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Parameters for hierarchical clustering functions."""

from typing import Literal

from typing_extensions import Annotated, TypeAlias

linkage_method: TypeAlias = Annotated[
Literal["single", "complete", "average", "weighted", "centroid", "median", "ward"],
"""
The linkage algorithm to use. See the Linkage Methods section of the
scipy.cluster.hierarchy.linkage docs for full descriptions.
""",
]

leaf_y: TypeAlias = Annotated[
int,
"Y coordinate at which to plot the leaf markers.",
]
27 changes: 25 additions & 2 deletions malariagen_data/anoph/cnv_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ def cnv_discordant_read_calls(
self,
contig: base_params.contigs,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.chunks_default,
) -> xr.Dataset:
Expand Down Expand Up @@ -542,6 +543,24 @@ def cnv_discordant_read_calls(

ds = simple_xarray_concat(lx, dim=DIM_VARIANT)

debug("handle sample query")
if sample_query is not None:
debug("load sample metadata")
df_samples = self.sample_metadata(sample_sets=sample_sets)

debug("align sample metadata with CNV data")
cnv_samples = ds["sample_id"].values.tolist()
df_samples_cnv = (
df_samples.set_index("sample_id").loc[cnv_samples].reset_index()
)

debug("apply the query")
loc_query_samples = df_samples_cnv.eval(sample_query).values
if np.count_nonzero(loc_query_samples) == 0:
raise ValueError(f"No samples found for query {sample_query!r}")

ds = ds.isel(samples=loc_query_samples)

return ds

@check_types
Expand Down Expand Up @@ -641,8 +660,12 @@ def plot_cnv_hmm_coverage_track(
circle_kwargs_mutable["legend_label"] = circle_kwargs_mutable.get(
"legend_label", "Coverage"
)
fig.circle(
x="variant_midpoint", y="call_NormCov", source=data, **circle_kwargs_mutable
fig.scatter(
x="variant_midpoint",
y="call_NormCov",
source=data,
marker="circle",
**circle_kwargs_mutable,
)

debug("plot the HMM state")
Expand Down
Loading

0 comments on commit b0822d1

Please sign in to comment.