diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index 0fe6c4113..47b8b8c98 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -13,6 +13,7 @@ H12_CALIBRATION_CACHE_NAME = "af1_h12_calibration_v1" H12_GWSS_CACHE_NAME = "af1_h12_gwss_v1" G123_GWSS_CACHE_NAME = "af1_g123_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1" G123_CALIBRATION_CACHE_NAME = "af1_g123_calibration_v1" H1X_GWSS_CACHE_NAME = "af1_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1" @@ -75,6 +76,7 @@ class Af1(AnophelesDataResource): _h12_calibration_cache_name = H12_CALIBRATION_CACHE_NAME _h12_gwss_cache_name = H12_GWSS_CACHE_NAME _g123_gwss_cache_name = G123_GWSS_CACHE_NAME + _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index 1b2bd2178..590e76ba6 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -42,6 +42,7 @@ H12_GWSS_CACHE_NAME = "ag3_h12_gwss_v1" G123_CALIBRATION_CACHE_NAME = "ag3_g123_calibration_v1" G123_GWSS_CACHE_NAME = "ag3_g123_gwss_v1" +XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1" H1X_GWSS_CACHE_NAME = "ag3_h1x_gwss_v1" IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1" @@ -138,6 +139,7 @@ class Ag3(AnophelesDataResource): _h12_calibration_cache_name = H12_CALIBRATION_CACHE_NAME _h12_gwss_cache_name = H12_GWSS_CACHE_NAME _g123_gwss_cache_name = G123_GWSS_CACHE_NAME + _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME _g123_calibration_cache_name = G123_CALIBRATION_CACHE_NAME _h1x_gwss_cache_name = H1X_GWSS_CACHE_NAME _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME diff --git a/malariagen_data/anoph/xpehh_params.py b/malariagen_data/anoph/xpehh_params.py new file mode 100644 index 000000000..d8dab0016 --- /dev/null +++ b/malariagen_data/anoph/xpehh_params.py @@ -0,0 +1,75 @@ +"""Parameter definitions for XPEHH analysis functions.""" + +from typing import Tuple, Union + +import numpy as np +from typing_extensions import Annotated, TypeAlias + +from . import base_params + +window_size: TypeAlias = Annotated[ + int, + """ + The size of window in number of SNPs used to summarise XP-EHH over. + If None, per-variant XP-EHH values are returned. + """, +] +window_size_default: window_size = 200 +min_cohort_size_default: base_params.min_cohort_size = 15 +max_cohort_size_default: base_params.max_cohort_size = 50 +percentiles: TypeAlias = Annotated[ + Union[int, Tuple[int, ...]], + """ + If window size is specified, this returns the XP-EHH percentiles + for each window. + """, +] +percentiles_default: percentiles = (50, 75, 100) +filter_min_maf: TypeAlias = Annotated[ + float, + """ + Minimum minor allele frequency to use for filtering prior to passing + haplotypes to allel.xpehh function + """, +] +filter_min_maf_default: filter_min_maf = 0.05 +map_pos: TypeAlias = Annotated[ + np.ndarray, + """ + Variant positions (genetic map distance). + """, +] +min_ehh: TypeAlias = Annotated[ + float, + """ + Minimum EHH beyond which to truncate integrated haplotype homozygosity + calculation. + """, +] +min_ehh_default: min_ehh = 0.05 +max_gap: TypeAlias = Annotated[ + int, + """ + Do not report scores if EHH spans a gap larger than this number of + base pairs. + """, +] +max_gap_default: max_gap = 200_000 +gap_scale: TypeAlias = Annotated[ + int, "Rescale distance between variants if gap is larger than this value." +] +gap_scale_default: gap_scale = 20_000 +include_edges: TypeAlias = Annotated[ + bool, + """ + If True, report scores even if EHH does not decay below min_ehh at the + end of the chromosome. + """, +] +use_threads: TypeAlias = Annotated[ + bool, "If True, use multiple threads to compute XP-EHH." +] +palette: TypeAlias = Annotated[ + str, "Name of bokeh palette to use for plotting multiple percentiles." +] +palette_default: palette = "Blues" diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 2418e5305..858faf6fd 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -36,6 +36,7 @@ map_params, pca_params, plotly_params, + xpehh_params, ) from .anoph.aim_data import AnophelesAimData from .anoph.base import AnophelesBase @@ -179,6 +180,11 @@ def _h12_gwss_cache_name(self): def _g123_gwss_cache_name(self): raise NotImplementedError("Must override _g123_gwss_cache_name") + @property + @abstractmethod + def _xpehh_gwss_cache_name(self): + raise NotImplementedError("Must override _xpehh_gwss_cache_name") + @property @abstractmethod def _h1x_gwss_cache_name(self): @@ -4406,6 +4412,98 @@ def plot_ihs_gwss_track( return fig @check_types + @doc( + summary="Run and plot XP-EHH GWSS data.", + ) + def plot_xpehh_gwss( + self, + contig: base_params.contig, + analysis: hap_params.analysis = DEFAULT, + sample_sets: Optional[base_params.sample_sets] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, + window_size: xpehh_params.window_size = xpehh_params.window_size_default, + percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, + filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, + map_pos: Optional[xpehh_params.map_pos] = None, + min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, + max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default, + gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default, + include_edges: xpehh_params.include_edges = True, + use_threads: xpehh_params.use_threads = True, + min_cohort_size: Optional[ + base_params.min_cohort_size + ] = xpehh_params.min_cohort_size_default, + max_cohort_size: Optional[ + base_params.max_cohort_size + ] = xpehh_params.max_cohort_size_default, + random_seed: base_params.random_seed = 42, + palette: xpehh_params.palette = xpehh_params.palette_default, + title: Optional[gplt_params.title] = None, + sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default, + width: gplt_params.width = gplt_params.width_default, + track_height: gplt_params.track_height = 170, + genes_height: gplt_params.genes_height = gplt_params.genes_height_default, + show: gplt_params.show = True, + output_backend: gplt_params.output_backend = gplt_params.output_backend_default, + ) -> gplt_params.figure: + # gwss track + fig1 = self.plot_xpehh_gwss_track( + contig=contig, + analysis=analysis, + sample_sets=sample_sets, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, + window_size=window_size, + percentiles=percentiles, + palette=palette, + filter_min_maf=filter_min_maf, + map_pos=map_pos, + min_ehh=min_ehh, + max_gap=max_gap, + gap_scale=gap_scale, + include_edges=include_edges, + use_threads=use_threads, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + title=title, + sizing_mode=sizing_mode, + width=width, + height=track_height, + show=False, + x_range=None, + output_backend=output_backend, + ) + + fig1.xaxis.visible = False + + # plot genes + fig2 = self.plot_genes( + region=contig, + sizing_mode=sizing_mode, + width=width, + height=genes_height, + x_range=fig1.x_range, + show=False, + output_backend=output_backend, + ) + + # combine plots into a single figure + fig = bokeh.layouts.gridplot( + [fig1, fig2], + ncols=1, + toolbar_location="above", + merge_tools=True, + sizing_mode=sizing_mode, + ) + + if show: + bokeh.plotting.show(fig) + return None + else: + return fig + @doc( summary="Run and plot iHS GWSS data.", ) @@ -5012,6 +5110,295 @@ def plot_g123_calibration( return fig @check_types + @doc( + summary="Run XP-EHH GWSS.", + returns=dict( + x="An array containing the window centre point genomic positions.", + xpehh="An array with XP-EHH statistic values for each window.", + ), + ) + def xpehh_gwss( + self, + contig: base_params.contig, + analysis: hap_params.analysis = DEFAULT, + sample_sets: Optional[base_params.sample_sets] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, + window_size: xpehh_params.window_size = xpehh_params.window_size_default, + percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, + filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, + map_pos: Optional[xpehh_params.map_pos] = None, + min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, + max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default, + gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default, + include_edges: xpehh_params.include_edges = True, + use_threads: xpehh_params.use_threads = True, + min_cohort_size: Optional[ + base_params.min_cohort_size + ] = xpehh_params.min_cohort_size_default, + max_cohort_size: Optional[ + base_params.max_cohort_size + ] = xpehh_params.max_cohort_size_default, + random_seed: base_params.random_seed = 42, + ) -> Tuple[np.ndarray, np.ndarray]: + # change this name if you ever change the behaviour of this function, to + # invalidate any previously cached data + name = self._xpehh_gwss_cache_name + + params = dict( + contig=contig, + analysis=self._prep_phasing_analysis_param(analysis=analysis), + window_size=window_size, + percentiles=percentiles, + filter_min_maf=filter_min_maf, + map_pos=map_pos, + min_ehh=min_ehh, + include_edges=include_edges, + max_gap=max_gap, + gap_scale=gap_scale, + use_threads=use_threads, + sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets), + # N.B., do not be tempted to convert this sample query into integer + # indices using _prep_sample_selection_params, because the indices + # are different in the haplotype data. + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + ) + + try: + results = self.results_cache_get(name=name, params=params) + + except CacheMiss: + results = self._xpehh_gwss(**params) # self. + self.results_cache_set(name=name, params=params, results=results) + + x = results["x"] + xpehh = results["xpehh"] + + return x, xpehh + + def _xpehh_gwss( + self, + *, + contig, + analysis, + sample_sets, + cohort1_query, + cohort2_query, + window_size, + percentiles, + filter_min_maf, + map_pos, + min_ehh, + max_gap, + gap_scale, + include_edges, + use_threads, + min_cohort_size, + max_cohort_size, + random_seed, + ): + ds_haps1 = self.haplotypes( + region=contig, + analysis=analysis, + sample_query=cohort1_query, + sample_sets=sample_sets, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + ) + + ds_haps2 = self.haplotypes( + region=contig, + analysis=analysis, + sample_query=cohort2_query, + sample_sets=sample_sets, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + random_seed=random_seed, + ) + + gt1 = allel.GenotypeDaskArray(ds_haps1["call_genotype"].data) + gt2 = allel.GenotypeDaskArray(ds_haps2["call_genotype"].data) + with self._dask_progress(desc="Load haplotypes"): + ht1 = gt1.to_haplotypes().compute() + ht2 = gt2.to_haplotypes().compute() + + ac1 = ht1.count_alleles(max_allele=1) + ac2 = ht2.count_alleles(max_allele=1) + pos = ds_haps1["variant_position"].values + + if filter_min_maf > 0: + ac = ac1 + ac2 + af = ac.to_frequencies() + maf = np.min(af, axis=1) + maf_filter = maf > filter_min_maf + + ht1 = ht1.compress(maf_filter, axis=0) + ht2 = ht2.compress(maf_filter, axis=0) + pos = pos[maf_filter] + ac1 = ac1[maf_filter] + ac2 = ac2[maf_filter] + + # compute XP-EHH + xp = allel.xpehh( + h1=ht1, + h2=ht2, + pos=pos, + map_pos=map_pos, + min_ehh=min_ehh, + include_edges=include_edges, + max_gap=max_gap, + gap_scale=gap_scale, + use_threads=use_threads, + ) + + # remove any NaNs + na_mask = ~np.isnan(xp) + xp = xp[na_mask] + pos = pos[na_mask] + ac1 = ac1[na_mask] + ac2 = ac2[na_mask] + + if window_size: + xp = allel.moving_statistic( + xp, statistic=np.percentile, size=window_size, q=percentiles + ) + pos = allel.moving_statistic(pos, statistic=np.mean, size=window_size) + + results = dict(x=pos, xpehh=xp) + + return results + + @doc( + summary="Run and plot XP-EHH GWSS data.", + ) + def plot_xpehh_gwss_track( + self, + contig: base_params.contig, + analysis: hap_params.analysis = DEFAULT, + sample_sets: Optional[base_params.sample_sets] = None, + cohort1_query: Optional[base_params.sample_query] = None, + cohort2_query: Optional[base_params.sample_query] = None, + window_size: xpehh_params.window_size = xpehh_params.window_size_default, + percentiles: xpehh_params.percentiles = xpehh_params.percentiles_default, + filter_min_maf: xpehh_params.filter_min_maf = xpehh_params.filter_min_maf_default, + map_pos: Optional[xpehh_params.map_pos] = None, + min_ehh: xpehh_params.min_ehh = xpehh_params.min_ehh_default, + max_gap: xpehh_params.max_gap = xpehh_params.max_gap_default, + gap_scale: xpehh_params.gap_scale = xpehh_params.gap_scale_default, + include_edges: xpehh_params.include_edges = True, + use_threads: xpehh_params.use_threads = True, + min_cohort_size: Optional[ + base_params.min_cohort_size + ] = xpehh_params.min_cohort_size_default, + max_cohort_size: Optional[ + base_params.max_cohort_size + ] = xpehh_params.max_cohort_size_default, + random_seed: base_params.random_seed = 42, + palette: xpehh_params.palette = xpehh_params.palette_default, + title: Optional[gplt_params.title] = None, + sizing_mode: gplt_params.sizing_mode = gplt_params.sizing_mode_default, + width: gplt_params.width = gplt_params.width_default, + height: gplt_params.height = 200, + show: gplt_params.show = True, + x_range: Optional[gplt_params.x_range] = None, + output_backend: gplt_params.output_backend = gplt_params.output_backend_default, + ) -> gplt_params.figure: + # compute xpehh + x, xpehh = self.xpehh_gwss( + contig=contig, + analysis=analysis, + window_size=window_size, + percentiles=percentiles, + filter_min_maf=filter_min_maf, + map_pos=map_pos, + min_ehh=min_ehh, + max_gap=max_gap, + gap_scale=gap_scale, + include_edges=include_edges, + use_threads=use_threads, + min_cohort_size=min_cohort_size, + max_cohort_size=max_cohort_size, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, + sample_sets=sample_sets, + random_seed=random_seed, + ) + + # determine X axis range + x_min = x[0] + x_max = x[-1] + if x_range is None: + x_range = bokeh.models.Range1d(x_min, x_max, bounds="auto") + + # create a figure + xwheel_zoom = bokeh.models.WheelZoomTool( + dimensions="width", maintain_focus=False + ) + if title is None: + if cohort1_query is None or cohort2_query is None: + title = "XP-EHH" + else: + title = f"Cohort 1: {cohort1_query}\nCohort 2: {cohort2_query}" + fig = bokeh.plotting.figure( + title=title, + tools=["xpan", "xzoom_in", "xzoom_out", xwheel_zoom, "reset"], + active_scroll=xwheel_zoom, + active_drag="xpan", + sizing_mode=sizing_mode, + width=width, + height=height, + toolbar_location="above", + x_range=x_range, + output_backend=output_backend, + ) + + if window_size: + if isinstance(percentiles, int): + percentiles = (percentiles,) + # Ensure percentiles are sorted so that colors make sense. + percentiles = tuple(sorted(percentiles)) + + # add an empty dimension to XP-EHH array if 1D + xpehh = np.reshape(xpehh, (xpehh.shape[0], -1)) + + # select the base color palette to work from + base_palette = bokeh.palettes.all_palettes[palette][8] + + # keep only enough colours to plot the XP-EHH tracks + bokeh_palette = base_palette[: xpehh.shape[1]] + + # reverse the colors so darkest is last + bokeh_palette = bokeh_palette[::-1] + + for i in range(xpehh.shape[1]): + xpehh_perc = xpehh[:, i] + color = bokeh_palette[i] + + # plot XP-EHH + fig.circle( + x=x, + y=xpehh_perc, + size=4, + line_width=0, + line_color=color, + fill_color=color, + ) + + # tidy up the plot + fig.yaxis.axis_label = "XP-EHH" + self._bokeh_style_genome_xaxis(fig, contig) + + if show: + bokeh.plotting.show(fig) + return None + else: + return fig + @doc( summary=""" Hierarchically cluster haplotypes in region and produce an interactive plot. diff --git a/notebooks/plot_xpehh_gwss.ipynb b/notebooks/plot_xpehh_gwss.ipynb new file mode 100644 index 000000000..7d0b3005c --- /dev/null +++ b/notebooks/plot_xpehh_gwss.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "32067e1d", + "metadata": {}, + "outputs": [], + "source": [ + "import malariagen_data\n", + "\n", + "ag3 = malariagen_data.Ag3(\n", + " \"simplecache::gs://vo_agam_release\",\n", + " simplecache=dict(cache_storage=\"../gcs_cache\"),\n", + ")\n", + "ag3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "966049ec", + "metadata": {}, + "outputs": [], + "source": [ + "af1 = malariagen_data.Af1(\n", + " \"simplecache::gs://vo_afun_release\",\n", + " simplecache=dict(cache_storage=\"../gcs_cache\"),\n", + ")\n", + "af1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "838eddc4", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss(\n", + " contig=\"X\",\n", + " window_size=1_000,\n", + " analysis=\"gamb_colu\",\n", + " percentiles=(50, 60, 90, 100),\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e780aa23", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss_track(\n", + " contig=\"X\",\n", + " window_size=1_000,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + " filter_min_maf=0.1,\n", + " width=700,\n", + " sizing_mode=\"fixed\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b26a9654", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss(\n", + " contig=\"X\",\n", + " window_size=100,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "181227f6", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss(\n", + " contig=\"2RL\",\n", + " window_size=100,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " max_cohort_size=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89cf50ac", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_xpehh_gwss(\n", + " contig=\"X\",\n", + " window_size=1_000,\n", + " analysis=\"gamb_colu\",\n", + " cohort1_query=\"cohort_admin2_year == 'ML-2_Kati_colu_2014'\",\n", + " cohort2_query=\"country == 'Mayotte'\",\n", + " min_ehh=0.1,\n", + " max_cohort_size=None,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78b452f5", + "metadata": {}, + "outputs": [], + "source": [ + "af1.sample_metadata()[\"cohort_admin1_year\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "079a3308", + "metadata": {}, + "outputs": [], + "source": [ + "af1.plot_xpehh_gwss(\n", + " contig=\"X\",\n", + " window_size=2_000,\n", + " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", + " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", + " max_cohort_size=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6290896", + "metadata": {}, + "outputs": [], + "source": [ + "af1.plot_xpehh_gwss(\n", + " contig=\"2RL\",\n", + " window_size=200,\n", + " cohort1_query=\"cohort_admin1_year == 'MZ-L_fune_2018'\",\n", + " cohort2_query=\"cohort_admin1_year == 'GA-2_fune_2017'\",\n", + " max_cohort_size=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5aed96d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_ag3.py b/tests/test_ag3.py index 045291018..d916e23a3 100644 --- a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -2265,6 +2265,37 @@ def test_ihs_gwss(): assert_allclose(ihs[:, 2][100], 2.3467595962486327) +def test_xpehh_gwss(): + ag3 = setup_ag3() + cohort1_query = "country == 'Ghana'" + cohort2_query = "country == 'Angola'" + contig = "3L" + analysis = "gamb_colu" + sample_sets = "3.0" + window_size = 1000 + + x, xpehh = ag3.xpehh_gwss( + contig=contig, + analysis=analysis, + cohort1_query=cohort1_query, + cohort2_query=cohort2_query, + sample_sets=sample_sets, + window_size=window_size, + max_cohort_size=20, + ) + + assert isinstance(x, np.ndarray) + assert isinstance(xpehh, np.ndarray) + + # check dimensions + assert len(x) == 399 + assert len(x) == len(xpehh) + + # check some values + assert_allclose(x[0], 467448.348) + assert_allclose(xpehh[:, 2][100], 0.4817561326426265) + + def test_g123_gwss(): ag3 = setup_ag3() sample_query = "country == 'Ghana'"