diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index dc2d7fb3e..de7382c47 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -711,21 +711,21 @@ def snp_effects(self, transcript, site_mask=None, site_filters="dt_20200416"): def snp_allele_frequencies( self, transcript, - populations, + cohorts, site_mask=None, site_filters="dt_20200416", species_calls=("20200422", "aim"), sample_sets="v3_wild", drop_invariant=True, ): - """Compute per variant population allele frequencies for a gene transcript. + """Compute per variant allele frequencies for a gene transcript. Parameters ---------- transcript : str Gene transcript ID (AgamP4.12), e.g., "AGAP004707-RA". - populations : dict - Dictionary to map population IDs to sample queries, e.g., + cohorts : dict + Dictionary to map cohort IDs to sample queries, e.g., {"bf_2012_col": "country == 'Burkina Faso' and year == 2012 and species == 'coluzzii'"} site_mask : {"gamb_colu_arab", "gamb_colu", "arab"} Site filters mask to apply. @@ -738,7 +738,7 @@ def snp_allele_frequencies( identifiers (e.g., ["AG1000G-BF-A", "AG1000G-BF-B"]) or a release identifier (e.g., "v3") or a list of release identifiers. drop_invariant : bool, optional - If True, variants with no alternate allele calls in any populations are dropped from + If True, variants with no alternate allele calls in any cohorts are dropped from the result. Returns @@ -792,13 +792,13 @@ def snp_allele_frequencies( # count alleles afs = dict() - for pop, query in populations.items(): - loc_pop = df_meta.eval(query).values - gt_pop = da.compress(loc_pop, gt, axis=1) - ac_pop = ( - allel.GenotypeDaskArray(gt_pop).count_alleles(max_allele=3).compute() + for coh, query in cohorts.items(): + loc_coh = df_meta.eval(query).values + gt_coh = da.compress(loc_coh, gt, axis=1) + ac_coh = ( + allel.GenotypeDaskArray(gt_coh).count_alleles(max_allele=3).compute() ) - afs[pop] = ac_pop.to_frequencies() + afs[coh] = ac_coh.to_frequencies() # set up columns cols = { @@ -807,14 +807,14 @@ def snp_allele_frequencies( "alt_allele": alt.astype("U1").flatten(), } - for pop in populations: - cols[pop] = afs[pop][:, 1:].flatten() + for coh in cohorts: + cols[coh] = afs[coh][:, 1:].flatten() # build df df = pandas.DataFrame(cols) # add max allele freq column - df["max_af"] = df[populations].max(axis=1) + df["max_af"] = df[cohorts].max(axis=1) # drop invariants if drop_invariant: @@ -1602,16 +1602,16 @@ def gene_cnv(self, contig, sample_sets="v3_wild"): return ds_out - def gene_cnv_frequencies(self, contig, populations, sample_sets="v3_wild"): + def gene_cnv_frequencies(self, contig, cohorts=None, sample_sets="v3_wild"): """Compute modal copy number by gene, then compute the frequency of - amplifications and deletions by population, from HMM data. + amplifications and deletions in one or more cohorts, from HMM data. Parameters ---------- contig : str Chromosome arm, e.g., "3R". - populations : dict - Dictionary to map population IDs to sample queries, e.g., + cohorts : dict + Dictionary to map cohort IDs to sample queries, e.g., {"bf_2012_col": "country == 'Burkina Faso' and year == 2012 and species == 'coluzzii'"} sample_sets : str or list of str Can be a sample set identifier (e.g., "AG1000G-AO") or a list of sample set @@ -1650,20 +1650,20 @@ def gene_cnv_frequencies(self, contig, populations, sample_sets="v3_wild"): is_amp = cn > expected_cn is_del = (0 <= cn) & (cn < expected_cn) - # compute population frequencies - for pop, query in populations.items(): + # compute cohort frequencies + for coh, query in cohorts.items(): loc_samples = df_samples.eval(query).values n_samples = np.count_nonzero(loc_samples) if n_samples == 0: - raise ValueError(f"no samples for population {pop!r}") - is_amp_pop = np.compress(loc_samples, is_amp, axis=1) - is_del_pop = np.compress(loc_samples, is_del, axis=1) - amp_count_pop = np.sum(is_amp_pop, axis=1) - del_count_pop = np.sum(is_del_pop, axis=1) - amp_freq_pop = amp_count_pop / n_samples - del_freq_pop = del_count_pop / n_samples - df[f"{pop}_amp"] = amp_freq_pop - df[f"{pop}_del"] = del_freq_pop + raise ValueError(f"no samples for cohort {coh!r}") + is_amp_coh = np.compress(loc_samples, is_amp, axis=1) + is_del_coh = np.compress(loc_samples, is_del, axis=1) + amp_count_coh = np.sum(is_amp_coh, axis=1) + del_count_coh = np.sum(is_del_coh, axis=1) + amp_freq_coh = amp_count_coh / n_samples + del_freq_coh = del_count_coh / n_samples + df[f"{coh}_amp"] = amp_freq_coh + df[f"{coh}_del"] = del_freq_coh # set gene ID as index for convenience df.set_index("ID", inplace=True) diff --git a/poetry.lock b/poetry.lock index 025bdfafa..9de4e5cda 100644 --- a/poetry.lock +++ b/poetry.lock @@ -215,14 +215,6 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -[[package]] -name = "cython" -version = "0.29.23" -description = "The Cython compiler for writing C extensions for the Python language." -category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - [[package]] name = "dask" version = "2021.3.0" @@ -306,7 +298,7 @@ python-versions = "*" [[package]] name = "fsspec" -version = "2021.4.0" +version = "2021.5.0" description = "File-system specification" category = "main" optional = false @@ -331,7 +323,7 @@ ssh = ["paramiko"] [[package]] name = "gcsfs" -version = "2021.4.0" +version = "2021.5.0" description = "Convenient Filesystem interface over GCS" category = "main" optional = false @@ -340,7 +332,7 @@ python-versions = ">=3.6" [package.dependencies] aiohttp = "*" decorator = "*" -fsspec = "2021.04.0" +fsspec = "2021.05.0" google-auth = ">=1.2" google-auth-oauthlib = "*" requests = "*" @@ -1125,19 +1117,18 @@ pyasn1 = ">=0.1.3" [[package]] name = "scikit-allel" -version = "1.3.3" +version = "1.3.5" description = "A Python package for exploring and analysing genetic variation data." category = "main" optional = false python-versions = "*" [package.dependencies] -cython = "*" dask = {version = "*", extras = ["array"]} numpy = "*" [package.extras] -full = ["scipy", "matplotlib", "seaborn", "pandas", "scikit-learn", "h5py", "numexpr", "bcolz", "zarr", "hmmlearn", "pomegranate", "nose"] +full = ["scipy", "matplotlib", "seaborn", "pandas", "scikit-learn", "h5py", "numexpr", "zarr", "hmmlearn", "pomegranate", "nose"] [[package]] name = "scipy" @@ -1545,43 +1536,6 @@ colorama = [ {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, ] -cython = [ - {file = "Cython-0.29.23-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff885f18d169759b57f116d3956e45cd2b9cba989fde348bba091544c668dc11"}, - {file = "Cython-0.29.23-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:3b29224eb62309a10819d923dc6262f769e4f3facfee3cd06372c355e5b38b33"}, - {file = "Cython-0.29.23-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:355a6e768d91e21fbf477b61881bab64b7a2da386a166898997bccefd532cf5d"}, - {file = "Cython-0.29.23-cp27-cp27m-win32.whl", hash = "sha256:2af52d312e96b38ded38b34d06e22685c226b1b0e58278bd27209f5d2385d115"}, - {file = "Cython-0.29.23-cp27-cp27m-win_amd64.whl", hash = "sha256:519fccf526d26b377e1db22f22aa44889b28bc5833ec106588cb13557e8ba2da"}, - {file = "Cython-0.29.23-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:625a16103770fd92b487b701fb0c07e5790b080f40fa11ce572a2d56d9e9fcca"}, - {file = "Cython-0.29.23-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:7b7a766726d207d7cd57aff0fcb4b35ce042d3cc88a421fcdb45eeb61a5b9d12"}, - {file = "Cython-0.29.23-cp34-cp34m-win32.whl", hash = "sha256:474c1a29ab43e29d990df279e2cf6aa96baa9208f5cd4bc76ac87ffcdf1e2945"}, - {file = "Cython-0.29.23-cp34-cp34m-win_amd64.whl", hash = "sha256:f4aca6bffb1c1c3c4ada3347d0b162a699c18a66e097ee08b63b3a35118fdfcc"}, - {file = "Cython-0.29.23-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:58dc06871bfdb0592542d779714fe9f918e11ba20ac07757dd63b198bdc704fe"}, - {file = "Cython-0.29.23-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:2a3bbce689a2fddb85aa66712d93875c99bf7f64ac82b1d149ecce522a7a4e0c"}, - {file = "Cython-0.29.23-cp35-cp35m-win32.whl", hash = "sha256:3ef530f975e3a760e7282fce2a25f900fa63f96d17321b4aa5f5542eb9859cdf"}, - {file = "Cython-0.29.23-cp35-cp35m-win_amd64.whl", hash = "sha256:ef21c51350462160456eb71df31b0869e5141e940f22c61c358bdb6e3ebc3388"}, - {file = "Cython-0.29.23-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:20402ef316393168909926ab21848aa6e08e39bed5003b657139774e66166cd0"}, - {file = "Cython-0.29.23-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:4858043ac5f96a8f0277cf63760bb39b9521c1f897678cf1d22423f3e758f4ed"}, - {file = "Cython-0.29.23-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4d7c3b0882d8757c601eaf288fc0d321d5c7ac6c3afb8c42eddf9325a3419cf5"}, - {file = "Cython-0.29.23-cp36-cp36m-win32.whl", hash = "sha256:37ff66039e3d138ec968ee1d1e12441fa5fb4e6a9c5458bc3c3a232f01be4a7d"}, - {file = "Cython-0.29.23-cp36-cp36m-win_amd64.whl", hash = "sha256:5be3ae3189cf7d0e9bbeafb854496dc7030c6f6a5602d809435fab8223543a41"}, - {file = "Cython-0.29.23-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b0699f0dc90181f2458fdb8170455e7798a309e18f41379eda7a2dc8c7aadee0"}, - {file = "Cython-0.29.23-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:41cd0dd2ff5d78466e73409db509887a84449b400074d4f217980cedbb18e4be"}, - {file = "Cython-0.29.23-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:aa3bb0928fb2aa3a8828801eb8b29af2261c199f805ae835467489e2bdd00372"}, - {file = "Cython-0.29.23-cp37-cp37m-win32.whl", hash = "sha256:20cb50d9fede8029bdb50875458f07a27f909289aeed4cdb9c19544dd9a9bc45"}, - {file = "Cython-0.29.23-cp37-cp37m-win_amd64.whl", hash = "sha256:c4b82461edbbcf90f19b319006345b77474a2d7514e1476d49a14bbd55d6b797"}, - {file = "Cython-0.29.23-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:794e3df0b57e16bce7583ac909126f4cb381fe566adadb20484d89095855eedb"}, - {file = "Cython-0.29.23-cp38-cp38-manylinux1_i686.whl", hash = "sha256:0c4b9f7e3aa004cf3f364e3e772f55fec5740485bafea99d1f13bdc9bbd8a545"}, - {file = "Cython-0.29.23-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:ceccc03b633113ede1f14ad914a6db5c278ce108c8ddb308a5c01c1567d8a02a"}, - {file = "Cython-0.29.23-cp38-cp38-win32.whl", hash = "sha256:4b0bcf2e06a9063fc78c3243ed4003228375d532ef13b9e5d7183be8f0a52cf5"}, - {file = "Cython-0.29.23-cp38-cp38-win_amd64.whl", hash = "sha256:5a6792153b728a0240e55bbb5b643f4f7e45c76319e03abf15bf367471ea1d1a"}, - {file = "Cython-0.29.23-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a8eed9c82e8fe07b8a8ffbd36018871a17458903fc25c9d015f37b54513a3efd"}, - {file = "Cython-0.29.23-cp39-cp39-manylinux1_i686.whl", hash = "sha256:266459c7e48fe3c6c492b297e4033e42d4c6863cc1a1ff7cc4034949fc574fa6"}, - {file = "Cython-0.29.23-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:4b6824b58d4373224fc76ee8bee6b35c2d17c91a1ed0fa67b88440f63daebe50"}, - {file = "Cython-0.29.23-cp39-cp39-win32.whl", hash = "sha256:7d6a33c8a11f05f698e215bfdb837f32c27f63c20f3af863557ed91c748dc2be"}, - {file = "Cython-0.29.23-cp39-cp39-win_amd64.whl", hash = "sha256:2365f3b5e6451b6bc6dcd262230656f4ade1d862ec2f6c22154deebef37c08b6"}, - {file = "Cython-0.29.23-py2.py3-none-any.whl", hash = "sha256:282263628c5d601b313d5920f7b6d7e08c7fedbddacd080c4858aa04d86b6b4b"}, - {file = "Cython-0.29.23.tar.gz", hash = "sha256:6a0d31452f0245daacb14c979c77e093eb1a546c760816b5eed0047686baad8e"}, -] dask = [ {file = "dask-2021.3.0-py3-none-any.whl", hash = "sha256:9943b58215090ec388250c572ba4484b9e0f3e57092c3045871f79e69b3c8658"}, {file = "dask-2021.3.0.tar.gz", hash = "sha256:566054b493d63c15732f2a640382b21e861571d61639f59341bc7695a9be138e"}, @@ -1615,12 +1569,12 @@ filelock = [ {file = "filelock-3.0.12.tar.gz", hash = "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59"}, ] fsspec = [ - {file = "fsspec-2021.4.0-py3-none-any.whl", hash = "sha256:70dae1d8d51072c4a1196acb9ba1bf8f5b9cdd83c4bb67e8a31dac604a49594b"}, - {file = "fsspec-2021.4.0.tar.gz", hash = "sha256:8b1a69884855d1a8c038574292e8b861894c3373282d9a469697a2b41d5289a6"}, + {file = "fsspec-2021.5.0-py3-none-any.whl", hash = "sha256:a08daa28ecb98544350d62f8307a7678e410c4b5245e04d14fcf4a3f13e959a7"}, + {file = "fsspec-2021.5.0.tar.gz", hash = "sha256:30064d89b00c6acfe5d8e027b6c0ef3df25ca17fd1d5cad2e8208c9aa5300f8d"}, ] gcsfs = [ - {file = "gcsfs-2021.4.0-py2.py3-none-any.whl", hash = "sha256:15f656f2da510c6bb872c589b5a0e6fac11e12a9b616b820c5ba752b794cfc5d"}, - {file = "gcsfs-2021.4.0.tar.gz", hash = "sha256:8503b59d7afc98fdddbba67a615010982cf1c737d5c481d67235a47a5fcc043f"}, + {file = "gcsfs-2021.5.0-py2.py3-none-any.whl", hash = "sha256:c8ed85f13d97efc8b055d2b5abc3528ced98fc288a7858c23bf6b197da55a6ba"}, + {file = "gcsfs-2021.5.0.tar.gz", hash = "sha256:f3b54c49f85b5cfb80f34e63c69680493405fc59b0d3f1d466e4400efed93c2b"}, ] google-auth = [ {file = "google-auth-1.30.0.tar.gz", hash = "sha256:9ad25fba07f46a628ad4d0ca09f38dcb262830df2ac95b217f9b0129c9e42206"}, @@ -2180,7 +2134,23 @@ rsa = [ {file = "rsa-4.7.2.tar.gz", hash = "sha256:9d689e6ca1b3038bc82bf8d23e944b6b6037bc02301a574935b2dd946e0353b9"}, ] scikit-allel = [ - {file = "scikit-allel-1.3.3.tar.gz", hash = "sha256:550c2a1d00953c3d5d54eb128faf38b52ebd7a8717c1a6dc8ec711cdcada8ded"}, + {file = "scikit-allel-1.3.5.tar.gz", hash = "sha256:f2a1fdcd00880fbea6e0288dee5b48b4bcd8730b3fa631ff2b27b4679e45e8ed"}, + {file = "scikit_allel-1.3.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:6150cda81d0b3d09767ea27f7344c0eb6e02ca1d1d64d6b930511d2758145ecb"}, + {file = "scikit_allel-1.3.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:437b888ea48255fce788e4bc3c717d33c00125f60ec1c727399959464028793b"}, + {file = "scikit_allel-1.3.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:cc02bdbaf252d360362496c7cffb69fd8e261f1397c03bf524b9ad37031da123"}, + {file = "scikit_allel-1.3.5-cp36-cp36m-win_amd64.whl", hash = "sha256:befa08cf2ecdbc3b256570007e846643ecf3838050425696ec502cf6591ac8e9"}, + {file = "scikit_allel-1.3.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b3249db4e557820bcf8cb1b912e9cd4000c8ab02eec7d90b3da8ca0d00e887bc"}, + {file = "scikit_allel-1.3.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:c615c825edc19f40e0e413db34f90dee50fead46bc7ed380c89098619ca2bff7"}, + {file = "scikit_allel-1.3.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:21c10744d306b36567075f738b2549ce6d20994ab16d8162f70f2278a6db84ba"}, + {file = "scikit_allel-1.3.5-cp37-cp37m-win_amd64.whl", hash = "sha256:52d9f17b74e204c0d6a01fc16b03d3d9d95feae9ec147b90c189c4f344e290e2"}, + {file = "scikit_allel-1.3.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5b031c79139d1cc793771692108440a72052d834065cf9d45375c6917cee8988"}, + {file = "scikit_allel-1.3.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:680485ae4bc9fa753740055070fa2da39a8875023e549da3946d9bd4372353f6"}, + {file = "scikit_allel-1.3.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:ebb52e7f07bf35aef42879883f2fe568855242c5cfc0154c0925444fe51f0592"}, + {file = "scikit_allel-1.3.5-cp38-cp38-win_amd64.whl", hash = "sha256:8658eef2306702338852dec10d96e9ef165ca6c54708c6356ecffc51e0f4b136"}, + {file = "scikit_allel-1.3.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3a7fb5cb460dfb532b41c99f3d004b3533f9d6353f9d2a24711f1a7a224f674c"}, + {file = "scikit_allel-1.3.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9a222c684868fb2cd91164ae472a4b78912c00bfc5eb84876f84635f829251c4"}, + {file = "scikit_allel-1.3.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:f951db48bdaa94df437c82d23dbe0ebdbd94e54890e5aa4cb4c4b11743b10795"}, + {file = "scikit_allel-1.3.5-cp39-cp39-win_amd64.whl", hash = "sha256:5eafb694faa891668934cc0402af91e3b358c7202ee9ad554b844bc5049f6b12"}, ] scipy = [ {file = "scipy-1.5.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:4f12d13ffbc16e988fa40809cbbd7a8b45bc05ff6ea0ba8e3e41f6f4db3a9e47"}, diff --git a/tests/test_ag3.py b/tests/test_ag3.py index d67453446..d65c91bef 100644 --- a/tests/test_ag3.py +++ b/tests/test_ag3.py @@ -585,7 +585,7 @@ def test_snp_effects(): def test_snp_allele_frequencies(): ag3 = setup_ag3() - populations = { + cohorts = { "ke": "country == 'Kenya'", "bf_2012_col": "country == 'Burkina Faso' and year == 2012 and species == 'coluzzii'", } @@ -600,7 +600,7 @@ def test_snp_allele_frequencies(): # drop invariants df = ag3.snp_allele_frequencies( transcript="AGAP009194-RA", - populations=populations, + cohorts=cohorts, site_mask="gamb_colu", sample_sets="v3_wild", drop_invariant=True, @@ -618,7 +618,7 @@ def test_snp_allele_frequencies(): # check invariant have been dropped assert df.max_af.min() > 0 - populations = { + cohorts = { "gm": "country == 'Gambia, The'", "mz": "country == 'Mozambique' and year == 2004", } @@ -633,7 +633,7 @@ def test_snp_allele_frequencies(): # keep invariants df = ag3.snp_allele_frequencies( transcript="AGAP004707-RD", - populations=populations, + cohorts=cohorts, site_mask="gamb_colu", sample_sets="v3_wild", drop_invariant=False, @@ -982,7 +982,7 @@ def test_gene_cnv(contig, sample_sets): @pytest.mark.parametrize("contig", ["2R", "X"]) def test_gene_cnv_frequencies(contig): ag3 = setup_ag3() - populations = { + cohorts = { "ke": "country == 'Kenya'", "bf_2012_col": "country == 'Burkina Faso' and year == 2012 and species == 'coluzzii'", } @@ -1000,9 +1000,7 @@ def test_gene_cnv_frequencies(contig): ] df_genes = ag3.geneset().query(f"type == 'gene' and contig == '{contig}'") - df = ag3.gene_cnv_frequencies( - contig=contig, sample_sets="v3_wild", populations=populations - ) + df = ag3.gene_cnv_frequencies(contig=contig, sample_sets="v3_wild", cohorts=cohorts) assert isinstance(df, pd.DataFrame) assert expected_cols == df.columns.tolist()