Commit
add plink convert function, faff around with some tests
Tristan Dennis authored and committed on Mar 26, 2024
1 parent fac3469 · commit ea54da7
Showing 6 changed files with 1,419 additions and 2 deletions.
@@ -0,0 +1,237 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'base_params' is not defined",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "NameError                                 Traceback (most recent call last)",
      "Cell In[1], line 162",
      "    161     random_seed: base_params.random_seed = 42,",
      "--> 162     inline_array: base_params.inline_array = base_params.inline_array_default,",
      "    163     chunks: base_params.chunks = base_params.chunks_default,",
      "NameError: name 'base_params' is not defined"
     ]
    }
   ],
"source": [ | ||
"import malariagen_data\n", | ||
"from collections import Counter\n", | ||
"from typing import Optional, Tuple\n", | ||
"\n", | ||
"import allel # type: ignore\n", | ||
"import numpy as np\n", | ||
"import os\n", | ||
"import bed_reader\n", | ||
"from dask.diagnostics import ProgressBar\n", | ||
"# import bokeh.plotting\n", | ||
"\n", | ||
"#from .snp_data import AnophelesSnpData\n", | ||
"#from ..util import hash_columns, check_types, CacheMiss\n", | ||
"#from . import base_params\n", | ||
"#from .base_params import DEFAULT\n", | ||
"\n", | ||
"def _create_plink_outfile(\n", | ||
" self,\n", | ||
" *,\n", | ||
" results_dir,\n", | ||
" region,\n", | ||
" n_snps,\n", | ||
" min_minor_ac,\n", | ||
" thin_offset,\n", | ||
" max_missing_an,\n", | ||
"):\n", | ||
" return f\"{results_dir}/{region}.{n_snps}.{min_minor_ac}.{thin_offset}.{max_missing_an}\"\n", | ||
"\n", | ||
"def _biallelic_snps_to_plink(\n", | ||
" #_ means internal function\n", | ||
" #loads biallelic diplotypes and selects segregating sites among them, converts to plink\n", | ||
" self,\n", | ||
" *,\n", | ||
" results_dir,\n", | ||
" region,\n", | ||
" n_snps,\n", | ||
" thin_offset,\n", | ||
" sample_sets,\n", | ||
" sample_query,\n", | ||
" sample_indices,\n", | ||
" site_mask,\n", | ||
" min_minor_ac,\n", | ||
" max_missing_an,\n", | ||
" random_seed,\n", | ||
" inline_array,\n", | ||
" chunks,\n", | ||
"):\n", | ||
" \n", | ||
" #first define output file\n", | ||
" plink_file_path = self._create_plink_outfile(\n", | ||
" results_dir=results_dir,\n", | ||
" region=region,\n", | ||
" n_snps=n_snps,\n", | ||
" thin_offset=thin_offset,\n", | ||
" min_minor_ac=min_minor_ac,\n", | ||
" max_missing_an=max_missing_an,\n", | ||
" )\n", | ||
"\n", | ||
" bed_file_path = f\"{plink_file_path}.bed\"\n", | ||
" if os.path.exists(bed_file_path):\n", | ||
" return plink_file_path\n", | ||
"\n", | ||
" #get snps\n", | ||
" ds_snps = self.biallelic_snp_calls(\n", | ||
" n_snps=n_snps,\n", | ||
" region=region,\n", | ||
" thin_offset=thin_offset,\n", | ||
" sample_sets=sample_sets,\n", | ||
" sample_query=sample_query,\n", | ||
" sample_indices=sample_indices,\n", | ||
" site_mask=site_mask,\n", | ||
" min_minor_ac=min_minor_ac,\n", | ||
" max_missing_an=max_missing_an,\n", | ||
" random_seed=random_seed,\n", | ||
" inline_array=inline_array,\n", | ||
" chunks=chunks,\n", | ||
" )\n", | ||
"\n", | ||
"\n", | ||
" #filter snps on segregating sites\n", | ||
" with self._spinner(\"Subsetting to segregating sites\"):\n", | ||
" gt = ds_snps[\"call_genotype\"].data.compute()\n", | ||
" print(\"count alleles\")\n", | ||
" with ProgressBar():\n", | ||
" ac = allel.GenotypeArray(gt).count_alleles(max_allele=3)\n", | ||
" print(\"ascertain segregating sites\")\n", | ||
" n_chroms = ds_snps.dims[\"samples\"] * 2\n", | ||
" an_called = ac.sum(axis=1)\n", | ||
" an_missing = n_chroms - an_called\n", | ||
" min_ref_ac = min_minor_ac\n", | ||
" max_ref_ac = n_chroms - min_minor_ac\n", | ||
" # here we choose segregating sites\n", | ||
" loc_sites = (\n", | ||
" (ac[:, 0] >= min_ref_ac)\n", | ||
" & (ac[:, 0] <= max_ref_ac)\n", | ||
" & (an_missing <= max_missing_an)\n", | ||
" )\n", | ||
" print(f\"ascertained {np.count_nonzero(loc_sites):,} sites\")\n", | ||
"\n", | ||
"\n", | ||
" #set up dataset with required vars for plink conversion\n", | ||
" print(\"Set up dataset\")\n", | ||
" ds_snps_asc = (\n", | ||
" ds_snps[[\"variant_contig\", \"variant_position\", \"variant_allele\", \"sample_id\", \"call_genotype\"]]\n", | ||
" .isel(alleles=slice(0, 2))\n", | ||
" .sel(variants=loc_sites)\n", | ||
" )\n", | ||
"\n", | ||
" #compute gt ref counts\n", | ||
" with self._spinner(\"Computing genotype ref counts\"):\n", | ||
" gn_ref = allel.GenotypeDaskArray(gt).to_n_ref(fill=-127) #what does the fill do\n", | ||
" with ProgressBar():\n", | ||
" gn_ref = gn_ref.compute()\n", | ||
"\n", | ||
" print(\"Ensure genotypes vary\")\n", | ||
" loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)\n", | ||
" print(f\"final no. variants {np.count_nonzero(loc_var)}\")\n", | ||
"\n", | ||
"\n", | ||
" print(\"Load final data\")\n", | ||
" with ProgressBar():\n", | ||
" ds_snps_final = (\n", | ||
" ds_snps_asc[[\"variant_contig\", \"variant_position\", \"variant_allele\", \"sample_id\"]]\n", | ||
" .isel(variants=loc_var)\n", | ||
" )\n", | ||
"\n", | ||
" #init vars for input to bed reader\n", | ||
" gn_ref_final = gn_ref[loc_var]\n", | ||
" val = gn_ref_final.T\n", | ||
" alleles = ds_snps_final[\"variant_allele\"].values\n", | ||
" properties = {\n", | ||
" \"iid\": ds_snps_final[\"sample_id\"].values,\n", | ||
" \"chromosome\": ds_snps_final[\"variant_contig\"].values,\n", | ||
" \"bp_position\": ds_snps_final[\"variant_position\"].values,\n", | ||
" \"allele_1\": alleles[:, 0],\n", | ||
" \"allele_2\": alleles[:, 1],\n", | ||
" }\n", | ||
"\n", | ||
" print(f\"write plink files to {plink_file_path}\")\n", | ||
" bed_reader.to_bed(\n", | ||
" filepath=bed_file_path,\n", | ||
" val=val,\n", | ||
" properties=properties,\n", | ||
" count_A1=True,\n", | ||
" )\n", | ||
"\n", | ||
" return plink_file_path\n", | ||
"\n", | ||
"def biallelic_snps_to_plink(\n", | ||
" self,\n", | ||
" results_dir,\n", | ||
" region: base_params.regions,\n", | ||
" n_snps: base_params.n_snps,\n", | ||
" thin_offset = 0,\n", | ||
" sample_sets: Optional[base_params.sample_sets] = None,\n", | ||
" sample_query: Optional[base_params.sample_query] = None,\n", | ||
" sample_indices: Optional[base_params.sample_indices] = None,\n", | ||
" site_mask: Optional[base_params.site_mask] = None,\n", | ||
" min_minor_ac: Optional[base_params.min_minor_ac] = None,\n", | ||
" max_missing_an: Optional[base_params.max_missing_an] = None,\n", | ||
" random_seed: base_params.random_seed = 42,\n", | ||
" inline_array: base_params.inline_array = base_params.inline_array_default,\n", | ||
" chunks: base_params.chunks = base_params.chunks_default,\n", | ||
"):\n", | ||
" \n", | ||
" params = dict(\n", | ||
" results_dir=results_dir,\n", | ||
" region=region,\n", | ||
" n_snps=n_snps,\n", | ||
" thin_offset=thin_offset,\n", | ||
" sample_sets=sample_sets,\n", | ||
" sample_query=sample_query,\n", | ||
" sample_indices=sample_indices,\n", | ||
" site_mask=site_mask,\n", | ||
" min_minor_ac=min_minor_ac,\n", | ||
" max_missing_an=max_missing_an,\n", | ||
" random_seed=random_seed,\n", | ||
" )\n", | ||
" #for the sake of getting this going \n", | ||
" #and also because i expect sanjay and ali will rework most of this, i will forgo the type checks until later\n", | ||
"\n", | ||
" filepath = self._biallelic_snps_to_plink(\n", | ||
" inline_array=inline_array, chunks=chunks, **params\n", | ||
" )\n", | ||
" return filepath" | ||
] | ||
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "malariagen_plink",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
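
The `NameError` captured in the cell output above occurs because Python evaluates a function's annotations and default values (here `base_params.inline_array_default`) when the `def` statement runs, and the relative imports that would normally supply `base_params` are commented out at the top of the cell. A minimal sketch of how the cell could be exercised interactively follows; the absolute import path and the `Ag3` monkey-patching are assumptions inferred from the commented-out relative imports, not part of this commit, and the parameter values are placeholders.

```python
# Sketch only: assumes malariagen_data ships base_params under the anoph subpackage,
# as the commented-out relative imports suggest. Run the import first, then re-run
# the cell above so the function definitions succeed.
import os

import malariagen_data
from malariagen_data.anoph import base_params  # assumed module path

# Development-only pattern: bind the draft functions onto Ag3 so that `self`
# resolves to a data resource providing biallelic_snp_calls() and _spinner().
malariagen_data.Ag3._create_plink_outfile = _create_plink_outfile
malariagen_data.Ag3._biallelic_snps_to_plink = _biallelic_snps_to_plink
malariagen_data.Ag3.biallelic_snps_to_plink = biallelic_snps_to_plink

ag3 = malariagen_data.Ag3()
os.makedirs("results", exist_ok=True)
plink_prefix = ag3.biallelic_snps_to_plink(
    results_dir="results",       # illustrative values throughout
    region="3L",
    n_snps=10_000,
    sample_sets="AG1000G-BF-A",
    min_minor_ac=2,
    max_missing_an=10,
)
# bed_reader.to_bed writes plink_prefix + ".bed", ".bim" and ".fam"
```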
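
For reference, the core of the conversion (allele counting with scikit-allel, the segregating-site mask, the n_ref encoding, and the `bed_reader.to_bed` call) can be exercised on a tiny synthetic genotype array without any MalariaGEN data access. The sketch below uses the same library calls as the draft functions above; the file name, sample IDs, alleles and thresholds are made up for illustration.

```python
# Toy sketch of the ascertainment + PLINK-write steps used above, on synthetic data.
import numpy as np
import allel
import bed_reader

# 4 variants x 3 samples, diploid; -1 encodes a missing allele call.
gt = allel.GenotypeArray([
    [[0, 0], [0, 1], [1, 1]],
    [[0, 0], [0, 0], [0, 0]],    # non-segregating: should be dropped
    [[0, 1], [-1, -1], [1, 1]],  # one missing call
    [[0, 0], [0, 1], [0, 1]],
])
min_minor_ac, max_missing_an = 1, 2
n_chroms = gt.n_samples * 2

# Keep sites where the reference allele is neither fixed nor nearly fixed and
# missingness is within bounds; same logic as loc_sites above.
ac = gt.count_alleles(max_allele=3)
an_missing = n_chroms - ac.sum(axis=1)
loc_sites = (
    (ac[:, 0] >= min_minor_ac)
    & (ac[:, 0] <= n_chroms - min_minor_ac)
    & (an_missing <= max_missing_an)
)

# Encode each call as a count of the reference allele; fill=-127 marks missing
# calls, which is also the value bed_reader treats as missing for int8 data.
gn_ref = gt.to_n_ref(fill=-127)[loc_sites]
n_kept = np.count_nonzero(loc_sites)

bed_reader.to_bed(
    "toy.bed",
    val=gn_ref.T,  # bed_reader expects samples x variants
    properties={
        "iid": np.array(["s1", "s2", "s3"]),
        "chromosome": np.array(["3L"] * n_kept),
        "bp_position": np.arange(1, n_kept + 1),
        "allele_1": np.array(["A"] * n_kept),
        "allele_2": np.array(["T"] * n_kept),
    },
    count_A1=True,
)
```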