Skip to content

Commit

Permalink
add plink converyt function, faff around with some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Tristan Dennis authored and Tristan Dennis committed Mar 26, 2024
1 parent fac3469 commit ea54da7
Show file tree
Hide file tree
Showing 6 changed files with 1,419 additions and 2 deletions.
237 changes: 237 additions & 0 deletions malariagen_data/anoph/test-1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'base_params' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 162\u001b[0m\n\u001b[1;32m 140\u001b[0m bed_reader\u001b[38;5;241m.\u001b[39mto_bed(\n\u001b[1;32m 141\u001b[0m filepath\u001b[38;5;241m=\u001b[39mbed_file_path,\n\u001b[1;32m 142\u001b[0m val\u001b[38;5;241m=\u001b[39mval,\n\u001b[1;32m 143\u001b[0m properties\u001b[38;5;241m=\u001b[39mproperties,\n\u001b[1;32m 144\u001b[0m count_A1\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 145\u001b[0m )\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m plink_file_path\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mbiallelic_snps_to_plink\u001b[39m(\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 151\u001b[0m results_dir,\n\u001b[1;32m 152\u001b[0m region: base_params\u001b[38;5;241m.\u001b[39mregions,\n\u001b[1;32m 153\u001b[0m n_snps: base_params\u001b[38;5;241m.\u001b[39mn_snps,\n\u001b[1;32m 154\u001b[0m thin_offset: base_params\u001b[38;5;241m.\u001b[39mthin_offset \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m,\n\u001b[1;32m 155\u001b[0m sample_sets: Optional[base_params\u001b[38;5;241m.\u001b[39msample_sets] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 156\u001b[0m sample_query: Optional[base_params\u001b[38;5;241m.\u001b[39msample_query] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 157\u001b[0m sample_indices: Optional[base_params\u001b[38;5;241m.\u001b[39msample_indices] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 158\u001b[0m site_mask: Optional[base_params\u001b[38;5;241m.\u001b[39msite_mask] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 159\u001b[0m min_minor_ac: Optional[base_params\u001b[38;5;241m.\u001b[39mmin_minor_ac] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 160\u001b[0m max_missing_an: Optional[base_params\u001b[38;5;241m.\u001b[39mmax_missing_an] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 161\u001b[0m random_seed: base_params\u001b[38;5;241m.\u001b[39mrandom_seed \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m42\u001b[39m,\n\u001b[0;32m--> 162\u001b[0m inline_array: base_params\u001b[38;5;241m.\u001b[39minline_array \u001b[38;5;241m=\u001b[39m \u001b[43mbase_params\u001b[49m\u001b[38;5;241m.\u001b[39minline_array_default,\n\u001b[1;32m 163\u001b[0m chunks: base_params\u001b[38;5;241m.\u001b[39mchunks \u001b[38;5;241m=\u001b[39m base_params\u001b[38;5;241m.\u001b[39mchunks_default,\n\u001b[1;32m 164\u001b[0m ):\n\u001b[1;32m 166\u001b[0m params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\n\u001b[1;32m 167\u001b[0m results_dir\u001b[38;5;241m=\u001b[39mresults_dir,\n\u001b[1;32m 168\u001b[0m region\u001b[38;5;241m=\u001b[39mregion,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 177\u001b[0m random_seed\u001b[38;5;241m=\u001b[39mrandom_seed,\n\u001b[1;32m 178\u001b[0m )\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m#for the sake of getting this going \u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m#and also because i expect sanjay and ali will rework most of this, i will forgo the type checks until later\u001b[39;00m\n",
"\u001b[0;31mNameError\u001b[0m: name 'base_params' is not defined"
]
}
],
"source": [
"import malariagen_data\n",
"from collections import Counter\n",
"from typing import Optional, Tuple\n",
"\n",
"import allel # type: ignore\n",
"import numpy as np\n",
"import os\n",
"import bed_reader\n",
"from dask.diagnostics import ProgressBar\n",
"# import bokeh.plotting\n",
"\n",
"#from .snp_data import AnophelesSnpData\n",
"#from ..util import hash_columns, check_types, CacheMiss\n",
"#from . import base_params\n",
"#from .base_params import DEFAULT\n",
"\n",
"def _create_plink_outfile(\n",
" self,\n",
" *,\n",
" results_dir,\n",
" region,\n",
" n_snps,\n",
" min_minor_ac,\n",
" thin_offset,\n",
" max_missing_an,\n",
"):\n",
" return f\"{results_dir}/{region}.{n_snps}.{min_minor_ac}.{thin_offset}.{max_missing_an}\"\n",
"\n",
"def _biallelic_snps_to_plink(\n",
" #_ means internal function\n",
" #loads biallelic diplotypes and selects segregating sites among them, converts to plink\n",
" self,\n",
" *,\n",
" results_dir,\n",
" region,\n",
" n_snps,\n",
" thin_offset,\n",
" sample_sets,\n",
" sample_query,\n",
" sample_indices,\n",
" site_mask,\n",
" min_minor_ac,\n",
" max_missing_an,\n",
" random_seed,\n",
" inline_array,\n",
" chunks,\n",
"):\n",
" \n",
" #first define output file\n",
" plink_file_path = self._create_plink_outfile(\n",
" results_dir=results_dir,\n",
" region=region,\n",
" n_snps=n_snps,\n",
" thin_offset=thin_offset,\n",
" min_minor_ac=min_minor_ac,\n",
" max_missing_an=max_missing_an,\n",
" )\n",
"\n",
" bed_file_path = f\"{plink_file_path}.bed\"\n",
" if os.path.exists(bed_file_path):\n",
" return plink_file_path\n",
"\n",
" #get snps\n",
" ds_snps = self.biallelic_snp_calls(\n",
" n_snps=n_snps,\n",
" region=region,\n",
" thin_offset=thin_offset,\n",
" sample_sets=sample_sets,\n",
" sample_query=sample_query,\n",
" sample_indices=sample_indices,\n",
" site_mask=site_mask,\n",
" min_minor_ac=min_minor_ac,\n",
" max_missing_an=max_missing_an,\n",
" random_seed=random_seed,\n",
" inline_array=inline_array,\n",
" chunks=chunks,\n",
" )\n",
"\n",
"\n",
" #filter snps on segregating sites\n",
" with self._spinner(\"Subsetting to segregating sites\"):\n",
" gt = ds_snps[\"call_genotype\"].data.compute()\n",
" print(\"count alleles\")\n",
" with ProgressBar():\n",
" ac = allel.GenotypeArray(gt).count_alleles(max_allele=3)\n",
" print(\"ascertain segregating sites\")\n",
" n_chroms = ds_snps.dims[\"samples\"] * 2\n",
" an_called = ac.sum(axis=1)\n",
" an_missing = n_chroms - an_called\n",
" min_ref_ac = min_minor_ac\n",
" max_ref_ac = n_chroms - min_minor_ac\n",
" # here we choose segregating sites\n",
" loc_sites = (\n",
" (ac[:, 0] >= min_ref_ac)\n",
" & (ac[:, 0] <= max_ref_ac)\n",
" & (an_missing <= max_missing_an)\n",
" )\n",
" print(f\"ascertained {np.count_nonzero(loc_sites):,} sites\")\n",
"\n",
"\n",
" #set up dataset with required vars for plink conversion\n",
" print(\"Set up dataset\")\n",
" ds_snps_asc = (\n",
" ds_snps[[\"variant_contig\", \"variant_position\", \"variant_allele\", \"sample_id\", \"call_genotype\"]]\n",
" .isel(alleles=slice(0, 2))\n",
" .sel(variants=loc_sites)\n",
" )\n",
"\n",
" #compute gt ref counts\n",
" with self._spinner(\"Computing genotype ref counts\"):\n",
" gn_ref = allel.GenotypeDaskArray(gt).to_n_ref(fill=-127) #what does the fill do\n",
" with ProgressBar():\n",
" gn_ref = gn_ref.compute()\n",
"\n",
" print(\"Ensure genotypes vary\")\n",
" loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)\n",
" print(f\"final no. variants {np.count_nonzero(loc_var)}\")\n",
"\n",
"\n",
" print(\"Load final data\")\n",
" with ProgressBar():\n",
" ds_snps_final = (\n",
" ds_snps_asc[[\"variant_contig\", \"variant_position\", \"variant_allele\", \"sample_id\"]]\n",
" .isel(variants=loc_var)\n",
" )\n",
"\n",
" #init vars for input to bed reader\n",
" gn_ref_final = gn_ref[loc_var]\n",
" val = gn_ref_final.T\n",
" alleles = ds_snps_final[\"variant_allele\"].values\n",
" properties = {\n",
" \"iid\": ds_snps_final[\"sample_id\"].values,\n",
" \"chromosome\": ds_snps_final[\"variant_contig\"].values,\n",
" \"bp_position\": ds_snps_final[\"variant_position\"].values,\n",
" \"allele_1\": alleles[:, 0],\n",
" \"allele_2\": alleles[:, 1],\n",
" }\n",
"\n",
" print(f\"write plink files to {plink_file_path}\")\n",
" bed_reader.to_bed(\n",
" filepath=bed_file_path,\n",
" val=val,\n",
" properties=properties,\n",
" count_A1=True,\n",
" )\n",
"\n",
" return plink_file_path\n",
"\n",
"def biallelic_snps_to_plink(\n",
" self,\n",
" results_dir,\n",
" region: base_params.regions,\n",
" n_snps: base_params.n_snps,\n",
" thin_offset = 0,\n",
" sample_sets: Optional[base_params.sample_sets] = None,\n",
" sample_query: Optional[base_params.sample_query] = None,\n",
" sample_indices: Optional[base_params.sample_indices] = None,\n",
" site_mask: Optional[base_params.site_mask] = None,\n",
" min_minor_ac: Optional[base_params.min_minor_ac] = None,\n",
" max_missing_an: Optional[base_params.max_missing_an] = None,\n",
" random_seed: base_params.random_seed = 42,\n",
" inline_array: base_params.inline_array = base_params.inline_array_default,\n",
" chunks: base_params.chunks = base_params.chunks_default,\n",
"):\n",
" \n",
" params = dict(\n",
" results_dir=results_dir,\n",
" region=region,\n",
" n_snps=n_snps,\n",
" thin_offset=thin_offset,\n",
" sample_sets=sample_sets,\n",
" sample_query=sample_query,\n",
" sample_indices=sample_indices,\n",
" site_mask=site_mask,\n",
" min_minor_ac=min_minor_ac,\n",
" max_missing_an=max_missing_an,\n",
" random_seed=random_seed,\n",
" )\n",
" #for the sake of getting this going \n",
" #and also because i expect sanjay and ali will rework most of this, i will forgo the type checks until later\n",
"\n",
" filepath = self._biallelic_snps_to_plink(\n",
" inline_array=inline_array, chunks=chunks, **params\n",
" )\n",
" return filepath"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "malariagen_plink",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit ea54da7

Please sign in to comment.