-
Notifications
You must be signed in to change notification settings - Fork 0
/
samplehaps.py
52 lines (43 loc) · 1.93 KB
/
samplehaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os,sys,subprocess,random
import argparse
SHAPEIT="/home/user/shapeit/bin/shapeit"; ## set this path
## first line of output file has list of variant-ids (relative to VCF file)
def sample_haplotypes(VCF,nsamples):
HAPfile = VCF + '.hapsamples';
snptable = {}; variantid=0;
VCF_file = open(VCF);
for line in VCF_file:
if line[0]== '#': continue;
var = line.split();
chrom = var[0]; position = int(var[1]); allele1 = var[3]; alleles = var[4].split(','); allele2 = alleles[0];
genotype = var[9].split(':');
variantid +=1;
snptable[(chrom,position,allele1)] = [genotype[0],1,variantid];
VCF_file.close();
if nsamples > 1: F1 = open(HAPfile,'w'); # new output file
for r in range(nsamples):
seed = random.randint(1,10000000);
outfile = VCF + '.phased'; logfile = VCF + '.shapeit.log';
samplehap = SHAPEIT + ' -convert' + ' --seed ' + `r` + ' --input-graph ' + VCF + '.graph' + ' --output-sample ' + outfile + ' -L ' + logfile;
subprocess.call(samplehap,shell=True);
hap1 = []; hap2 = [];
File = open(outfile + '.haps','r');
for line in File:
var = line.strip().split(); h1 = var[5]; h2 = var[6];
if h1 == h2: continue; ## homozoygous variant
if r ==0:
varid = snptable[(var[0],int(var[2]),var[3])][2]
print >>F1, varid, ## variant-id
hap1.append(h1); hap2.append(h2);
File.close();
if r ==0: print >>F1,'\n',
if nsamples > 1: print >>F1, ''.join(hap1); ## only write one of the two haplotypes
if (r+1)%10==0: print >>sys.stderr,'iter',r+1;
if nsamples > 1: F1.close();
subprocess.call('rm -f ' + outfile + '.* ' + logfile,shell=True);
if len(sys.argv) < 3:
print >>sys.stderr, "program to sample haplotypes from SHAPEIT2 HMM ";
print >>sys.stderr, "set path to shapeit before running this program, the shapeit graph file also needs to exist ";
print >>sys.stderr, "usage: python samplehaps.py VCFfile number_haps(integer)";
sys.exit();
sample_haplotypes(sys.argv[1],int(sys.argv[2]));