-
Notifications
You must be signed in to change notification settings - Fork 0
/
Antao_filter
121 lines (102 loc) · 2.79 KB
/
Antao_filter
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
Variant filtering code.
from __future__ import division
import os
import pickle
import sys
import vcf
from vcf import model
vcfin = sys.argv[1]
#Parameter is the VCF file
dps = {}
samples = {}
def accept_sample(chro, smp):
if smp[“DP”] < 14:
return False
if smp["GQ"] < 40:
return False
median = dps[smp.sample][chro]
if smp["DP"] < median / 2 or smp["DP"] > median * 2:
return False
return True
def accept_rec(rec):
if rec.CHROM not in ["2L", "2R", "3L", "3R", "X"]:
return False
if not rec.is_snp:
return False
if len(rec.ALT) != 1:
return False
if rec.INFO["MQ"] < 40:
return False
if rec.INFO["QD"] < 5:
return False
if rec.INFO["HRun"] > 3:
return False
return True
def get_dp(vcfin):
global dps
pic = "filter.dp"
if os.path.exists(pic):
dps = pickle.load(open(pic))
return
f = vcf.Reader(filename=vcfin)
rec = f.next()
samples = [x.sample for x in rec.samples]
cnt = {}
cnt["2L"] = 0
cnt["2R"] = 0
cnt["3L"] = 0
cnt["3R"] = 0
cnt["X"] = 0
for sample in samples:
dps[sample] = {}
dps[sample]["2L"] = {}
dps[sample]["2R"] = {}
dps[sample]["3L"] = {}
dps[sample]["3R"] = {}
dps[sample]["X"] = {}
f = vcf.Reader(filename=vcfin)
c = 0
for rec in f:
if not accept_rec(rec):
continue
cnt[rec.CHROM] += 1
for smp in rec.samples:
sample = smp.sample
dp = smp["DP"]
dps[sample][rec.CHROM][dp] = dps[sample][rec.CHROM].get(dp, 0) + 1
c += 1
if c % 10000 == 0:
print c, rec.CHROM, rec.POS
for sample in samples:
for chrom in ["2L", "2R", "3L", "3R", "X"]:
vals = []
my_dps = dps[sample][chrom].keys()
my_dps.sort()
for my_dp in my_dps:
vals.extend([my_dp] * dps[sample][chrom][my_dp])
if len(vals) > 0:
dps[sample][chrom] = vals[len(vals) // 2]
pickle.dump(dps, open(pic, "w"))
def do_vcf(vcfin):
f = vcf.Reader(filename=vcfin)
w = vcf.Writer(open("out.vcf", "w"), f, lineterminator="\n")
cnt = 0
for rec in f:
if not accept_rec(rec):
continue
has_data = False
for smp in rec.samples:
if not accept_sample(rec.CHROM, smp):
empty = ['GT', 'AD', 'DP', 'GQ', 'MQ0', 'PL']
smp.data = model.make_calldata_tuple(empty)(None, None, None, None, None, None)
else:
has_data = True
if has_data:
w.write_record(rec)
cnt += 1
if cnt % 10000 == 0:
print cnt, rec.CHROM, rec.POS
w.close()
get_dp(vcfin)
print dps
do_vcf(vcfin)