-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSNP2Expands.py
161 lines (134 loc) · 5.68 KB
/
SNP2Expands.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# -*- coding: utf-8 -*-
"""
Created on Wed May 11 17:58:38 2016
@author: lpsmith
"""
# This file reads in SNP array BAF data, corrects for dye bias, and writes
# it out again in EXPANDS format. There are basic filters that can be
# used, and it also can be filtered against the exome data. Files with
# no filter, the basic filters, and basic+exome are all written out.
dye_median = 0.5237 #from running median_calc.py; June 8th, 2016
blood_be_diff_threshold = 0.03 #from running FilterFinder.py; June 17; 2016
het_threshold = 0.18 #from running FilterFinder_blood_call.py; June 17, 2016
ex_snp_diff_threshold = 0.4 #from running FilterFinder_exome_diff.py; June 17, 2016
dbcorr = True
# read the probeset file, which correlates name to position.
infilename = "probe_set_build37_forpartek.txt"
infile = open(infilename,"r")
indata = infile.readlines()
infile.close()
samples = ["1047", "163", "222", "230", "312", "391", "611", "660", "729", "732", "824", "930"]
#samples = ["1047"]
labels = {}
rev_labels = {}
for line in indata:
line = line.rstrip().split()
if (len(line) == 3):
(id, chr, pos) = line[0:3]
if chr=="X":
chr = "23"
if chr=="Y":
chr = "24"
labels[id] = (chr, pos)
rev_labels[(chr, pos)] = id
for sample in samples:
#Read the SNP data file
SNPfilename = "SNP_data_orig/" + sample + "_BAFs"
infile = open(SNPfilename,"r")
indata = infile.readlines()
infile.close()
#Process the SNP data
SNPs = {}
labelrow = indata[0].rstrip().split("\t")
BErow = indata[1].rstrip().split("\t")
bloodrow = indata[2].rstrip().split("\t")
for col in range(len(labelrow)):
if labelrow[col] in labels:
SNPs[labels[labelrow[col]]] = (BErow[col], bloodrow[col])
#print labels[labelrow[col]]
#Process the exome data.
exomes = {}
exomefilename = "exome_data_reformatted/" + sample + "_full.baf"
infile = open(exomefilename, "r")
indata = infile.readlines()
infile.close()
for exline in indata:
exline = exline.rstrip().split("\t")
if exline[1] == "startpos":
continue
labelpair = (exline[0], exline[1])
exomes[labelpair] = (exline[3], exline[5])
#Now go through the SNP data and export different files for different filters
dbcorrstr = ""
if (dbcorr):
dbcorrstr = "_dbcorr"
outfn_0f = "SNP_data_filtered/" + sample + dbcorrstr + "_diff_filter.txt"
outfn_1f = "SNP_data_filtered/" + sample + dbcorrstr + "_basic_filters.txt"
outfn_2f = "SNP_data_filtered/" + sample + dbcorrstr + "_basic_and_exome_filters.txt"
outf_0f = open(outfn_0f, "w")
outf_1f = open(outfn_1f, "w")
outf_2f = open(outfn_2f, "w")
outline = "chr\tstartpos\tendpos\tAF_Tumor\tPN_B\tAF_Blood\n"
outf_0f.write(outline)
outf_1f.write(outline)
outf_2f.write(outline)
for label in SNPs.items():
(chr, pos) = label[0]
(SNP_BE, SNP_blood) = label[1]
if (SNP_BE == "?" or SNP_blood == "?"):
continue
SNP_BE = float(SNP_BE)
SNP_blood = float(SNP_blood)
snpid = rev_labels[label[0]]
if (snpid[0:3] != "cnv" and dbcorr):
#dye bias correction, BE:
if (SNP_BE < dye_median):
SNP_BE = SNP_BE * 0.5/dye_median
else:
SNP_BE = 0.5 + 0.5*(SNP_BE-dye_median)/(1-dye_median)
#dye bias correction, blood:
if (SNP_blood < dye_median):
SNP_blood = SNP_blood * 0.5/dye_median
else:
SNP_blood = 0.5 + 0.5*(SNP_blood-dye_median)/(1-dye_median)
#Call for whether blood is heterozygous or not:
ishet = "0"
if (SNP_blood > 0.25 and SNP_blood < 0.75):
ishet = "1"
#If the two are not significantly different, don't write it out anywhere.
if (abs(SNP_blood - SNP_BE) < blood_be_diff_threshold):
continue
#if the *called* blood frequency isn't significantly different from BE, don't write that out either.
if (ishet == "0"):
if (SNP_BE > 1-blood_be_diff_threshold or SNP_BE < blood_be_diff_threshold):
continue
else:
if (abs(SNP_BE-0.5) < blood_be_diff_threshold):
continue
#If need be, flip which allele is 'B' so that the blood is the lower frequency
if (SNP_blood > SNP_BE):
SNP_BE = 1-SNP_BE
SNP_blood = 1-SNP_blood
#Create the output line that we may write out:
outline = chr + "\t" + pos + "\t" + pos + "\t" + str(SNP_BE) + "\t" + ishet + "\t" + str(SNP_blood) + "\n"
#Write this out to the no-filter file
outf_0f.write(outline)
#if the blood cannot be called as heterozygous or homozygous, don't write that out to a filtered file:
if (SNP_blood > 0.0+het_threshold and SNP_blood < 0.5-het_threshold):
continue
if (SNP_blood > 0.5+het_threshold and SNP_blood < 1.0-het_threshold):
continue
#We're done with our first filter; write this out.
outf_1f.write(outline)
#now try to find the same data in the exome:
exome = exomes.get(label[0])
if (exome == None):
continue
(exomeBE, exomeBlood) = exome
exomeBE = float(exomeBE)
exomeBlood = float(exomeBlood)
#if the two measures signifiantly differ, don't write them out:
if (abs(SNP_BE - exomeBE) + abs(SNP_blood - exomeBlood) > ex_snp_diff_threshold):
continue
#This was our last filter; write it out.
outf_2f.write(outline)