-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_ploidy_estimates_from_pASCAT.py
197 lines (168 loc) · 7 KB
/
create_ploidy_estimates_from_pASCAT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 12 10:50:09 2017
@author: lpsmith
"""
from __future__ import division
from os import walk
import numpy
import lucianSNPLibrary as lsl
#Use this value to set up whether to use the 'rejoined' segments or not
flow_file = "FlowDataForChallenge.txt"
ploidy_key = "ploidy"
purity_key = "cont"
psi_key = "psi"
goodness_key = "goodness"
unconstrained_key = "unconstrained"
diploid_key = "diploid"
tetraploid_key = "tetraploid"
for tag in ["g250_diploid"]:#["combined_avSNPs", "combined_avSNPs_only1M", "combined_avSNPs_only25M"]:
print tag
root_dir = "pASCAT_output_" + tag + "/"
outfile = "joint_summary_" + tag + ".txt"
print outfile
ASCATvals = [ploidy_key, purity_key, psi_key, goodness_key]
constraints = [unconstrained_key, diploid_key, tetraploid_key]
values = {}
patient_samples = set()
for aval in ASCATvals:
values[aval] = {}
for constraint in constraints:
values[aval][constraint] = {}
for constraint in constraints:
files = []
for (__, __, f) in walk(root_dir + constraint + "/"):
files = f
for f in files:
for ASCATval in ASCATvals:
if f.find(ASCATval) != -1:
(patient, __, __, __) = f.split("_")
infile = open(root_dir + constraint + "/" + f, "r")
for line in infile:
if line.find("x") != -1:
continue
lvec = line.split('"')
(patient, sample) = lvec[1].split("_")
val = float(lvec[2])
values[ASCATval][constraint][(patient, sample)] = val
patient_samples.add((patient, sample))
patient_samples = sorted(patient_samples)
flow_data = {}
flowfile = open(flow_file, "r")
for line in flowfile:
if line.find("Random") != -1:
continue
(patient, __, __, __, __, __, bxg2, a1Ploidy, percentA1, a2Ploidy, percentA2, __) = line.split("\t")
if patient not in flow_data:
flow_data[patient] = [0, 0, [], set()]
if a1Ploidy == "":
flow_data[patient][0] += 1
flow_data[patient][2].append(0)
flow_data[patient][3].add(2.0)
else:
flow_data[patient][3].add(float(a1Ploidy))
if percentA1 != "":
flow_data[patient][1] += 1
percentA1 = float(percentA1)
if (percentA2 != ""):
percentA1 += float(percentA2)
flow_data[patient][2].append(percentA1)
if a2Ploidy != "":
flow_data[patient][3].add(float(a2Ploidy))
for patient in flow_data:
flow_data[patient][2] = numpy.average(flow_data[patient][2])
def hasCloseEntryIn(key, values):
for value in values:
if abs(key-value) < 0.2:
return True
return False
uncon_key = "unconstrained matches"
four_key = "exactly four"
#bpurity_key = "best purity"
dclose_key = "has close to diploid entry"
tclose_key = "has close to tetraploid entry"
ndreads = "diploid:non-diploid flows"
goodEv_key = "goodness diff"
evidence_types = [uncon_key, four_key, dclose_key, tclose_key, ndreads, goodEv_key]
evidence = {}
for etype in evidence_types:
evidence[etype] = {}
for p_s in patient_samples:
#Evidence that the unconstrained run matches one of the other runs
if p_s in values[ploidy_key][unconstrained_key] and p_s in values[ploidy_key][diploid_key]:
if values[ploidy_key][unconstrained_key][p_s] == values[ploidy_key][diploid_key][p_s]:
evidence[uncon_key][p_s] = diploid_key
if p_s in values[ploidy_key][unconstrained_key] and p_s in values[ploidy_key][tetraploid_key]:
if values[ploidy_key][unconstrained_key][p_s] == values[ploidy_key][tetraploid_key][p_s]:
evidence[uncon_key][p_s] = tetraploid_key
#Evidence that the tetraploid result is exactly 4:
if p_s in values[ploidy_key][tetraploid_key]:
val = values[ploidy_key][tetraploid_key][p_s]
if val < 4.1 and val > 3.9:
evidence[four_key][p_s] = diploid_key
else:
evidence[four_key][p_s] = tetraploid_key
#Evidence for the best purity:
# uval = values[purity_key][unconstrained_key][p_s]
# dval = 0
# tval = 0
# best = unconstrained_key
# if p_s in values[purity_key][diploid_key]:
# dval = values[purity_key][diploid_key][p_s]
# if p_s in values[purity_key][tetraploid_key]:
# tval = values[purity_key][tetraploid_key][p_s]
# if dval >= uval and dval > tval:
# best = diploid_key
# elif tval >= uval and tval > dval:
# best = tetraploid_key
# elif tval >= uval and tval == dval:
# best = "equal"
# evidence[bpurity_key][p_s] = best
#Evidence for particular ploidy showing up in flow data:
if p_s[0] in flow_data:
flow = flow_data[p_s[0]]
if p_s in values[ploidy_key][diploid_key]:
if hasCloseEntryIn(values[ploidy_key][diploid_key][p_s], flow[3]):
evidence[dclose_key][p_s] = "True"
else:
evidence[dclose_key][p_s] = "False"
if p_s in values[ploidy_key][tetraploid_key]:
if hasCloseEntryIn(values[ploidy_key][tetraploid_key][p_s], flow[3]):
evidence[tclose_key][p_s] = "True"
else:
evidence[tclose_key][p_s] = "False"
evidence[ndreads][p_s] = str(flow[0]) + "::"+str(flow[1])
else:
evidence[ndreads][p_s] = "0::0"
#Evidence: goodness of fit difference
if p_s in values[goodness_key][tetraploid_key]:
tetval = values[goodness_key][tetraploid_key][p_s]
if p_s in values[goodness_key][diploid_key]:
dipval = values[goodness_key][diploid_key][p_s]
evidence[goodEv_key][p_s] = str(dipval - tetval)
else:
evidence[goodEv_key][p_s] = "NA"
out = open(outfile, "w")
out.write("patient\tsample")
for constraint in constraints:
for ASCATval in ASCATvals:
out.write("\t" + constraint + "_" + ASCATval)
for etype in evidence_types:
out.write("\t" + etype)
out.write("\n")
for p_s in patient_samples:
out.write(p_s[0] + "\t" + p_s[1])
for constraint in constraints:
for ASCATval in ASCATvals:
if p_s in values[ASCATval][constraint]:
out.write("\t" + str(values[ASCATval][constraint][p_s]))
else:
out.write("\tNA")
for etype in evidence_types:
if p_s in evidence[etype]:
out.write("\t" + evidence[etype][p_s])
else:
out.write("\tNA")
out.write("\n")
out.close()