-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcollect_purity_ploidy.py
111 lines (99 loc) · 3.51 KB
/
collect_purity_ploidy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 4 15:43:25 2017
@author: lpsmith
"""
#Take *all* BAF and CN data and create expands input.
from __future__ import division
from os import walk
from os import path
from os import mkdir
from os.path import isfile
from random import shuffle
import lucianSNPLibrary as lsl
gdir = "gamma_test_output/pASCAT_input_"
outfile = "purities_and_ploidies.tsv"
def getOddsAndPloidies():
oddsfile = "calling_evidence_challenge_inc_odds.tsv"
oddsAndPloidies = {}
for line in open(oddsfile, "r"):
if "Patient" in line:
continue
lvec = line.rstrip().split("\t")
(patient, sample) = lvec[0:2]
if patient not in oddsAndPloidies:
oddsAndPloidies[patient] = {}
#first check if the hutch made a decision about this sample:
odds = float(lvec[-2])
ploidy = lvec[-1]
if ploidy == "Unknown":
if odds >= 0.5:
ploidy = "Diploid"
else:
ploidy = "Tetraploid"
oddsAndPloidies[patient][sample] = (odds, ploidy)
return oddsAndPloidies
def getPloidies(gdir, patient, ploidies):
fname = gdir + patient + "_fcn_ascat_ploidy.txt"
if not isfile(fname):
return
pfile = open(fname, "r")
for line in pfile:
if "x" in line:
continue
(__, sample, ploidy) = line.split('"')
ploidy = float(ploidy)
sample = sample.split("_")[1]
ploidies[sample] = ploidy
def getPurities(gdir, patient, purities):
fname = gdir + patient + "_fcn_ascat_cont.txt"
if not isfile(fname):
return
pfile = open(gdir + patient + "_fcn_ascat_cont.txt", "r")
for line in pfile:
if "x" in line:
continue
(__, sample, purity) = line.split('"')
purity = float(purity)
sample = sample.split("_")[1]
purities[sample] = purity
def writeHeaders(out):
out.write("Patient")
out.write("\tSample")
out.write("\tOdds diploid better")
out.write("\tPurity")
out.write("\tPloidy")
out.write("\n")
out = open(outfile, "w")
writeHeaders(out)
ploidies = {}
purities = {}
oddsAndPloidies = getOddsAndPloidies()
for patient in oddsAndPloidies:
ploidies[patient] = {}
purities[patient] = {}
for constraint in ["diploid", "tetraploid"]:
ploidies[patient][constraint] = {}
purities[patient][constraint] = {}
gamma = "g" + lsl.getGammaFor(patient)
ddir = gdir + gamma + "/diploid/"
tdir = gdir + gamma + "/tetraploid/"
getPloidies(ddir, patient, ploidies[patient]["diploid"])
getPurities(ddir, patient, purities[patient]["diploid"])
getPloidies(tdir, patient, ploidies[patient]["tetraploid"])
getPurities(tdir, patient, purities[patient]["tetraploid"])
for sample in oddsAndPloidies[patient]:
out.write(patient)
out.write("\t" + sample)
out.write("\t" + str(oddsAndPloidies[patient][sample][0]))
if oddsAndPloidies[patient][sample][1] == "Diploid":
out.write("\t" + str(purities[patient]["diploid"][sample]))
out.write("\t" + str(ploidies[patient]["diploid"][sample]))
elif oddsAndPloidies[patient][sample][1] == "Tetraploid":
out.write("\t" + str(purities[patient]["tetraploid"][sample]))
out.write("\t" + str(ploidies[patient]["tetraploid"][sample]))
else:
print("Unknown solution:", oddsAndPloidies[patient][sample][1], "for", patient, sample)
assert(False)
out.write("\n")