-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4_hom_stats_parts.py
executable file
·260 lines (203 loc) · 10.4 KB
/
4_hom_stats_parts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#################################################
# USAGE
#
# Given a dataset, this code will compute the betti numbers for the point clouds created
# from the aCGH data from either all full arms or all sections of every arm and by patient.
# To speed up the calculations 2_cgh_dictionary_cytoband.R splits the the work into parts
# that can be run simultaneously.
#
# We utilize code from the Comptop group at Stanford (jplex) to compute the homology:
# http://comptop.stanford.edu/u/programs/jplex/index.html.
#
# Parameters: The epsilon increments for the filtration, epsIncr. The part number which
# refers to the proper dictionary file, partNum.
#
# Input: 1) The tab delimited transposed version of aCGH data arranged as mentioned in readme.docx.
# The dictionary file (from cgh_dictionary.R) which indicates the 0-based
# start and stop indices of the chromosome arms 2) Dictionary files
#
# Output:
# It creates two different kind of files:
# 1) Barcode files named Inter_2D_hom1_SET_ChromArm_patientNo_segNo.txt
# under ~/Research/Results/dataSet/action/2D/Homology/Chromosome_number (action: arms or sect)
# 2) Jagged files (B0_2D_SET_ChromArm_segNo.txt) with the value for the homology for each patient (row)
# at each increment for epsilon (columns). The Epsilon increment is saved for future reference in a
# text file named Epsilon_XXX.txt. This file and all the jagged files will be saved under:
# ~/Research/Results/SET/2D/Homology/B0_2D_SET_chrArm_seg.txt
import csv, math, sys, os, tempfile, subprocess, re, string
from functions_io import *
from functions_cgh import *
#################################################
# COMMAND LINE ARGUMENTS
# 1. dataSet (sj, climent, sim, etc.)
# 2. homDim (usually 1 or 2, which computes B0 or (B0 and B1), respectively.
# 3. partNum (a positive integer)
# 4. epsIncr (usually 0.01 or 0.05)
# 5. action (arms or sect) specify if you are running full arms or sections (as used in 2_cgh_dictionary_cytoband.R)
# note: dataSet will be expected within Research/dataSet but diccionaries within a subfolder named arms or sect
#
# EXAMPLE
# > python 4_hom_stats_parts.py set 1 2 0.01 action
dataSet = sys.argv[1]
homDim = int(sys.argv[2])
partNum = int(sys.argv[3])
epsIncr = round(float(sys.argv[4]), 3)
action = sys.argv[5]
#################################################
# FILE SPECIFIC FUNCTIONS
def makeDirectory(path):
if os.path.isdir(path):
return
os.makedirs(path)
#################################################
# BEGIN PROGRAM
# Determine base path
#begPath = os.path.join(os.getenv('HOME'), "Research")
# will take any starting point instead of HOME
current = os.getcwd()
begPath=os.path.abspath(os.path.join(current, os.pardir))
# Set the paths for the dictionary, and data files
dictFile = "%s_%s_dict_%d.txt" % (dataSet, action, partNum)
#dictPath = os.path.join(begPath, "Data",dataSet, dictFile)
dictPath = os.path.join(begPath, "Data", dataSet, action, dictFile)
dataFile = "%s_data.txt" % (dataSet)
dataPath = os.path.join(begPath, "Data", dataSet, dataFile)
# Read data and dictionary
inputList = readFile(dataPath, "\t", "float")
dictList = readFile(dictPath, "\t", "int")
# Get rid of the first two rows.
inputList.pop(0)
inputList.pop(0)
dictList.pop(0)
# Using dimension 2 as window
cloudDim = 2
# Establish the results directory
resultsPath = os.path.join(begPath, "Results", dataSet, action, repr(cloudDim)+"D", "Homology")
# Go through each row of the dictionary list to get all chromosome and arm combinations.
for row in dictList:
# Chromosome and arm assignments
chr = row[0]
arm = row[1]
# Beginning and ending assignments
beg = row[2]
end = row[3]
# Segment number
seg = row[5]
# Store the Betti numbers for all patients in a list containing as many lists as homDim - 1.
bettiNums = [[] for d in range(homDim)]
# Go through the individual samples.
for m in range(len(inputList)):
print "(Homology) On chromosome %d arm %s segment %d dimension %d and patient %d of %d" % (chr,arm, seg, cloudDim, (m+1), len(inputList))
#################################################
# Generate data for use in jPlex and run it.
#################################################
# Build the individual profile.
profile = build_individual_profile(inputList, m, beg, end)
# Build the cloud.
cloud = build_cloud(profile, cloudDim)
# Build the distance matrix from the cloud.
distanceMatrix = build_distance_matrix(cloud, cloudDim)
# Select as the largest epsilon, the largest distance divided by 3.
maxEps = max(map(max, distanceMatrix)) * 0.9
# Create the temporary file object which we'll reference now and again.
cloudFile = tempfile.NamedTemporaryFile(suffix = '.txt')
try:
# Write the cloud to the temporary file.
wTempTab(cloudFile, cloud)
# File to write the intervals. Writing is handled by beanshell.
makeDirectory(resultsPath + "/" + repr(chr))
intFile = "Inter_%dD_hom%d_%s_%d%s_pat%d_seg%d.txt" % (cloudDim, homDim, dataSet, chr, arm, (m+1), seg)
intPath = os.path.join(resultsPath, repr(chr), intFile)
# When you have time, try going back and using tempfile for this. Weird that you couldn't get it to work the first time.
# Create the bash file. The names will be unique so we can run in parallel with no problem.
bashName = "bash%d%s%d%d%d%d%s.bsh" % (chr, arm, beg, cloudDim, m, seg, dataSet)
bashFile = open(bashName, 'w')
bashFile.write('pcloud = Plex.EuclideanArrayData("'+cloudFile.name+'");\n')
bashFile.write('rips = Plex.RipsStream('+repr(epsIncr)+', '+repr(homDim)+', '+repr(maxEps)+', pcloud);\n')
bashFile.write('intervals = Plex.Persistence().computeIntervals(rips);\n')
bashFile.write('file = new FileOutputStream("'+intPath+'");\n')
bashFile.write('p = new PrintStream(file);\n')
bashFile.write('this.interpreter.setOut(p);\n')
bashFile.write('print(intervals);')
bashFile.close()
# Run jPlex with the temporary cloud and bash files. Need to wait() or we may delete the bash file before it runs.
plex = subprocess.Popen("java -Xmx1024m -cp plex.jar JTerm < "+bashFile.name, shell=True)
plex.wait()
# Remove the temporary bash file when we're done with it.
os.remove(bashName)
finally:
# Automatically clean up the cloud file
cloudFile.close()
#################################################
# Parse the interval output from jPlex
#################################################
# Open the homFile
homFile = open(intPath, 'r')
# Create regular expression object.
# ((?<=\[)\d) gives the dimension of the interval.
# (\d\.\d+) gives interval bounds (or starting point if it's inf)
pattern = re.compile("((?<=\[)\d)|(\d\.\d+)")
# Going to hold the cleaned up intervals
intervals = []
# Keep track of the maximum filtration time present
maxFilt = 0.0
# Go through each line of homFile and create an array: Dimension, Interval Start, Interval End
for line in homFile:
# Get the results of findall
result = pattern.findall(line)
tRow = []
# If the result is non-empty, we're going to want to get the stuff.
if len(result) != 0:
# This will always be the dimension
tRow.append(int(result[0][0]))
# This will always be the start of the interval
tRow.append(float(result[1][1]))
# This is the end of the interval
if len(result) == 3:
tRow.append(float(result[2][1]))
# Update maximum filtration time
if float(result[2][1]) > maxFilt:
maxFilt = float(result[2][1])
else:
tRow.append('inf')
intervals.append(tRow)
# Close homFile because we're done reading it. Delete file located at intPath (messy version).
homFile.close()
os.remove(intPath)
# Write to file located at intPath (clean version).
writeFile(intervals, intPath, "\t")
#################################################
# Collect incremental betti's from intervals
#################################################
# Store the betti numbers. This creates a list with homDim - 1 empty lists contained to hold the betti numbers.
indBettiNums = [[] for d in range(homDim)]
# Increment epsilons at the same incrementation when you made the bash script.
eps = epsIncr
# Go through eps = 0.005 until one increment after maxFilt
while (eps <= maxFilt + epsIncr):
# Store the temporary count of betti numbers
tBettis = [0 for d in range(homDim)]
# Go through the entries of the interval table and count if the epsilon is in the interval.
for entry in intervals:
if repr(entry[2]) == 'inf':
tBettis[entry[0]] = tBettis[entry[0]] + 1
elif entry[1] <= eps and eps <= entry[2]:
tBettis[entry[0]] = tBettis[entry[0]] + 1
# Append the counts for this epsilon
for i in range(len(indBettiNums)):
indBettiNums[i].append(tBettis[i])
eps = eps + epsIncr
# Append the patient's list of betti counts to the list containing all the patients (bettiNums)
for i in range(len(indBettiNums)):
bettiNums[i].append(indBettiNums[i])
# Write the betti counts for all the patients in this chromosome arm to a file.
for hDim in range(len(bettiNums)):
bettiPath = "%s/B%d_%dD_%s_%s_%d%s_seg%d.txt" % (resultsPath, hDim, cloudDim, dataSet, action, chr, arm, seg)
writeFile(bettiNums[hDim], bettiPath, "\t")
# Saving in a text file the epsilon increment used for the homology
EpsPath = "%s/Epsilon_%04.2f.txt" % (resultsPath, epsIncr)
EpsText = "Used epsilon=%04.2f" % (epsIncr)
makeDirectory(resultsPath)
EpsFile = open(EpsPath,"w")
EpsFile.write(EpsText)
EpsFile.close()