This repository was archived by the owner on Mar 12, 2022. It is now read-only.
forked from rvolden/Mandalorion-Episode-II
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdefineAndQuantifyIsoforms.py
249 lines (210 loc) · 9.03 KB
/
defineAndQuantifyIsoforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env python3
# Christopher Vollmers
# Roger Volden
import os
import sys
import numpy as np
path = sys.argv[2]
content_file = sys.argv[1]
upstream_buffer = int(sys.argv[4])
downstream_buffer = int(sys.argv[3])
minimum_read_count = 1
def find_peaks(starts, ends):
start_peaks = {}
end_peaks = {}
for position in sorted(starts):
if list(starts).count(position) >= minimum_read_count:
if not start_peaks.get(position):
for shift in np.arange(-upstream_buffer, downstream_buffer):
start_peaks[position+shift] = position
for position in sorted(ends, reverse=True):
if list(ends).count(position) >= minimum_read_count:
if not end_peaks.get(position):
for shift in np.arange(-downstream_buffer, upstream_buffer, 1):
end_peaks[position+shift] = position
return start_peaks, end_peaks
def collect_splice_events(path):
splice_dict = {}
for line in open(path + '/SS.bed'):
a = line.strip().split('\t')
chromosome = a[0]
if not splice_dict.get(chromosome):
splice_dict[chromosome] = {}
splice_left =int(a[1])
splice_right = int(a[2])
for base in np.arange(splice_left, splice_right+1):
splice_dict[chromosome][base] = a[3].split('_')[0]
return splice_dict
def sort_reads_into_splice_junctions(content_file, splice_dict,
fasta_file, infile):
start_end_dict = {}
readDict = {}
tempSeqs, headers, sequences = [], [], []
for line in open(fasta_file):
line = line.rstrip()
if not line:
continue
if line.startswith('>'):
headers.append(line.split()[0][1:])
# covers the case where the file ends while reading sequences
if line.startswith('>'):
sequences.append(''.join(tempSeqs).upper())
tempSeqs = []
else:
tempSeqs.append(line)
# fasta_file.close()
sequences.append(''.join(tempSeqs).upper())
sequences = sequences[1:]
for i in range(len(headers)):
readDict[headers[i].split('_')[0]] = [headers[i], sequences[i]]
read_dict = readDict
print(len(read_dict))
for line in open(infile):
a = line.strip().split('\t')
chromosome, read_direction = a[13], a[8]
name = a[9].split('_')[0]
read_direction = '+' ### Ignores read direction
start, end = int(a[15]), int(a[16])
if read_direction == '+':
left_extra, right_extra = int(a[11]), int(a[10]) - int(a[12])
if read_direction == '-':
right_extra, left_extra = int(a[11]), int(a[10]) - int(a[12])
failed = False
identity = chromosome + '_'
blocksizes = a[18].split(',')[:-1]
blockstarts = a[20].split(',')[:-1]
readstarts = a[19].split(',')[:-1]
for x in range(0, len(blocksizes)-1):
blockstart = int(blockstarts[x])
blocksize = int(blocksizes[x])
left_splice = blockstart + blocksize
right_splice = int(blockstarts[x+1])
if right_splice - left_splice > 50:
try:
left_splice_site = splice_dict[chromosome][left_splice]
except:
failed = True
try:
right_splice_site = splice_dict[chromosome][right_splice]
except:
failed = True
if not failed:
identity += str(left_splice_site) + '-' \
+ str(right_splice_site) + '~'
if '5r133331' in identity:
print(identity,failed)
if not failed:
if not start_end_dict.get(identity):
start_end_dict[identity] = []
start_end_dict[identity].append((start, end,
'>' + read_dict[name][0] + '\n'
+ read_dict[name][1] + '\n',
left_extra,
right_extra,
read_direction))
return start_end_dict
def define_start_end_sites(start_end_dict, individual_path, subreads):
left_extras, right_extras = {}, {}
file_set = set()
isoform_counter, isoform_dict = 0, {}
for identity in start_end_dict:
starts=[]
ends=[]
positions = start_end_dict[identity]
print(identity, len(positions))
for position in positions:
starts.append(int(position[0]))
ends.append(int(position[1]))
start_dict, end_dict =find_peaks(starts, ends)
matched_positions = []
combination_counts = {}
left_extras[identity], right_extras[identity] = {}, {}
for start, end, read, left_extra, right_extra, read_direction in positions:
try:
left = start_dict[int(start)]
right = end_dict[int(end)]
if not left_extras[identity].get((left, right)):
left_extras[identity][(left,right)] = []
right_extras[identity][(left,right)] = []
left_extras[identity][(left, right)].append(int(left_extra))
right_extras[identity][(left, right)].append(int(right_extra))
if not combination_counts.get((left, right)):
combination_counts[(left, right)] = 1
else:
combination_counts[(left, right)] += 1
matched_positions.append((left, right, read, read_direction))
except:
pass
for left, right, read, read_direction in matched_positions:
medianLeft = np.median(left_extras[identity][(left, right)])
medianRight = np.median(right_extras[identity][(left, right)])
new_identity = identity + '_' + str(left) \
+ '_' + str(right) + '_' \
+ str(round(medianLeft, 2)) \
+ '_' + str(round(medianRight, 2))
if not isoform_dict.get(new_identity):
isoform_counter += 1
isoform_dict[new_identity] = isoform_counter
subfolder = str(int(isoform_dict[new_identity]/4000))
if subfolder not in os.listdir(individual_path+'/parsed_reads/'):
os.makedirs(individual_path +'/parsed_reads/'+ subfolder)
filename = subfolder + '/Isoform' + str(isoform_dict[new_identity])
out_reads_fasta = open(individual_path + '/parsed_reads/'
+ filename + '.fasta', 'a')
out_reads_subreads = open(individual_path + '/parsed_reads/'
+ filename + '_subreads.fastq', 'a')
out_reads_fasta.write(read)
out_reads_fasta.close()
file_set.add(individual_path + '/parsed_reads/' + filename
+ '.fasta' + '\t' + individual_path
+ '/parsed_reads/' + filename + '_subreads.fastq'
+ '\t' + new_identity + '\n')
read = read.split('\n')[0][1:]
subread_list = subreads[read]
for subread, sequence, qual in subread_list:
out_reads_subreads.write(subread + '\n' + sequence
+ '\n+\n' + qual +'\n')
out_reads_subreads.close()
out = open(individual_path + 'isoform_list', 'w')
for item in file_set:
out.write(item)
out.close()
def read_subreads(seq_file, infile):
read_seq = {}
length = 0
for line2 in open(seq_file):
length += 1
seq_file_open = open(seq_file,'r')
counter = 0
for line in open(infile):
name = line.strip().split('\t')[9].split('_')[0]
read_seq[name] = []
while counter < length:
name = seq_file_open.readline().strip()
seq = seq_file_open.readline().strip()
plus = seq_file_open.readline().strip()
qual = seq_file_open.readline().strip()
root_name = name[1:].split('_')[0]
try:
read_seq[root_name].append((name, seq, qual))
except:
pass
counter += 4
return read_seq
def main():
for line in open(content_file):
b = line.strip().split('\t')
print(b)
infile = b[0]
fasta_file = b[1]
individual_path = b[2]
subreads_file = b[3]
os.system('mkdir ' + individual_path + '/parsed_reads')
os.system('rm -r ' + individual_path + '/parsed_reads/*')
subreads = read_subreads(subreads_file, infile)
splice_dict = splice_dict = collect_splice_events(path)
start_end_dict = sort_reads_into_splice_junctions(content_file, splice_dict,
fasta_file, infile)
define_start_end_sites(start_end_dict, individual_path, subreads)
if __name__ == '__main__':
main()