-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassignment1_joris.py
324 lines (259 loc) · 10.9 KB
/
assignment1_joris.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
#!/usr/bin/env python
from __future__ import division
"""
This script implements the Needleman-Wunsch algorithm
to align protein seqences
"""
__author__ = "Joris van Steenbrugge (950416798110)"
__email__ = " [email protected]"
import argparse
import resource
blosum = """
# http://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt
# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
"""
def parse_arguments():
"""Return parse arguments object
Parses the separate sequences, end_gap penalty, regular gap penalty.
Only seqA and seqB are required as both gap penalties do have a default value.
"""
parser = argparse.ArgumentParser()
parser.add_argument("-seqA", dest = "seqA", required = True,
help = "The first protein sequence")
parser.add_argument("-seqB", dest = "seqB", required = True,
help = "The second protein sequence")
parser.add_argument("-gap", dest = "gap", required = False,
default = 4, type = int,
help = "The scoring penalty for introducing gaps in the alignment")
parser.add_argument("-end_gap_penalty", dest = "end_gap", required = False,
default = 2, type = int,
help = "The scoring penalty for introducing gaps at the start or beginning of the alignment")
return parser.parse_args()
def blosum62():
"""Return order and similarity scores from BLOSUM62 matrix
order: dict of {res: idx_in_matrix}
blosum_matrix: list of lists with similarity scores
"""
order = {}
blosum_matrix = []
for line in blosum.split('\n'):
if line.startswith('#'):
continue
if not line.strip():
continue
parts = line.strip().split()
if len(parts) == 24:
for idx, sym in enumerate(parts):
order[sym] = idx
else:
# list around the map construction for python3 compatibility
blosum_matrix.append(list(map(int,parts[1:])))
return order, blosum_matrix
BLOSUM62_ORDER, BLOSUM62_MATRIX = blosum62()
def score(res1, res2):
"""Return similarity score from BLOSUM62 matrix for two residues
Keyword Arguments:
res1 -- string, amino acid
res2 -- string, amino acid
"""
lookup1 = BLOSUM62_ORDER[res1]
lookup2 = BLOSUM62_ORDER[res2]
return BLOSUM62_MATRIX[lookup1][lookup2]
def score_at_position(i, j, sequences):
"""Returns the BLOSUM62 score of two residues at a specific position
Keyword Arguments:
i -- integer stringA index
j -- integer stringB index
sequences -- tuple reversely sorted on length
containing two protein sequences
"""
res_a = sequences[0][i - 1]
res_b = sequences[1][j - 1]
scr = score(res_a, res_b)
return scr
def gap_penalty(n, E):
"""Return the gap penalty for a certain length
Keyword Arguments:
n -- integer, index of gaps in a row
E -- integer, the gap penalty
"""
return ((-n) * E)
def create_empty_matrix(sequences, Nones= True):
"""Return
"""
m = len(sequences[0]) + 1
n = len(sequences[1]) + 1
if Nones:
return [[None for i in range(n)] for j in range(m)]
else:
return [ [ list() for i in range(n)] for j in range(m)]
def matrix_template(sequences, A = 2):
"""Fill the matrix's first row and first collumn with gap penalties.
Keyword Arguments:
sequences -- A tuple reversely sorted on length
containing two protein sequences
A -- End gap penalty score
Returns:
A matrix implemented as two dimensional array,
to be queried as matrix[row][col]
"""
#Create an empty matrix with the corresponding dimensions
matrix = create_empty_matrix(sequences)
matrix[0][0] = 0
#End/start gap penalties in each column
for i in range(1, len(matrix[0])):
matrix[0][i] = gap_penalty(i, A)
#End/start gap penalties in each row
for j in range(1, len(matrix)):
matrix[j][0] = gap_penalty(j, A)
return matrix
def parent_matrix_template(sequences):
"""Fill the parent matrix's first row and first collumn.
The positions are tracing back to position (0,0) so that traceback
function will retrain the whole sequences in the alingment.
Keyword Arguments:
sequences -- A tuple reversely sorted on length
containing two protein sequences
Returns:
A matrix implemented as two dimensional array,
to be queried as matrix[row][col]
"""
parent_matrix = create_empty_matrix(sequences, False)
parent_matrix[0][0] = (0,0)
for i in range(1, len(parent_matrix[0])):
parent_matrix[0][i] = (0,i -1)
for j in range(1, len(parent_matrix)):
parent_matrix[j][0] = (j - 1, 0)
return parent_matrix
def pretty_print(matrix):
"""Prints a two dimensional array tab separated to the console
"""
for line in matrix:
print "\t".join(map(str, line))
def fill_matrix(matrix, parent_matrix, sequences, E = 4, A = 2):
"""Fills the matrix with BLOSUM similarity scores or gaps.
The matrix is filled according to the Needleman-Wunsch algorithm.
each position in the matrix is filled with the best possible alignment
by looking at the previous positions, either left (diagonal left up)
and up of the current position.
Keyword Arguments:
matrix -- two dimensional matrix
parent -- two dimensionsl matrix (same dimensions as matrix)
E -- int, regular gap penalty
A -- int, end gap penalty
"""
#iterate all rows
for i in range(1, len(matrix)):
#iterate all collumns per row
for j in range(1, len(matrix[0])):
left = matrix[i][j-1] + gap_penalty(1, E)
up = matrix[i-1][j] + gap_penalty(1, E)
diag = matrix[i-1][j-1] + score_at_position(i, j, sequences)
#for each position, 3 possible scores are applicable,
#the highest score is chosen.
idx, value = max(enumerate((diag,left,up)), key = lambda x: x[1])
if idx == 0: # if diagonal up
parent = (i - 1, j - 1)
left_gap = False
up_gap = False
elif idx == 1: # if left
parent = (i, j - 1)
up_gap = False
elif idx == 2: # if up
parent = (i - 1, j)
left_gap = False
#assign the score to the matrix
matrix[i][j] = value
#assign which direction was chosen to the parent matrix
parent_matrix[i][j] = parent
return matrix, parent_matrix
def traceback(matrix, parent_matrix, sequences):
i = (len(matrix) - 1)
j = (len(matrix[0]) - 1)
alignA = ""
alignB = ""
seqA = sequences[1]
seqB = sequences[0]
while i != 0 and j != 0:
next_j = parent_matrix[i][j][1]
next_i = parent_matrix[i][j][0]
if j == next_j:
alignA += "-"
else:
alignA += seqA[j-1]
if i == next_i:
alignB += "-"
else:
alignB += seqB[i-1]
i = next_i
j = next_j
return alignA[::-1], alignB[::-1]
def get_identity(seq1, seq2):
"""Returns the percent identity between two aligned sequences
KeywordArguments:
seq1 -- string, aligned protein sequence
seq2 -- string, aligned protein sequence
"""
num_ident = len([1 for i in range(len(seq1)) if seq1[i] == seq2[i]])
return (num_ident * 100) / len(seq1)
def align(seq1, seq2, E = 4, A = 2):
sequences = sorted((seq1, seq2), key = len, reverse = False)
matrix = matrix_template(sequences, A)
parent_matrix = parent_matrix_template(sequences)
matrix, parent_matrix = fill_matrix(matrix, parent_matrix, sequences, E, A)
aligned_seq1, aligned_seq2 = traceback(matrix, parent_matrix, sequences)
alignment_indicator = []
for i in range(len(aligned_seq2)):
if aligned_seq1[i] == aligned_seq2[i]:
alignment_indicator.append('|')
else:
alignment_indicator.append(' ')
print(aligned_seq1)
print("".join(alignment_indicator))
print(aligned_seq2)
print("")
print("Identity: {:.4f}".format(get_identity(
aligned_seq1, aligned_seq2)
)
)
#Printing the score
i = (len(matrix) - 1)
j = (len(matrix[0]) - 1)
print("Alignment Score: {}".format(str(matrix[i][j])))
if __name__ == "__main__":
args = parse_arguments()
seq1 = "THISLINE"
seq2 = "ISALIGNED"
align(args.seqA, args.seqB, E = args.gap,
A = args.end_gap)
print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000)