-
Notifications
You must be signed in to change notification settings - Fork 1
/
align_nw.py
executable file
·121 lines (109 loc) · 3.7 KB
/
align_nw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import sys
def theta(a, b):
if a == '-' or b == '-' or a != b: # gap or mismatch
return -1
elif a == b: # match
return 1
def make_score_matrix(seq1, seq2):
"""
return score matrix and map(each score from which direction)
0: from diagonal
1: from left
2: from up
"""
seq1 = '-' + seq1
seq2 = '-' + seq2
score_mat = {}
trace_mat = {}
for i,p in enumerate(seq1):
score_mat[i] = {} # matrix with two layer dict
trace_mat[i] = {}
for j,q in enumerate(seq2):
if i == 0: # first row, gap in seq1
score_mat[i][j] = -j
trace_mat[i][j] = 1
continue
if j == 0: # first column, gap in seq2
score_mat[i][j] = -i
trace_mat[i][j] = 2
continue
ul = score_mat[i-1][j-1] + theta(p, q) # from up-left, mark 0
l = score_mat[i][j-1] + theta('-', q) # from left, mark 1, gap in seq1
u = score_mat[i-1][j] + theta(p, '-') # from up, mark 2, gap in seq2
picked = max([ul,l,u])
score_mat[i][j] = picked # record value
trace_mat[i][j] = [ul, l, u].index(picked) # record direction
return score_mat, trace_mat
def traceback(seq1, seq2, trace_mat):
'''
find one optimal traceback path from trace matrix, return path code
-!- CAUTIOUS: if multiple equally possible path exits, only return one of them -!-
'''
seq1, seq2 = '-' + seq1, '-' + seq2
i, j = len(seq1) - 1, len(seq2) - 1
path_code = ''
while i > 0 or j > 0:
direction = trace_mat[i][j]
if direction == 0: # from diagonal
i = i-1
j = j-1
path_code = '0' + path_code
elif direction == 1: # from left
j = j-1
path_code = '1' + path_code
elif direction == 2: # from up
i = i-1
path_code = '2' + path_code
return path_code
def print_m(seq1, seq2, m):
"""print score matrix or trace matrix"""
seq1 = '-' + seq1; seq2 = '-' + seq2
print()
print(' '.join(['%3s' % i for i in ' '+seq2]))
for i, p in enumerate(seq1):
line = [p] + [m[i][j] for j in range(len(seq2))]
print(' '.join(['%3s' % i for i in line]))
print()
return
def pretty_print_align(seq1, seq2, path_code):
'''
return pair alignment result string from
path code: 0 for match, 1 for gap in seq1, 2 for gap in seq2
'''
align1 = ''
middle = ''
align2 = ''
for p in path_code:
if p == '0':
align1 = align1 + seq1[0]
align2 = align2 + seq2[0]
if seq1[0] == seq2[0]:
middle = middle + '|'
else:
middle = middle + ' '
seq1 = seq1[1:]
seq2 = seq2[1:]
elif p == '1':
align1 = align1 + '-'
align2 = align2 + seq2[0]
middle = middle + ' '
seq2 = seq2[1:]
elif p == '2':
align1 = align1 + seq1[0]
align2 = align2 + '-'
middle = middle + ' '
seq1 = seq1[1:]
print(' %s\n %s \n %s' % (align1, middle, align2))
return
def main():
try:
seq1, seq2 = sys.argv[1:3]
except:
print('Usage: python align_nw.py seq1 seq2')
print('--Demo:--')
seq1, seq2 = 'TCATC','TCATGGC'
score_mat, trace_mat = make_score_matrix(seq1, seq2)
path_code = traceback(seq1, seq2, trace_mat)
pretty_print_align(seq1, seq2, path_code)
if __name__ == '__main__':
main()