-
Notifications
You must be signed in to change notification settings - Fork 0
/
needleman-wunsch-f-matrix-v2.py
118 lines (100 loc) · 5.09 KB
/
needleman-wunsch-f-matrix-v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# coding=utf8
import numpy
# The alphabet defines the order of amino acids used for the score matrix
# so A is the first character, R is the second etc.
alphabet = "ARNDCQEGHILKMFPSTWYV"
# The BLOSUM50 matrix in third-bit units, or 3log2(odds ratio)
# We store it as a 2D array using the numpy package
blosum50 = numpy.array([
[ 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-2,-1,-1,-3,-1, 1, 0,-3,-2, 0],
[-2, 7,-1,-2,-4, 1, 0,-3, 0,-4,-3, 3,-2,-3,-3,-1,-1,-3,-1,-3],
[-1,-1, 7, 2,-2, 0, 0, 0, 1,-3,-4, 0,-2,-4,-2, 1, 0,-4,-2,-3],
[-2,-2, 2, 8,-4, 0, 2,-1,-1,-4,-4,-1,-4,-5,-1, 0,-1,-5,-3,-4],
[-1,-4,-2,-4,13,-3,-3,-3,-3,-2,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1],
[-1, 1, 0, 0,-3, 7, 2,-2, 1,-3,-2, 2, 0,-4,-1, 0,-1,-1,-1,-3],
[-1, 0, 0, 2,-3, 2, 6,-3, 0,-4,-3, 1,-2,-3,-1,-1,-1,-3,-2,-3],
[ 0,-3, 0,-1,-3,-2,-3, 8,-2,-4,-4,-2,-3,-4,-2, 0,-2,-3,-3,-4],
[-2, 0, 1,-1,-3, 1, 0,-2,10,-4,-3, 0,-1,-1,-2,-1,-2,-3, 2,-4],
[-1,-4,-3,-4,-2,-3,-4,-4,-4, 5, 2,-3, 2, 0,-3,-3,-1,-3,-1, 4],
[-2,-3,-4,-4,-2,-2,-3,-4,-3, 2, 5,-3, 3, 1,-4,-3,-1,-2,-1, 1],
[-1, 3, 0,-1,-3, 2, 1,-2, 0,-3,-3, 6,-2,-4,-1, 0,-1,-3,-2,-3],
[-1,-2,-2,-4,-2, 0,-2,-3,-1, 2, 3,-2, 7, 0,-3,-2,-1,-1, 0, 1],
[-3,-3,-4,-5,-2,-4,-3,-4,-1, 0, 1,-4, 0, 8,-4,-3,-2, 1, 4,-1],
[-1,-3,-2,-1,-4,-1,-1,-2,-2,-3,-4,-1,-3,-4,10,-1,-1,-4,-3,-3],
[ 1,-1, 1, 0,-1, 0,-1, 0,-1,-3,-3, 0,-2,-3,-1, 5, 2,-4,-2,-2],
[ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 2, 5,-3,-2, 0],
[-3,-3,-4,-5,-5,-1,-3,-3,-3,-3,-2,-3,-1, 1,-4,-4,-3,15, 2,-3],
[-2,-1,-2,-3,-3,-1,-2,-3, 2,-1,-1,-2, 0, 4,-3,-2,-2, 2, 8,-1],
[ 0,-3,-3,-4,-1,-3,-3,-4,-4, 4, 1,-3, 1,-1,-3,-2, 0,-3,-1, 5],
])
# Specify a linear GAP penalty of -8
gap_penalty = -8
# Our X and Y sequences to align
x = "MAMRLLKTHL"
y = "MKNITCYL"
# Set up our F-matrix using a two dimensional score_matrix to record the
# intermediate values and a three dimensional pointer_matrix to record the
# arrows. The first and second axes of the score_matrix and pointer_matrix
# correspond to the X and Y sequences respectively. The third axis of the
# pointer_matrix records the presence (1) or absence (0) of arrows in the
# diagonal (align both), left (align X with gap), and up (align Y with gap) at
# indices 0, 1 and 2 respectively. Because with have the first row and first
# column filled in with gap penalties, the row and column indices
# corresponding to the X and Y residues will begin at 1.
score_matrix = numpy.zeros((len(x) + 1, len(y) + 1), dtype = int)
pointer_matrix = numpy.zeros((len(x) + 1, len(y) + 1, 3), dtype = int)
# Fill in first row and column with the linear gap penalties
for xi, xaa in enumerate(x):
score_matrix[xi + 1, 0] = gap_penalty * (xi + 1)
pointer_matrix[xi + 1, 0, 1] = 1
for yi, yaa in enumerate(y):
score_matrix[0, yi + 1] = gap_penalty * (yi + 1)
pointer_matrix[0, yi + 1, 2] = 1
# Fill in middle values starting, from left to right in the top row, then from
# left to right in the second row and so on. The yi and xi indices begin at 0.
for yi, yaa in enumerate(y):
for xi, xaa in enumerate(x):
# Get the index (position) of the X and Y amino acids in our alphabet
xaai = alphabet.index(xaa)
yaai = alphabet.index(yaa)
# These are the indices to the X and Y positions in the F-matrix,
# which begin at 1 because of the the first row and first column of
# gap penalties.
fxi = xi + 1
fyi = yi + 1
# Look up the score for the X and Y pair of residues in our BLOSUM50 matrix
residue_score = blosum50[xaai, yaai]
# Calculate the possible scores for aligning both residues...
no_gap_score = score_matrix[fxi - 1, fyi - 1] + residue_score # (fxi - 1, fyi - 1) is the diagonally up and left cell
# ...and for aligning the X residue with a gap...
x_gap_score = score_matrix[fxi - 1, fyi] + gap_penalty # (fxi - 1, fyi) is the cell to the left of the current cell
# ...and for aligning the Y residue with a gap.
y_gap_score = score_matrix[fxi, fyi - 1] + gap_penalty # (fxi, fyi - 1) is the cell above the current cell
# Identify the maximum score value from among the three options...
max_score = max(no_gap_score, x_gap_score, y_gap_score)
# ...and set the score of this F-matrix cell to that score.
score_matrix[fxi, fyi] = max_score
# Add arrows for any direction where choosing that direction
# will result in the maximum score for this F-matrix cell
if no_gap_score == max_score:
pointer_matrix[fxi, fyi, 0] = 1
if x_gap_score == max_score:
pointer_matrix[fxi, fyi, 1] = 1
if y_gap_score == max_score:
pointer_matrix[fxi, fyi, 2] = 1
# Print the Needleman-Wunsch F-matrix with scores and pointers to the command line
x_length = len(x) + 1
y_length = len(y) + 1
print_matrix = numpy.zeros([y_length * 2, x_length * 2], dtype = "U3")
print_matrix.fill(" ")
for xi in range(x_length):
for yi in range(y_length):
print_matrix[yi * 2 + 1][xi * 2 + 1] = "%3d" % score_matrix[xi][yi]
if pointer_matrix[xi][yi][0]: # diagonal arrow
print_matrix[yi * 2][xi * 2] = u" ↖ "
if pointer_matrix[xi][yi][1]: # left arrow
print_matrix[yi * 2 + 1][xi * 2] = u" ← "
if pointer_matrix[xi][yi][2]: # up arrow
print_matrix[yi * 2][xi * 2 + 1] = u" ↑ "
for row in print_matrix[1:]:
print("".join(row[1:]))