-
Notifications
You must be signed in to change notification settings - Fork 53
/
suffix_tree.py
198 lines (156 loc) · 5.5 KB
/
suffix_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
class Node:
def __init__(self, string, start, end):
"""
Suffix tree node.
:param string: whole string
:param start: index of beginning
:param end: index of end
"""
self.string = string
self.start = start
self.end = end
self.next = dict() # Link to child
self.counter = None # Count number of all childs
self.is_leaf = False # Is this node a leaf
def __len__(self):
return self.end - self.start
def __str__(self):
return self.string[self.start: self.end]
def split(self, i, node):
"""
Split node at index i.
:param i: index
:param node: another child
:return: splitted node
"""
new_node = Node(self.string, self.start, self.start + i)
self.start += i
new_node.next = {self.string[self.start]: self,
self.string[node.start]: node}
return new_node
def add_suffix_link(self, destination):
"""
Add suffex link.
"""
self.suffix_link = destination
# destination.reverse_suffix_link.append(self)
def update_counter(self):
"""
Update child counter.
:return: number of children
"""
if not self.next:
self.is_leaf = True
self.counter = 1
else:
self.is_leaf = False
self.counter = sum([child.update_counter() for child in self.next.values()])
return self.counter
def visualize(self, current_depth=0):
"""
Visualize recursively current node and children.
:param current_depth: current node depth.
"""
if current_depth == 0:
print("="*20)
if len(self) > 10:
print("\t" * current_depth + "...")
else:
print("\t" * current_depth + str(self))
for child in self.next.values():
child.visualize(current_depth + 1)
class Cursor:
def __init__(self, node, branch, length, root=None):
self.node = node
self.branch = branch
self.length = length
self.root = root
def current_char(self):
return str(self.node.next[self.branch])[self.length]
def move_forward(self, char):
self.length += 1
if self.length >= len(self.node.next[self.branch]):
self.node = self.node.next[self.branch]
self.branch = char
self.length -= len(self.node)
def move_front_forward(self, char):
assert self.root is not None
try:
self.node = self.node.suffix_link
except AttributeError:
assert self.length > 0
self.length -= 1
self.branch = char
self.node = self.root
@property
def current_node(self):
return self.node.next[self.branch]
def print(self):
self.node.visualize()
print("Branch: %s, length:%d" % (self.branch, self.length))
class SuffixTree:
def __init__(self, string):
self.string = string
self.root = Node(string, 0, 0)
self.cursor = self.root
self.branch = string[0]
self.length = 0
self.remaining = 0
self.construct(string)
def construct(self, string):
update_interval = len(string)//20
for (i, char) in enumerate(string):
if i % update_interval == 1:
print("|", end="", flush=True)
self.add_char(i, char)
print()
def add_char(self, i, char):
self.remaining += 1
prev = None
while self.remaining > 0:
if self.length == 0:
self.branch = char
try:
branch = self.cursor.next[self.branch]
except KeyError:
branch = self.cursor.next[self.branch] = Node(self.string, i, len(self.string))
if prev is not None:
prev.add_suffix_link(self.cursor)
prev = None
else:
if self.length >= len(branch):
self.length -= len(branch)
self.branch = self.string[i - self.length]
self.cursor = branch
continue
if self.string[branch.start + self.length] == char:
if prev is not None and self.cursor is not self.root:
prev.add_suffix_link(self.cursor)
self.length += 1
return
new_node = Node(self.string, i, len(self.string))
branch = self.cursor.next[self.branch] = branch.split(self.length, new_node)
if prev:
prev.add_suffix_link(branch)
prev = branch
self.remaining -= 1
if self.cursor is self.root:
if self.length > 0:
self.length -= 1
self.branch = self.string[i - self.remaining + 1]
else:
try:
self.cursor = self.cursor.suffix_link
except AttributeError:
self.cursor = self.root
def update_counter(self):
self.root.update_counter()
def query_cursor(self, string):
assert string
cursor = Cursor(self.root, string[0], 0, self.root)
for char in string[1:]:
cursor.move_forward(char)
return cursor
def query(self, string):
cursor = self.query_cursor(string)
return cursor.current_node