-
Notifications
You must be signed in to change notification settings - Fork 41
/
parse.py
40 lines (30 loc) · 1.02 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import sys
import re
class Parser:
    """Split text into sentences and feed fixed-size token windows to a database.

    Each sentence is padded with ``depth - 1`` start symbols in front and
    ``depth - 1`` end symbols behind, where ``depth`` comes from
    ``db.get_depth()``. Every contiguous window of ``depth`` tokens is then
    passed to ``db.add_word``.
    """

    SENTENCE_START_SYMBOL = '^'
    SENTENCE_END_SYMBOL = '$'

    def __init__(self, name, db, sentence_split_char='\n', word_split_char=''):
        """Store the parser configuration.

        Args:
            name: Label stored on the instance; this class does not read it.
            db: Object providing ``get_depth()``, ``add_word(tokens)`` and
                ``commit()``.
            sentence_split_char: Separator passed to ``str.split`` to cut the
                input text into sentences.
            word_split_char: Separator passed to ``str.split`` to cut a
                sentence into tokens. When empty (the default), the sentence
                is lower-cased and every character becomes a token.
        """
        self.name = name
        self.db = db
        self.sentence_split_char = sentence_split_char
        self.word_split_char = word_split_char
        # Raw string: '\s' in a plain literal is an invalid escape sequence
        # and triggers a warning on modern Python versions.
        self.whitespace_regex = re.compile(r'\s+')

    def parse(self, txt):
        """Tokenise ``txt`` and add every ``depth``-sized window to the db.

        Runs of whitespace inside a sentence are collapsed to a single space
        and the sentence is stripped before tokenising. A progress counter is
        printed to stdout after every 1000 sentences.

        Args:
            txt: The text to parse.
        """
        depth = self.db.get_depth()
        # The padding is identical for every sentence, so build it once.
        start_pad = [Parser.SENTENCE_START_SYMBOL] * (depth - 1)
        end_pad = [Parser.SENTENCE_END_SYMBOL] * (depth - 1)

        sentences = txt.split(self.sentence_split_char)
        for count, sentence in enumerate(sentences, 1):
            sentence = self.whitespace_regex.sub(" ", sentence).strip()

            if self.word_split_char:
                tokens = sentence.split(self.word_split_char)
            else:
                # Character mode: note that only this mode lower-cases.
                tokens = list(sentence.lower())

            words = start_pad + tokens + end_pad

            # Slide a window of `depth` tokens across the padded sentence.
            for n in range(len(words) - depth + 1):
                self.db.add_word(words[n:n + depth])

            if count % 1000 == 0:
                print(count)
                sys.stdout.flush()

        # NOTE(review): committing once, after all sentences have been added;
        # confirm this matches the intended commit granularity.
        self.db.commit()