-
Notifications
You must be signed in to change notification settings - Fork 0
/
pre-process.py
51 lines (42 loc) · 1.11 KB
/
pre-process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
def load_doc(filename, encoding=None):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding, exactly as before.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
# Load the corpus and reduce it to a flat list of lowercase word tokens.
raw_text = load_doc('data/shakespeare.txt')

# Remove number of sonnet: every digit character is dropped, so the numeric
# sonnet headings disappear (digits embedded in words are removed as well).
raw_text = ''.join(ch for ch in raw_text if not ch.isdigit())
raw_text = raw_text.lower()

# Split on each whitespace character and discard the empty strings produced
# by consecutive whitespace in one pass. The raw string avoids the invalid
# "\s" escape warning, and the filter replaces the quadratic
# `while "" in ...: remove("")` loop. The earlier split/re-join step is gone
# because it only converted whitespace to spaces before this same split.
raw_text = [tok for tok in re.split(r"\s", raw_text) if tok]

# Build every window of `length` consecutive tokens: window k holds
# raw_text[k:k + length]. The list is empty when there are `length` or
# fewer tokens.
length = 40
sequences = [raw_text[i - length:i] for i in range(length, len(raw_text))]
import nltk
# Fetch the NLTK resources at run time. 'cmudict' backs the cmudict corpus
# reader used below; 'punkt' is not referenced by the code in this file.
nltk.download('punkt')
nltk.download('cmudict')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import cmudict

# Pronunciation lookup loaded from the CMU dictionary corpus.
d_pronoun = cmudict.dict()

# A token is either a single word character, or a run that starts and ends
# with a word character and may contain word characters, apostrophes and
# hyphens in between. Written as a raw string so the "\w" escapes are no
# longer invalid string escapes; the compiled pattern is unchanged.
# NOTE(review): the "|" inside the character class is a literal pipe, not
# alternation -- kept as-is to preserve the existing tokenisation.
tokenizer = RegexpTokenizer(r"\w[\w|'|-]*\w|\w")

# Tokenise the corpus line by line, keeping only lines with 2+ tokens.
line_tokens = []
# The context manager closes the file; the handle was previously left open.
with open('data/shakespeare.txt') as f:
    for line in f:
        line = line.strip()
        # Skip blank lines and the all-digit sonnet-number lines.
        if not line or line.isdigit():
            continue
        line = line.lower()
        tokens = tokenizer.tokenize(line)
        if len(tokens) > 1:
            line_tokens.append(tokens)