rnnlm_batch.py
from collections import Counter, defaultdict
from itertools import count
import random
import time
import _gdynet as dy  # GPU module of the old (Python 2-era) DyNet package
import numpy as np
# format of files: each line is a plain sentence "word1 word2 ..."
train_file = "C:\\corpora\\long30k.txt"
test_file = "C:\\corpora\\long30k.txt"
MB_SIZE = 100
class Vocab:
    def __init__(self, w2i=None):
        if w2i is None: w2i = defaultdict(count(0).next)
        self.w2i = dict(w2i)
        self.i2w = {i: w for w, i in w2i.iteritems()}

    @classmethod
    def from_corpus(cls, corpus):
        w2i = defaultdict(count(0).next)
        for sent in corpus:
            [w2i[word] for word in sent]
        return Vocab(w2i)

    def size(self): return len(self.w2i.keys())
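
# Illustrative sketch (not part of the original script): how Vocab assigns ids.
# The count(0).next default factory hands out ids in first-seen order, so a
# toy corpus gets contiguous ids starting at 0.
_toy_vocab = Vocab.from_corpus([["<start>", "a", "b", "a", "<stop>"]])
assert _toy_vocab.w2i["<start>"] == 0 and _toy_vocab.w2i["b"] == 2
assert _toy_vocab.size() == 4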
def read(fname):
    """
    Read a file where each line is of the form "word1 word2 ..."
    Yields reversed lists of the form [<stop>, ..., word2, word1, <start>]
    """
    line_num = 0
    with open(fname) as fh:
        for line in fh:
            line_num += 1
            sent = line.lower().strip().split()
            #sent.append("<stop>")
            sent = ["<start>"] + sent + ["<stop>"]
            sent.reverse()
            # cap the corpus size and drop very long sentences
            if line_num <= 2775000 and len(sent) <= 42:
                yield sent
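
# Illustrative note: read() lowercases each line, wraps it in <start>/<stop>,
# and reverses it, so the model is effectively trained right-to-left, e.g.
#   "The results" -> ["<stop>", "results", "the", "<start>"]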
train = list(read(train_file))
test = list(read(test_file))

words = []
wc = Counter()
for sent in train:
    for w in sent:
        words.append(w)
        wc[w] += 1

vw = Vocab.from_corpus([words])
STOP = vw.w2i["<stop>"]
START = vw.w2i["<start>"]
nwords = vw.size()
LAYERS = 2
INPUT_DIM = 200 #50 #256
HIDDEN_DIM = 300 # 50 #1024
print "words", nwords
# DyNet Starts
dy.init()
model = dy.Model()
#trainer = dy.AdamTrainer(model)
trainer = dy.SimpleSGDTrainer(model)
# Lookup parameters for word embeddings
WORDS_LOOKUP = model.add_lookup_parameters((nwords, INPUT_DIM))
# Word-level LSTM (LAYERS layers, INPUT_DIM inputs, HIDDEN_DIM outputs)
RNN = dy.LSTMBuilder(LAYERS, INPUT_DIM, HIDDEN_DIM, model)
# Softmax weights/biases on top of LSTM outputs
W_sm = model.add_parameters((nwords, HIDDEN_DIM))
b_sm = model.add_parameters(nwords)
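# Illustrative note on shapes (derived from the dimensions above): each LSTM
# output is a HIDDEN_DIM-dimensional vector h, and the softmax layer computes
# W_sm * h + b_sm, an nwords-dimensional score vector, e.g.
#   h: (300,), W_sm: (nwords, 300), b_sm: (nwords,) -> score: (nwords,)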
# Build the language model graph
def calc_lm_loss(sents):
    dy.renew_cg()
    # parameters -> expressions
    W_exp = dy.parameter(W_sm)
    b_exp = dy.parameter(b_sm)

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(vw.w2i[sent[i]] if len(sent) > i else STOP) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<start>"
    init_ids = [START] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = W_exp * s.output() + b_exp
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            # batch dimension is the actual batch size, not MB_SIZE
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
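
# Illustrative sketch (not used by training): what the per-step mask does.
# For a hypothetical batch of 3 sentences with lengths [4, 3, 2], the mask at
# time step 2 is [1, 1, 0]; multiplying the batched per-sentence losses by it
# zeroes the contribution of the sentence that has already ended.
_demo_step_losses = np.array([1.7, 2.3, 0.9])  # hypothetical per-sentence losses
_demo_step_mask = np.array([1.0, 1.0, 0.0])
assert (_demo_step_losses * _demo_step_mask)[2] == 0.0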
num_tagged = cum_loss = 0
# Sort training sentences by length (descending) so each minibatch groups similar-length sentences, then compute minibatch start offsets
train.sort(key=lambda x: -len(x))
#test.sort(key=lambda x: -len(x))
train_order = [x*MB_SIZE for x in range(len(train)/MB_SIZE)]
#test_order = [x*MB_SIZE for x in range(len(test)/MB_SIZE)]
#print "Train order", train_order
start_time = time.time()
# Perform training
for ITER in xrange(11):
    random.shuffle(train_order)
    print "Shuffled"
    for i, sid in enumerate(train_order, 1):
        print "Batch", i, "of", len(train)/MB_SIZE, "iter", str(ITER)
        loss_exp, mb_words = calc_lm_loss(train[sid:sid+MB_SIZE])
        cum_loss += loss_exp.scalar_value()
        num_tagged += mb_words
        loss_exp.backward()
        trainer.update()
    print "epoch %r finished" % ITER
    model.save("C:\\corpora\\back_batch_bigmodel.txt", [RNN, WORDS_LOOKUP, W_sm, b_sm])
    # Log the line
    log_file = "C:\\corpora\\log.txt"
    logline = "iteration back" + " " + str(ITER) + "\n"
    with open(log_file, "a") as myfile:
        myfile.write(logline)
    trainer.update_epoch(1.0)
'''
(RNN, WORDS_LOOKUP, W_sm, b_sm) = model.load("C:\\corpora\\batch_bigmodel.txt")

def predictNextWord(sentence, builder, wlookup, mR, mB):
    dy.renew_cg()
    init_state = builder.initial_state()
    R = dy.parameter(mR)
    bias = dy.parameter(mB)
    state = init_state
    for cw in sentence:
        # assume word is already a word-id
        x_t = dy.lookup(wlookup, int(cw))
        state = state.add_input(x_t)
    y_t = state.output()
    r_t = bias + (R * y_t)
    prob = dy.softmax(r_t)
    return prob

sentence = "the results clearly"
start_sentence = ['<start>'] + sentence.split(" ")
isent = [vw.w2i[w] for w in start_sentence]
print "ISENT", isent
distribution = predictNextWord(isent, RNN, WORDS_LOOKUP, W_sm, b_sm).npvalue()
# print the 20 most probable next words; argsort avoids the index shift that
# repeatedly deleting the argmax from the distribution would introduce
for i, idx in enumerate(np.argsort(-distribution)[:20]):
    print i, vw.i2w[idx]

sentence = "for the collocation extraction"
start_sentence = ['<start>'] + sentence.split(" ")
isent = [vw.w2i[w] for w in start_sentence]
print "ISENT", isent
distribution = predictNextWord(isent, RNN, WORDS_LOOKUP, W_sm, b_sm).npvalue()
for i, idx in enumerate(np.argsort(-distribution)[:20]):
    print i, vw.i2w[idx]

sentence = "prioritizing these alerts will help security personnel focus their efforts on the"
start_sentence = ['<start>'] + sentence.split(" ")
isent = [vw.w2i[w] for w in start_sentence]
print "ISENT", isent
distribution = predictNextWord(isent, RNN, WORDS_LOOKUP, W_sm, b_sm).npvalue()
for i, idx in enumerate(np.argsort(-distribution)[:20]):
    print i, vw.i2w[idx]
'''
end_time = time.time()
print "Time:", end_time-start_time