# -*- coding: utf-8 -*-
"""nmt_model_keras.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/11gQLXFCtY8g2IBwMBPUU4ymRSNz6NlPK
"""
from keras.layers import Embedding, LSTM, Dropout, Dense, Layer
from keras import Model, Input
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
import keras.backend as K
import collections
import numpy as np
import time
from nltk.translate.bleu_score import corpus_bleu
class LanguageDict():
    def __init__(self, sents):
        word_counter = collections.Counter(tok.lower() for sent in sents for tok in sent)
        self.vocab = []
        self.vocab.append('<pad>')  # zero-padding token
        self.vocab.append('<unk>')  # unknown-word token
        # keep only words that appear more than 10 times in the corpus
        self.vocab.extend([t for t, c in word_counter.items() if c > 10])
        self.word2ids = {w: id for id, w in enumerate(self.vocab)}
        self.UNK = self.word2ids['<unk>']
        self.PAD = self.word2ids['<pad>']
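# Illustrative usage sketch (the toy sentences below are made up, not corpus data):
# indices 0 and 1 are always reserved for '<pad>' and '<unk>', and any token outside the
# vocabulary should be looked up with .get(tok, d.UNK) so it falls back to the unknown id,
# as load_dataset() does below. In this toy example every word occurs 10 times or fewer,
# so all of them map to the <unk> id.
#
#   toy_dict = LanguageDict([['hello', 'world'], ['hello', 'again']])
#   toy_dict.word2ids['<pad>']                           # -> 0
#   toy_dict.word2ids.get('hello', toy_dict.UNK)         # -> 1 (the <unk> id)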
def load_dataset(source_path, target_path, max_num_examples=30000):
    ''' This helper method reads the source and target files, loads up to max_num_examples
    sentences, splits them into train, development and test sets, and returns the relevant data.
    Inputs:
      source_path (string): the full path to the source data, SOURCE_PATH
      target_path (string): the full path to the target data, TARGET_PATH
    Returns:
      train_data (list): a list of 3 elements: source words, target words, target word labels
      dev_data (list): a list of 2 elements - source words, target word labels
      test_data (list): a list of 2 elements - source words, target word labels
      source_dict (LanguageDict): a LanguageDict object for the source language, Vietnamese.
      target_dict (LanguageDict): a LanguageDict object for the target language, English.
    '''
    # source_lines/target_lines are lists of strings such that each string is a sentence in the
    # corresponding file. len(source_lines) == len(target_lines) <= max_num_examples
    source_lines = open(source_path).readlines()
    target_lines = open(target_path).readlines()
    assert len(source_lines) == len(target_lines)
    if max_num_examples > 0:
        max_num_examples = min(len(source_lines), max_num_examples)
        source_lines = source_lines[:max_num_examples]
        target_lines = target_lines[:max_num_examples]
    # strip trailing/leading whitespace and tokenize each sentence
    source_sents = [[tok.lower() for tok in sent.strip().split(' ')] for sent in source_lines]
    target_sents = [[tok.lower() for tok in sent.strip().split(' ')] for sent in target_lines]
    # for the target sentences, add <start> and <end> tokens to each sentence
    for sent in target_sents:
        sent.append('<end>')
        sent.insert(0, '<start>')
    # create the LanguageDict objects for each language
    source_lang_dict = LanguageDict(source_sents)
    target_lang_dict = LanguageDict(target_sents)

    # one tenth of the sentences; we'll use this to split into train/dev/test
    unit = len(source_sents) // 10
    # convert each source sentence to a list of word ids
    source_words = [[source_lang_dict.word2ids.get(tok, source_lang_dict.UNK) for tok in sent] for sent in source_sents]
    # 8 parts (80%) of the sentences go to the training data; pad up to the maximum sentence length
    source_words_train = pad_sequences(source_words[:8 * unit], padding='post')
    # 1 part (10%) of the sentences goes to the dev data; pad up to the maximum sentence length
    source_words_dev = pad_sequences(source_words[8 * unit:9 * unit], padding='post')
    # 1 part (10%) of the sentences goes to the test data; pad up to the maximum sentence length
    source_words_test = pad_sequences(source_words[9 * unit:], padding='post')

    eos = target_lang_dict.word2ids['<end>']
    # for each target sentence, get the word ids for the tokens from <start> up to but not including <end>
    target_words = [[target_lang_dict.word2ids.get(tok, target_lang_dict.UNK) for tok in sent[:-1]] for sent in target_sents]
    # select the training set and pad the sentences
    target_words_train = pad_sequences(target_words[:8 * unit], padding='post')
    # the label for each target word is the next word after it
    target_words_train_labels = [sent[1:] + [eos] for sent in target_words[:8 * unit]]
    # pad the labels. Dim = [num_sents, max_sent_length]
    target_words_train_labels = pad_sequences(target_words_train_labels, padding='post')
    # expand dimensions. Dim = [num_sents, max_sent_length, 1]
    target_words_train_labels = np.expand_dims(target_words_train_labels, axis=2)
    # get the labels for the dev and test data. No decoder inputs are needed here, and no need to expand dimensions
    target_words_dev_labels = pad_sequences([sent[1:] + [eos] for sent in target_words[8 * unit:9 * unit]], padding='post')
    target_words_test_labels = pad_sequences([sent[1:] + [eos] for sent in target_words[9 * unit:]], padding='post')

    # we have our data
    train_data = [source_words_train, target_words_train, target_words_train_labels]
    dev_data = [source_words_dev, target_words_dev_labels]
    test_data = [source_words_test, target_words_test_labels]
    return train_data, dev_data, test_data, source_lang_dict, target_lang_dict
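# Usage sketch (the file paths below are placeholders, not paths that ship with this
# notebook export): the returned arrays are integer id matrices, zero-padded on the right.
#
#   train_data, dev_data, test_data, src_dict, tgt_dict = load_dataset(
#       'data/train.vi', 'data/train.en', max_num_examples=30000)
#   source_words_train, target_words_train, target_words_train_labels = train_data
#   # source_words_train:        [num_train_sents, max_source_len]
#   # target_words_train:        [num_train_sents, max_target_len]     (starts with <start>)
#   # target_words_train_labels: [num_train_sents, max_target_len, 1]  (shifted left by one, ends with <end>)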
class AttentionLayer(Layer):
    def compute_mask(self, inputs, mask=None):
        if mask is None:
            return None
        return mask[1]

    def compute_output_shape(self, input_shape):
        return (input_shape[1][0], input_shape[1][1], input_shape[1][2] * 2)

    def call(self, inputs, mask=None):
        encoder_outputs, decoder_outputs = inputs
        """
        Task 3 attention
        Start
        """
        # This attention mechanism scores the encoder outputs against the decoder outputs
        # with a matrix multiplication, which requires transposing the last two dimensions
        # of the decoder outputs.
        decoder_outputs_Trans = K.permute_dimensions(decoder_outputs, (0, 2, 1))
        # luong_score is computed from the encoder outputs and the transposed decoder outputs,
        # then normalised with a softmax over the encoder-step axis
        luong_score = K.batch_dot(encoder_outputs, decoder_outputs_Trans)
        luong_score = K.softmax(luong_score, 1)
        # expand dimensions so both tensors broadcast to the same shape, then weight the
        # encoder outputs by the attention scores
        luong_score = K.expand_dims(luong_score, -1)
        encoder_outputs = K.expand_dims(encoder_outputs, 2)
        encoder_vector = encoder_outputs * luong_score
        # sum over the encoder steps to get one context vector per decoder step
        encoder_vector = K.sum(encoder_vector, axis=1, keepdims=False)
        """
        End Task 3
        """
        new_decoder_outputs = K.concatenate([decoder_outputs, encoder_vector])
        return new_decoder_outputs
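# Shape walk-through for AttentionLayer.call(), inferred from the code above
# (B = batch size, Te = encoder steps, Td = decoder steps, H = hidden size):
#   encoder_outputs: [B, Te, H], decoder_outputs: [B, Td, H]
#   luong_score = batch_dot(encoder_outputs, decoder_outputs^T): [B, Te, Td],
#       softmax-normalised over the Te axis
#   encoder_vector = sum over Te of (encoder_outputs * luong_score): [B, Td, H]
#   returned concatenation [decoder_outputs, encoder_vector]: [B, Td, 2H],
#       which matches compute_output_shape().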
class NmtModel(object):
    def __init__(self, source_dict, target_dict, use_attention):
        ''' The model initialization function initializes network parameters.
        Inputs:
          source_dict (LanguageDict): a LanguageDict object for the source language, Vietnamese.
          target_dict (LanguageDict): a LanguageDict object for the target language, English.
          use_attention (bool): if True, use attention.
        Returns:
          None.
        '''
        # the number of hidden units used by the LSTM
        self.hidden_size = 200
        # the size of the word embeddings being used
        self.embedding_size = 100
        # the dropout rate for the hidden layers
        self.hidden_dropout_rate = 0.2
        # the dropout rate for the word embeddings
        self.embedding_dropout_rate = 0.2
        # batch size
        self.batch_size = 100
        # the maximum length of the target sentences
        self.max_target_step = 30
        # vocab size for source and target; we'll use everything we receive
        self.vocab_target_size = len(target_dict.vocab)
        self.vocab_source_size = len(source_dict.vocab)
        # instances of the dictionaries
        self.target_dict = target_dict
        self.source_dict = source_dict
        # special tokens to indicate sentence starts and ends
        self.SOS = target_dict.word2ids['<start>']
        self.EOS = target_dict.word2ids['<end>']
        # whether to use attention
        self.use_attention = use_attention
        print("number of tokens in source: %d, number of tokens in target: %d" % (self.vocab_source_size, self.vocab_target_size))
    def build(self):
        # ------------------------- Train Models ------------------------------
        source_words = Input(shape=(None,), dtype='int32')
        target_words = Input(shape=(None,), dtype='int32')
        """
        Task 1 encoder
        Start
        """
        # The train encoder
        # (a.) Create two randomly initialized embedding lookups, one for the source, another for the target.
        print('Task 1(a): Creating the embedding lookups...')
        # Two embedding layers, one for the source language and one for the target language.
        # mask_zero is True so that the zero padding is masked out; the input dimension is the
        # source/target vocabulary size and the output dimension is the embedding size defined
        # in __init__.
        embedding_source = Embedding(self.vocab_source_size, self.embedding_size, name='embedding_source', mask_zero=True)
        embedding_target = Embedding(self.vocab_target_size, self.embedding_size, name='embedding_target', mask_zero=True)
        # (b.) Look up the embeddings for source words and for target words. Apply dropout to each encoded input.
        print('\nTask 1(b): Looking up source and target words...')
        # A single dropout layer (with the embedding dropout rate) is applied to both the
        # source and the target embeddings.
        dropout_Layer = Dropout(rate=self.embedding_dropout_rate)
        source_words_embeddings = embedding_source(source_words)
        source_words_embeddings = dropout_Layer(source_words_embeddings)
        target_words_embeddings = embedding_target(target_words)
        target_words_embeddings = dropout_Layer(target_words_embeddings)
        # (c.) An encoder LSTM() with return_sequences set to True
        print('\nTask 1(c): Creating an encoder')
        # The encoder LSTM returns both the full output sequence and its final hidden/cell states.
        encoder_lstm = LSTM(self.hidden_size, recurrent_dropout=self.hidden_dropout_rate, return_sequences=True, return_state=True)
        encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(source_words_embeddings)
        """
        End Task 1
        """
        encoder_states = [encoder_state_h, encoder_state_c]

        # The train decoder
        decoder_lstm = LSTM(self.hidden_size, recurrent_dropout=self.hidden_dropout_rate, return_sequences=True, return_state=True)
        decoder_outputs_train, _, _ = decoder_lstm(target_words_embeddings, initial_state=encoder_states)
        if self.use_attention:
            decoder_attention = AttentionLayer()
            decoder_outputs_train = decoder_attention([encoder_outputs, decoder_outputs_train])
        decoder_dense = Dense(self.vocab_target_size, activation='softmax')
        decoder_outputs_train = decoder_dense(decoder_outputs_train)

        # compiling the train model
        adam = Adam(lr=0.01, clipnorm=5.0)
        self.train_model = Model([source_words, target_words], decoder_outputs_train)
        self.train_model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        # at this point you can print the model summary for the train model
        print('\t\t\t\t\t\t Train Model Summary.')
        self.train_model.summary()

        # ------------------------- Inference Models ------------------------------
        # The inference encoder
        self.encoder_model = Model(source_words, [encoder_outputs, encoder_state_h, encoder_state_c])
        # at this point you can print the summary for the encoder model
        print('\t\t\t\t\t\t Inference Time Encoder Model Summary.')
        self.encoder_model.summary()
        # The inference decoder
        # specifying the inputs to the decoder
        decoder_state_input_h = Input(shape=(self.hidden_size,))
        decoder_state_input_c = Input(shape=(self.hidden_size,))
        encoder_outputs_input = Input(shape=(None, self.hidden_size,))
        """
        Task 2 decoder for inference
        Start
        """
        # Task 2 (a.) Get the decoded outputs
        print('\n Putting together the decoder states')
        # the initial states for the inference decoder are the hidden and cell states
        # produced by the inference encoder
        decoder_states = [decoder_state_input_h, decoder_state_input_c]
        # reuse the training decoder LSTM: it takes the target word embeddings and the list of
        # decoder states, and returns the decoder outputs plus the updated h and c states
        decoder_outputs_test, decoder_state_output_h, decoder_state_output_c = decoder_lstm(target_words_embeddings, initial_state=decoder_states)
        # Task 2 (b.) Add attention if required
        # Since we experiment both with and without attention, the same AttentionLayer is applied
        # here, taking encoder_outputs_input and decoder_outputs_test as inputs.
        if self.use_attention:
            decoder_outputs_test = decoder_attention([encoder_outputs_input, decoder_outputs_test])
        # Task 2 (c.) pass the decoder_outputs_test (with or without attention) to the decoder dense layer
        decoder_outputs_test = decoder_dense(decoder_outputs_test)
        """
        End Task 2
        """
        # put the model together
        self.decoder_model = Model([target_words, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input],
                                   [decoder_outputs_test, decoder_state_output_h, decoder_state_output_c])
        # you can now view the model summary
        print('\t\t\t\t\t\t Decoder Inference Model summary')
        self.decoder_model.summary()
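    # Note on the two decoders above: train_model consumes the full (shifted) target
    # sequence at once (teacher forcing), while decoder_model is driven one step at a
    # time in eval() below, feeding back the argmax prediction together with the
    # updated LSTM states and the (fixed) encoder outputs.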
    def time_used(self, start_time):
        curr_time = time.time()
        used_time = curr_time - start_time
        m = used_time // 60
        s = used_time - 60 * m
        return "%d m %d s" % (m, s)
    def train(self, train_data, dev_data, test_data, epochs):
        start_time = time.time()
        for epoch in range(epochs):
            print("Starting training epoch {}/{}".format(epoch + 1, epochs))
            epoch_time = time.time()
            source_words_train, target_words_train, target_words_train_labels = train_data
            self.train_model.fit([source_words_train, target_words_train], target_words_train_labels, batch_size=self.batch_size)
            print("Time used for epoch {}: {}".format(epoch + 1, self.time_used(epoch_time)))
            dev_time = time.time()
            print("Evaluating on dev set after epoch {}/{}:".format(epoch + 1, epochs))
            self.eval(dev_data)
            print("Time used to evaluate on dev set: {}".format(self.time_used(dev_time)))
        print("Training finished!")
        print("Time used for training: {}".format(self.time_used(start_time)))
        print("Evaluating on test set:")
        test_time = time.time()
        self.eval(test_data)
        print("Time used to evaluate on test set: {}".format(self.time_used(test_time)))
    def get_target_sentences(self, sents, vocab, reference=False):
        str_sents = []
        num_sent, max_len = sents.shape
        for i in range(num_sent):
            str_sent = []
            for j in range(max_len):
                t = sents[i, j].item()
                if t == self.SOS:
                    continue
                if t == self.EOS:
                    break
                str_sent.append(vocab[t])
            if reference:
                str_sents.append([str_sent])
            else:
                str_sents.append(str_sent)
        return str_sents
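    # When reference=True each sentence is wrapped in an extra list because nltk's
    # corpus_bleu expects, for every hypothesis, a list of reference sentences (here
    # there is exactly one reference per source sentence).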
    def eval(self, dataset):
        # get the source words and target word labels for the eval dataset
        source_words, target_words_labels = dataset
        vocab = self.target_dict.vocab
        # using the same encoder network used during training, encode the source sentences
        encoder_outputs, state_h, state_c = self.encoder_model.predict(source_words, batch_size=self.batch_size)
        # for max_target_step steps, feed the current step's target words into the decoder
        predictions = []
        step_target_words = np.ones([source_words.shape[0], 1]) * self.SOS
        for _ in range(self.max_target_step):
            step_decoder_outputs, state_h, state_c = self.decoder_model.predict([step_target_words, state_h, state_c, encoder_outputs], batch_size=self.batch_size)
            step_target_words = np.argmax(step_decoder_outputs, axis=2)
            predictions.append(step_target_words)
        # predictions is a list of max_target_step arrays, each of shape [batch_size, 1];
        # concatenating along axis 1 and calling get_target_sentences() recovers the batch_size sentences
        candidates = self.get_target_sentences(np.concatenate(predictions, axis=1), vocab)
        references = self.get_target_sentences(target_words_labels, vocab, reference=True)
        # score using the nltk corpus BLEU scorer
        score = corpus_bleu(references, candidates)
        print("Model BLEU score: %.2f" % (score * 100.0))
        print("===== PRINTING OUTPUT ====")
        # print the predicted sentence and the reference sentence for a few samples
        for i in range(5):
            print("{} Sample".format(i + 1))
            print("Predicted Sentence: " + " ".join(candidates[i]))
            print("Actual Sentence: " + " ".join(references[i][0]))
            print("========================")
def main(source_path, target_path, use_attention):
    max_example = 30000
    print('loading dictionaries')
    train_data, dev_data, test_data, source_dict, target_dict = load_dataset(source_path, target_path, max_num_examples=max_example)
    print("read %d/%d/%d train/dev/test examples" % (len(train_data[0]), len(dev_data[0]), len(test_data[0])))
    model = NmtModel(source_dict, target_dict, use_attention)
    model.build()
    model.train(train_data, dev_data, test_data, 10)
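# Entry-point sketch (not part of the original notebook export): a minimal way to run the
# whole pipeline as a script. The two data paths below are placeholders/assumptions; point
# them at your Vietnamese source file and English target file, and flip use_attention to
# compare the model with and without attention.
if __name__ == '__main__':
    main('data/train.vi', 'data/train.en', use_attention=True)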