# -*- coding: utf-8 -*-
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector, Dropout, Dense
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
import numpy as np
import random, sys
from keras.callbacks import EarlyStopping
import re
import itertools
import shutil
from textblob import TextBlob
from textblob import WordList
import time
from nltk.corpus import wordnet as wn
import nltk
from keras.preprocessing import sequence
from keras.datasets import imdb
import string
"""
The Idea is to first read all of the sentences of a book. Afterwards, the model should be able to write a sentence after it has been fed a previous sentence.
In other words, if you feed the model "Stars are very bright in the night sky.", the model should write something like, "They shine all across the universe."
"Sequence to Sequence Learning with Neural Networks"
http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
Theoretically it introduces shorter term dependencies between source and target.
"""
'''Hyperparameters and special tokens'''
number_of_iterations = 400 #max number of iterations/epochs you want to run for.
start_validating_at_iteration = 0 #which iteration to start checking for overfitting
patience = 10 #number of epochs the validation loss may fail to improve before training stops
hidden_variables = 256 #number of hidden variables in all layers, keep this between 256 and 512
dropout = 0.5 #the dropout between each layer -- recommendation is 0.5
embedding_size = 256 #this should be 128 or 256 or 512
hidden_size = 256
batch_size = 1024
decoder_layers = 3 #number of lstm layers for decoding -- try 4, maybe eventually up to 8.
time_steps = 50 #number to pad the vector by. Should be the max number of words in a sentence.
vocabulary_size = 10000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
'''*************************************PROCESSING ASTRONOMY BOOK TEXT**************************'''
# Read the data and append SENTENCE_START and SENTENCE_END tokens
print ("Reading text file...")
text = open('AstronomyWikiBook.txt').read()
'''filter out Wikipedia reference sections, footnotes, and other boilerplate from the raw text'''
text = re.sub('(Further reading|Bibliography|Books|External links|See also|Sources|References|Cited texts)\n\n(- .+\n| .+\n)*', '',text)
text = re.sub('(Further reading|Bibliography|Books|External links|See also|Sources|References|Cited texts)\n(- .+\n| .+\n)*', '',text)
text = re.sub('(Footnotes|References|Notes)\n\n\[\d*\].(.+\n|.+\n)*', '',text)
text = re.sub('\[\d*\].(.+\n|.+\n)*', '',text)
text = re.sub('Main article:.+\n', '',text)
text = re.sub('(²|³|¹|⁴|⁵|⁶|⁷|⁸|⁹|⁰)', '',text)
text = re.sub('( | | | )', '',text)
text = re.sub('(\n\n\n|\n\n\n\n|\n\n\n\n\n)', '\n\n',text)
text = re.sub('\n\n', ' ~ ',text)
text = re.sub('\n', ' ',text)
print ('-'*50)
sentences = (nltk.sent_tokenize(text.decode('utf-8').lower()))
# Append SENTENCE_START and SENTENCE_END
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
#you can adjust the sentence start and end
print ("Parsed %d sentences." % (len(sentences)))
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
# print (tokenized_sentences) #this is a list of lists (one token list per sentence)
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print ("Found %d unique words tokens." % len(word_freq.items()))
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1) #a list of (word, count) pairs, from most to least frequent
index_to_word = ['padzero'] #index 0 is reserved for the padding value
for x in vocab:
    index_to_word.append(x[0]) #words are indexed in order of how frequently they are used
index_to_word.append(unknown_token) #one shared index for all out-of-vocabulary words
print('-' * 50)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
print()
print()
print ("Using vocabulary size %d." % vocabulary_size)
print ("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
print ("\nExample sentence: '%s'" % sentences[0])
print ("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])
'''X_train holds each sentence with its word order reversed;
y_train holds the sentence that comes immediately after it'''
X_train_array = ([[word_to_index[w] for w in sent[::-1]] for sent in tokenized_sentences[:-1]])
#the ::-1 is to reverse the list
X_train = sequence.pad_sequences(X_train_array, maxlen=time_steps)
y_train_array = ([[word_to_index[w] for w in sent] for sent in tokenized_sentences[1:]])
y_train = sequence.pad_sequences(y_train_array, maxlen=time_steps)
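#Sanity check (added): confirm the padded shapes and that the first training pair really is
#(reversed source sentence, following sentence). Index 0 is the 'padzero' padding entry.
print ('X_train shape: %s, y_train shape: %s' % (X_train.shape, y_train.shape))
print ('first source sentence (un-reversed): %s' % ' '.join([index_to_word[i] for i in X_train[0] if i != 0][::-1]))
print ('first target sentence: %s' % ' '.join([index_to_word[i] for i in y_train[0] if i != 0]))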
print('* ' * 50)
'''------------------------------KERAS MODEL DESIGN HERE ----------------------------'''
print('Building model...')
increased_vocab = len(index_to_word) + 1 #padzero + (vocabulary_size-1) words + UNKNOWN_TOKEN, plus one spare row
print ('the increased vocab is %d' % increased_vocab)
model = Sequential()
#it's important to realize that this embedding layer adds an extra dimension.
model.add(Embedding(increased_vocab, embedding_size, input_length=time_steps, mask_zero=True))
#the masking will block out the zeros from padding -- really useful for recurrent layers
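#shape note (added): the Embedding input is (batch, time_steps) of integer word indices and its
#output is (batch, time_steps, embedding_size); index 0 ('padzero') is masked for the layers below.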
'''the input of the LSTM is a 3d tensor; the output is either 2d or 3d.
If you set return_sequences to True, it will be 3d.
The layer below is your encoding layer.'''
model.add(LSTM(output_dim=embedding_size))
# model.add(Dense(hidden_size)) #seen this in other models -- these two might be useful to test
# model.add(Activation('relu'))
#this layer is the RepeatVector: the encoder LSTM returns one fixed-size vector per sentence,
#and RepeatVector copies it time_steps times so the decoder LSTMs receive a 3d sequence input
model.add(RepeatVector(time_steps))
'''DECODER STARTS HERE'''
for z in xrange(decoder_layers-1):
    model.add(LSTM(output_dim=embedding_size, return_sequences=True))
    model.add(Dropout(dropout))
model.add(LSTM(output_dim=embedding_size, return_sequences=False))
model.add(Dropout(dropout))
'''"digits" in the reference example is the equivalent of time_steps here: how many output positions you want per sentence'''
'''something really important here is that the decoder can be multiple LSTMs. This is where you would stack your LSTMs and apply dropouts!'''
model.add(Dense(output_dim=time_steps))
#stateful rnn's might be super useful here but i don't know
model.add(Activation('time_distributed_softmax')) #this is good
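#Note (added, hedged): with return_sequences=False on the last decoder LSTM, Dense(time_steps)
#makes the network emit one vector of length time_steps per sentence, not a distribution over the
#vocabulary at each time step, and categorical_crossentropy against the integer-coded y_train will
#not learn word predictions. One common alternative (an assumption, not tested here) is to keep
#return_sequences=True on the last decoder LSTM and use a per-timestep output layer, e.g.:
# model.add(TimeDistributedDense(increased_vocab))
# model.add(Activation('time_distributed_softmax'))
#with one-hot targets of shape (n_sentences, time_steps, increased_vocab).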
print ('you are starting to compile')
model.compile(loss='categorical_crossentropy', optimizer='adam') #these settings are good
print ('you just finished compiling')
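#Shape probe (added, hedged): predicting on a couple of rows before training is a cheap way to see
#what the network actually outputs with this architecture.
# probe = model.predict(X_train[:2], verbose=0)
# print(probe.shape)  #expected (2, time_steps) with the Dense(time_steps) head above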
'''*************************Advanced Modifications For Much Later On in Development************'''
early_stopping = EarlyStopping(monitor='val_loss', patience=patience)
def sample(a, temperature=1.0): #i want to use this in the prediction output but I'm not quite sure how to at this point. Not a priority right now.
    a = np.log(a)/temperature
    a = np.exp(a)/np.sum(np.exp(a))
    return np.argmax(np.random.multinomial(1,a,1))
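#Hedged usage sketch (added): given a probability distribution over the vocabulary for one time
#step, e.g. word_probs (hypothetical name), a word index could be drawn with
#   next_index = sample(word_probs, temperature=0.75)
#   next_word = index_to_word[next_index]
#Lower temperatures sharpen the distribution (safer words); higher temperatures flatten it.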
'''**********************************Train the model*******************************'''
for iteration in range(1, number_of_iterations):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X_train, y_train, verbose=1, validation_split=0.2, callbacks=[early_stopping], nb_epoch=1, batch_size=batch_size, show_accuracy=True)
    random_sentence = random.choice(tokenized_sentences) #this is a list of word tokens, not a string
    print('below is random_sentence')
    print(random_sentence)
    print()
    reverse_random_sentence = random_sentence[::-1]
    copy_reverse_random_sentence = reverse_random_sentence
    print()
    print('The starter sentence is: ')
    print()
    for word in random_sentence:
        sys.stdout.write(word)
        sys.stdout.write(' ')
    # for diversity in [0.2, 0.4, 0.6, 0.75]:
    for diversity in [0.75]:
        print()
        print('----- diversity:', diversity)
        #now convert the chosen (reversed) sentence from words into word indices
        reverse_random_sentence = copy_reverse_random_sentence
        translated_reverse_random_sentence = []
        for eachword2 in reverse_random_sentence:
            translated_reverse_random_sentence.append(word_to_index[eachword2])
        for attempt in range(2):
            #for the prediction model, you need to input a sentence in reverse order.
            #It should be a list of numbers that represent the words.
            # translated_reverse_random_sentence_padded = sequence.pad_sequences(np.array(translated_reverse_random_sentence), maxlen=time_steps)
            sampletry = X_train[np.random.randint(10, size=1), :]
            #the prediction function takes 2d input, because the model was given 2d input,
            #but it yields a one-dimensional output per sample -- I think this is because of the Dense layer at the bottom.
            preds = model.predict(sampletry, batch_size=batch_size, verbose=0)[0]
            print()
            print('preds is shown below')
            print(preds)
            # for eachword1 in preds:
            #     sys.stdout.write((index_to_word[eachword1]))
            #     sys.stdout.write(' ')
            # translated_reverse_random_sentence = preds[::-1]
            # sys.stdout.flush()
            print()
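            #Hedged decoding sketch (added, not the original author's method): with the current
            #Dense(time_steps) output head, preds is a length-time_steps vector of floats, so it
            #cannot be mapped straight back to words. If the output head were changed to
            #TimeDistributedDense(increased_vocab) + softmax (see the note above the compile step),
            #each row of preds would be a distribution over the vocabulary and could be decoded with
            #the sample() helper defined earlier, roughly like this:
            # for word_probs in preds:
            #     next_index = sample(word_probs, temperature=diversity)
            #     if next_index != 0:  #skip the 'padzero' padding index
            #         sys.stdout.write(index_to_word[next_index] + ' ')
            # sys.stdout.flush()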