# utils.py (forked from pender/chatbot-rnn)
import os
import io
import collections
import cPickle  # Python 2 standard library (this module targets Python 2)
from bz2 import BZ2File

import numpy as np


class TextLoader():
    # Load text from the files in a data directory and serve it up in
    # training batches.
    def __init__(self, data_dir, batch_size, seq_length, encoding='utf-8'):
        # Remember the initialization arguments.
self.data_dir = data_dir
self.batch_size = batch_size
self.seq_length = seq_length
self.encoding = encoding
self.tensor_sizes = []
self.tensor_file_template = os.path.join(data_dir, "data{}.npz")
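        # One preprocessed tensor file is saved per input file: data0.npz, data1.npz, ...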
vocab_file = os.path.join(data_dir, "vocab.pkl")
sizes_file = os.path.join(data_dir, "sizes.pkl")
self.input_files = self._get_input_file_list(data_dir)
self.input_file_count = len(self.input_files)
if self.input_file_count < 1:
raise ValueError("Input files not found. File names must end in '.txt' or '.bz2'.")
if self._preprocess_required(vocab_file, sizes_file, self.tensor_file_template, self.input_file_count):
# If either the vocab file or the tensor file doesn't already exist, create them.
print("Preprocessing the following files: {}".format(self.input_files))
vocab_counter = collections.Counter()
for i in xrange(self.input_file_count):
print("reading vocab from input file {}".format(self.input_files[i]))
self._augment_vocab(vocab_counter, self.input_files[i])
print("saving vocab file")
self._save_vocab(vocab_counter, vocab_file)
for i in xrange(self.input_file_count):
print("preprocessing input file {}".format(self.input_files[i]))
self._preprocess(self.input_files[i], self.tensor_file_template.format(i))
self.tensor_sizes.append(self.tensor.size)
with open(sizes_file, 'wb') as f:
cPickle.dump(self.tensor_sizes, f)
print ("processed input text file: {} characters loaded".format(self.tensor.size))
else:
# If the vocab file and sizes file already exist, load them.
print "loading vocab file"
self._load_vocab(vocab_file)
print "loading sizes file"
with open(sizes_file, 'rb') as f:
self.tensor_sizes = cPickle.load(f)
        # Floor-divide: any partial batch at the end of a tensor is dropped.
        self.tensor_batch_counts = [n // (self.batch_size * self.seq_length) for n in self.tensor_sizes]
self.total_batch_count = sum(self.tensor_batch_counts)
print("total batch count: {}".format(self.total_batch_count))
self.tensor_index = -1
def _preprocess_required(self, vocab_file, sizes_file, tensor_file_template, input_file_count):
if not os.path.exists(vocab_file):
print("No vocab file found. Preprocessing...")
return True
if not os.path.exists(sizes_file):
print("No sizes file found. Preprocessing...")
return True
for i in xrange(input_file_count):
if not os.path.exists(tensor_file_template.format(i)):
print ("Couldn't find {}. Preprocessing...".format(tensor_file_template.format(i)))
return True
return False
def _get_input_file_list(self, data_dir):
suffixes = ['.txt', '.bz2']
input_file_list = []
        if os.path.isdir(data_dir):
            for walk_root, _, walk_files in os.walk(data_dir):
                for file_name in walk_files:
                    if file_name.startswith("."):
                        continue
                    file_path = os.path.join(walk_root, file_name)
                    if file_path.endswith(tuple(suffixes)):
                        input_file_list.append(file_path)
        else:
            raise ValueError("Not a directory: {}".format(data_dir))
return sorted(input_file_list)
    def _augment_vocab(self, vocab_counter, input_file):
        # Read one input file and add its character counts to vocab_counter.
        # Counting happens over the encoded byte string so that .txt and .bz2
        # inputs are handled identically.
        if input_file.endswith(".bz2"):
            file_reference = BZ2File(input_file, "r")
            raw_data = file_reference.read()  # bz2 contents are already encoded bytes
        elif input_file.endswith(".txt"):
            file_reference = io.open(input_file, "r", encoding=self.encoding)
            raw_data = file_reference.read().encode(self.encoding)
        else:
            raise ValueError("Unsupported file type: {}".format(input_file))
        file_reference.close()
        vocab_counter.update(raw_data)
def _save_vocab(self, vocab_counter, vocab_file):
        # count_pairs is a list of vocab_counter's (character, count) entries,
        # sorted in descending order of count.
        # The first item of the list is a 2-item tuple of the most common character
        # and the number of times it occurs, then the second-most common, etc. -- e.g.:
        # [(' ', 17), ('a', 11), ('e', 7), ('n', 7), ...]
count_pairs = sorted(vocab_counter.items(), key=lambda x: -x[1])
# self.chars is a tuple (immutable ordered list) of characters, in descending order
# from most common to least. E.g.:
# (' ', 'a', 'e', 'n', 't', ...)
# This is a lookup device to convert index number to character.
        # How does this work?
        # zip(*___) transposes its arguments: the i-th tuple it returns contains
        # the i-th element from each of the argument sequences.
        # So zip(*count_pairs) returns two tuples: the first holds the characters
        # in descending order of frequency, and the second holds the frequencies
        # of those same characters.
        # The assignment unpacks the first tuple (characters in descending order)
        # into self.chars and the second (the character counts) into a
        # disregarded variable.
        self.chars, _ = zip(*count_pairs)
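        # A concrete (hypothetical) example of the transposition:
        #   count_pairs       = [(' ', 17), ('a', 11), ('e', 7)]
        #   zip(*count_pairs) == [(' ', 'a', 'e'), (17, 11, 7)]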
        # self.vocab_size is the number of distinct characters in the input data.
        self.vocab_size = len(self.chars)
        # self.vocab is a dictionary that maps each character to its index number. For example:
        # {' ': 0, 'a': 1, 'e': 2, 'n': 3, ...}
# This is a lookup device to convert a character to its index number.
self.vocab = dict(zip(self.chars, range(len(self.chars))))
# Save the characters tuple to vocab.pkl (tiny file).
with open(vocab_file, 'wb') as f:
cPickle.dump(self.chars, f)
print("saved vocab (vocab size: {})".format(self.vocab_size))
def _load_vocab(self, vocab_file):
# Load the character tuple (vocab.pkl) to self.chars.
# Remember that it is in descending order of character frequency in the data.
with open(vocab_file, 'rb') as f:
self.chars = cPickle.load(f)
# Use the character tuple to regenerate vocab_size and the vocab dictionary.
self.vocab_size = len(self.chars)
self.vocab = dict(zip(self.chars, range(len(self.chars))))
    def _preprocess(self, input_file, tensor_file):
        # Read the input file as an encoded byte string, mirroring
        # _augment_vocab so the characters here match the vocab keys.
        if input_file.endswith(".bz2"):
            file_reference = BZ2File(input_file, "r")
            data = file_reference.read()  # bz2 contents are already encoded bytes
        elif input_file.endswith(".txt"):
            file_reference = io.open(input_file, "r", encoding=self.encoding)
            data = file_reference.read().encode(self.encoding)
        else:
            raise ValueError("Unsupported file type: {}".format(input_file))
        file_reference.close()
# Convert the entirety of the data file from characters to indices via the vocab dictionary.
# How? map(function, iterable) returns a list of the output of the function
# executed on each member of the iterable. E.g.:
# [14, 2, 9, 2, 0, 6, 7, 0, ...]
# np.array converts the list into a numpy array.
self.tensor = np.array(map(self.vocab.get, data))
# Compress and save the numpy tensor array to data.npz.
np.savez_compressed(tensor_file, tensor_data=self.tensor)
def _load_preprocessed(self, tensor_index):
self.reset_batch_pointer()
if tensor_index == self.tensor_index:
return
print("loading tensor data file {}".format(tensor_index))
tensor_file = self.tensor_file_template.format(tensor_index)
# Load the data tensor file to self.tensor.
with np.load(tensor_file) as loaded:
self.tensor = loaded['tensor_data']
self.tensor_index = tensor_index
# Calculate the number of batches in the data. Each batch is batch_size x seq_length,
# so this is just the input data size divided by that product, rounded down.
        self.num_batches = self.tensor.size // (self.batch_size * self.seq_length)
if self.tensor_batch_counts[tensor_index] != self.num_batches:
print("Error in batch size! Expected {}; found {}".format(self.tensor_batch_counts[tensor_index],
self.num_batches))
# Chop off the end of the data tensor so that the length of the data is a whole
# multiple of the (batch_size x seq_length) product.
# Do this with the slice operator on the numpy array.
self.tensor = self.tensor[:self.num_batches * self.batch_size * self.seq_length]
# Construct two numpy arrays to represent input characters (xdata)
# and target characters (ydata).
# In training, we will feed in input characters one at a time, and optimize along
# a loss function computed against the target characters.
# (We do this with batch_size characters at a time, in parallel.)
# Since this is a sequence prediction net, the target is just the input right-shifted
# by 1.
xdata = self.tensor
ydata = np.copy(self.tensor) # Y-data starts as a copy of x-data.
ydata[:-1] = xdata[1:] # Right-shift y-data by 1 using the numpy array slice syntax.
# Replace the very last character of y-data with the first character of the input data.
ydata[-1] = xdata[0]
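        # Illustration (shown as characters rather than their vocab indices):
        #   xdata: "hello"  ->  ydata: "elloh"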
        # Split our one-dimensional data array into distinct batches.
        # How? xdata.reshape(self.batch_size, -1) returns a 2D numpy view
        # in which the first dimension is the row index (from 0 to batch_size)
        # and the second dimension is the index of the character within that row
        # (from 0 to (num_batches x seq_length)).
# Within each batch, characters follow the same sequence as in the input data.
# Then, np.split(that 2D numpy tensor, num_batches, 1) gives a list of numpy arrays.
# Say batch_size = 4, seq_length = 5, and data is the following string:
# "Here is a new string named data. It is a new string named data. It is named data."
# We truncate the string to lop off the last period (so there are now 80 characters,
# which is evenly divisible by 4 x 5). After xdata.reshape, we have:
#
# [[Here is a new string],
# [ named data. It is a],
# [ new string named da],
# [ta. It is named data]]
#
# After np.split, we have:
# <[[Here ], <[[is a ], <[[new s], <[[tring],
# [ name], [d dat], [a. It], [ is a],
# [ new ], [strin], [g nam], [ed da],
# [ta. I]]>, [t is ]]>, [named]]>, [ data]]>
#
# where the first item of the list is the numpy array on the left.
        # Thus x_batches is a list of num_batches numpy arrays. The first dimension
        # of each array is the row within the batch (from 0 to batch_size), and the
        # second dimension is the character index within the row (from 0 to seq_length).
#
# These will be fed to the model one at a time sequentially.
# State is preserved between sequential batches.
#
self.x_batches = np.split(xdata.reshape(self.batch_size, -1), self.num_batches, 1)
self.y_batches = np.split(ydata.reshape(self.batch_size, -1), self.num_batches, 1)
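        # Each element of x_batches and y_batches is an array of shape
        # (batch_size, seq_length).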
def next_batch(self):
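        # Serve the next (input, target) batch pair. The first tensor file is
        # loaded lazily; when the current tensor is exhausted, advance to the
        # next one (wrapping around after the last file).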
if self.tensor_index < 0:
self._load_preprocessed(0)
if self.pointer >= self.num_batches:
self._load_preprocessed((self.tensor_index + 1) % self.input_file_count)
x, y = self.x_batches[self.pointer], self.y_batches[self.pointer]
self.pointer += 1
return x, y
def reset_batch_pointer(self):
self.pointer = 0
def cue_batch_pointer_to_epoch_fraction(self, epoch_fraction):
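        # Convert a fractional epoch position into a batch step count.
        # Hypothetical example: with total_batch_count = 1000, an epoch_fraction
        # of 2.25 cues the pointer 250 steps into the data.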
step_target = (epoch_fraction - int(epoch_fraction)) * self.total_batch_count
self._cue_batch_pointer_to_step_count(step_target)
    def _cue_batch_pointer_to_step_count(self, step_target):
        # Find the tensor i in which the target step falls; after the loop,
        # step_target holds the remaining batch offset within that tensor.
        for i, n in enumerate(self.tensor_batch_counts):
            if step_target < n:
                break
            step_target -= n
        # Load the tensor first (loading resets the batch pointer), then cue
        # the pointer to the remaining offset.
        self._load_preprocessed(i)
        self.pointer = int(step_target)
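

# A minimal usage sketch, not part of the original module: it assumes a "data/"
# directory containing at least one .txt or .bz2 file; the batch_size and
# seq_length values are arbitrary illustrations.
if __name__ == '__main__':
    loader = TextLoader('data', batch_size=4, seq_length=5)
    print("vocab size: {}".format(loader.vocab_size))
    x, y = loader.next_batch()  # each has shape (batch_size, seq_length)
    print("first input batch:\n{}".format(x))
    print("first target batch:\n{}".format(y))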