-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathrnn_utils.py
65 lines (55 loc) · 2.22 KB
/
rnn_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import os
import re
from collections import defaultdict
import operator
def load_matrix_imdb(path='imdb.npz', num_words=None, skip_top=0,
                     maxlen=None, seed=113,
                     start_char=1, oov_char=2, index_from=3, **kwargs):
    """Load IMDB word-index sequences as fixed-width train/test matrices.

    Modified code from Keras. Reads the ``x_train``/``y_train``/``x_test``/
    ``y_test`` arrays from an ``.npz`` archive, shuffles each split, shifts
    the word indices, crops and left-pads every sequence to ``maxlen`` and
    returns ``(x_train, y_train), (x_test, y_test)``.

    Args:
        path: Location of the ``.npz`` archive. If the file does not exist
            it is downloaded to this location first.
        num_words: Indices ``>= num_words`` are treated as out of range.
            When falsy, the largest (shifted) index found in the data is
            used, which means that largest index itself is out of range.
        skip_top: Indices ``< skip_top`` are treated as out of range.
        maxlen: Width of the returned matrices. Longer sequences keep only
            their first ``maxlen`` items; shorter ones are padded with 0 on
            the left. When falsy, the longest sequence length is used.
        seed: Seed passed to ``np.random.seed`` before shuffling. This
            reseeds NumPy's global random state.
        start_char: If not None, prepended to every sequence.
        oov_char: Value that replaces out-of-range indices. If None,
            out-of-range indices are dropped instead.
        index_from: Offset added to every word index.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where each ``x_*`` is a
        float64 array with one row of length ``maxlen`` per sequence and
        each ``y_*`` holds the matching labels.
    """
    if not os.path.exists(path):
        # Download with the standard library and write to `path` itself, so
        # the load below finds the file even for a non-default path and no
        # external `wget` binary is required.
        from urllib.request import urlretrieve
        print("Downloading matrix data to {}".format(path))
        urlretrieve("https://s3.amazonaws.com/text-datasets/imdb.npz", path)

    # The sequences have different lengths and are stored as object arrays,
    # which NumPy refuses to unpickle unless allow_pickle=True is passed.
    # NOTE: unpickling can execute arbitrary code; only load trusted archives.
    with np.load(path, allow_pickle=True) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    # Shuffle each split with one permutation so samples stay aligned with
    # their labels.
    np.random.seed(seed)
    indices = np.arange(len(x_train))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    np.random.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    # Process both splits together; they are separated again at the end.
    xs = np.concatenate([x_train, x_test])
    labels = np.concatenate([labels_train, labels_test])

    if start_char is not None:
        xs = [[start_char] + [w + index_from for w in x] for x in xs]
    elif index_from:
        xs = [[w + index_from for w in x] for x in xs]

    if not num_words:
        num_words = max(max(x) for x in xs)
    if not maxlen:
        maxlen = max(len(x) for x in xs)

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    xs_new = []
    for x in xs:
        x = x[:maxlen]  # crop long sequences
        if oov_char is not None:  # replace rare or frequent symbols
            x = [w if (skip_top <= w < num_words) else oov_char for w in x]
        else:  # or filter rare and frequent symbols
            x = [w for w in x if skip_top <= w < num_words]
        # Default dtype (float64) is kept so the output type is unchanged.
        x_padded = np.zeros(maxlen)
        # An empty sequence must stay all padding: `-0` would slice the
        # whole row and the assignment would fail to broadcast.
        if x:
            x_padded[-len(x):] = x
        xs_new.append(x_padded)

    idx = len(x_train)
    x_train, y_train = np.array(xs_new[:idx]), np.array(labels[:idx])
    x_test, y_test = np.array(xs_new[idx:]), np.array(labels[idx:])
    return (x_train, y_train), (x_test, y_test)