forked from suriyadeepan/practical_seq2seq
-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_utils.py
69 lines (55 loc) · 1.75 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
from random import sample
def split_dataset(x, y, ratio=(0.7, 0.15, 0.15)):
    """Split parallel sequences into train, test and validation partitions.

    The split is positional; nothing is shuffled. Train is taken from the
    front, test follows it directly, and validation is taken from the tail.
    Each partition holds ``int(len(x) * fraction)`` examples, so truncation
    may leave a few examples between test and validation unused.

    Args:
        x: Sliceable sequence of inputs; its length determines all sizes.
        y: Sliceable sequence of targets, partitioned with the same sizes.
        ratio: Fractions of the data; the first entry sizes the train split,
            the second the test split and the last the validation split.
            Defaults to 70% / 15% / 15%.

    Returns:
        tuple: ``((trainX, trainY), (testX, testY), (validX, validY))``.
    """
    # number of examples
    data_len = len(x)
    lens = [int(data_len * item) for item in ratio]

    train_end = lens[0]
    test_end = train_end + lens[1]
    valid_len = lens[-1]

    trainX, trainY = x[:train_end], y[:train_end]
    testX, testY = x[train_end:test_end], y[train_end:test_end]

    # Slice the tail from an explicit start index: a negative-index slice
    # would turn a validation size of 0 into x[-0:], i.e. the whole dataset.
    validX = x[max(0, len(x) - valid_len):]
    validY = y[max(0, len(y) - valid_len):]

    return (trainX, trainY), (testX, testY), (validX, validY)
def batch_gen(x, y, batch_size):
    """Yield consecutive fixed-size batches from x and y, cycling forever.

    Batches are taken in order as ``x[0:b]``, ``x[b:2b]``, ... and the same
    slices of ``y``. A trailing remainder shorter than ``batch_size`` is
    dropped so that every batch has exactly ``batch_size`` examples. After
    the last full batch the generator starts over from the beginning.

    Args:
        x: Inputs; must support ``len``, slicing and ``.T``
            (e.g. NumPy arrays).
        y: Targets, sliced with the same bounds as ``x``.
        batch_size: Number of examples per batch; must be positive.

    Yields:
        tuple: ``(x_batch.T, y_batch.T)`` - each slice is transposed before
        being yielded.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``x`` holds
            fewer than ``batch_size`` examples (no full batch exists, which
            would otherwise make the endless loop spin without yielding).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    data_len = len(x)
    if data_len < batch_size:
        raise ValueError("batch_size exceeds the number of examples")

    # Highest offset at which a complete batch still fits.
    last_start = data_len - batch_size

    # infinite while
    while True:
        # `start` is an element offset, so the batch ends at
        # start + batch_size (not at (start + 1) * batch_size).
        for start in range(0, last_start + 1, batch_size):
            stop = start + batch_size
            yield x[start:stop].T, y[start:stop].T
def rand_batch_gen(x, y, batch_size):
    """Yield random batches from x and y forever.

    Each batch is built from ``batch_size`` distinct indices drawn without
    replacement by ``random.sample``; the same indices select from both
    ``x`` and ``y``. Successive batches are drawn independently, so an
    example may reappear in a later batch.

    Args:
        x: Inputs; must support ``len``, indexing with a list of indices
            and ``.T`` (e.g. NumPy arrays).
        y: Targets, indexed with the same indices as ``x``.
        batch_size: Number of examples per batch; ``random.sample`` raises
            ``ValueError`` if it exceeds ``len(x)``.

    Yields:
        tuple: ``(x[idx].T, y[idx].T)`` for the sampled index list ``idx``.
    """
    # Sample straight from a range object: this avoids materialising a list
    # of every index for each batch and selects the same index values.
    index_pool = range(len(x))
    while True:
        sample_idx = sample(index_pool, batch_size)
        yield x[sample_idx].T, y[sample_idx].T
#'''
# convert indices of alphabets into a string (word)
# return str(word)
#
#'''
#def decode_word(alpha_seq, idx2alpha):
# return ''.join([ idx2alpha[alpha] for alpha in alpha_seq if alpha ])
#
#
#'''
# convert indices of phonemes into list of phonemes (as string)
# return str(phoneme_list)
#
#'''
#def decode_phonemes(pho_seq, idx2pho):
# return ' '.join( [ idx2pho[pho] for pho in pho_seq if pho ])
def decode(sequence, lookup, separator=''):
    """Translate a sequence of indices into a joined string.

    Args:
        sequence: Iterable of indices; falsy entries (0 is used for
            padding) are skipped.
        lookup: Mapping or list giving the string for each index.
        separator: String placed between the decoded items.

    Returns:
        str: The looked-up items joined by ``separator``.
    """
    pieces = []
    for token in sequence:
        if not token:
            # 0 used for padding, is ignored
            continue
        pieces.append(lookup[token])
    return separator.join(pieces)