-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain.py
executable file
·74 lines (56 loc) · 2.93 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import glob
import tensorflow as tf
from tensorflow.python.client import device_lib
print(f"Available: {[x.name for x in device_lib.list_local_devices()]}")
import numpy as np
import os
from scipy.io.wavfile import read
from sklearn.model_selection import train_test_split
from util import transform_audio, plot_results
from config import DURATION, SAMPLE_RATE, WINDOW_SIZE, WINDOW_STRIDE, WINDOW
lr = 0.001
# Import data
path = "./wav_x"
audios = [os.path.abspath(file) for file in glob.glob(f"{path}/*.wav")]
datapoints = []
for audio in audios:
sr, a = read(audio)
a = np.array(a, dtype=float)
if a.shape[0] != DURATION:
a = np.pad(a, (0, DURATION - a.shape[0]), 'constant')
datapoints.append(transform_audio(a, SAMPLE_RATE, WINDOW_SIZE, WINDOW_STRIDE, WINDOW))
# x_train, x_test = datapoints[:-20], datapoints[-20:]
print(f"Dataset size: {len(datapoints)} samples")
x_train, x_test = train_test_split(datapoints, test_size=0.1, random_state=42)
x_train = np.array(x_train)
x_test = np.array(x_test)
print(f"Training size: {x_train.shape}")
print(f"Val size: {x_test.shape}")
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))
original_dim = x_train.shape[1]
# this is the siez of our encoded representations
ENCODING_DIM = 5
input = tf.keras.layers.Input(shape=(original_dim,))
encoded = tf.keras.layers.Dense(ENCODING_DIM*100, activation='relu')(input)
encoded2 = tf.keras.layers.Dense(ENCODING_DIM*10, activation='relu')(encoded)
encoded3 = tf.keras.layers.Dense(ENCODING_DIM*5, activation='relu')(encoded2)
encoded4 = tf.keras.layers.Dense(ENCODING_DIM, activation='relu', activity_regularizer=tf.keras.regularizers.l1(10e-5), name='encoder')(encoded3)
decoded3 = tf.keras.layers.Dense(ENCODING_DIM*5, activation='relu')(encoded4)
decoded2 = tf.keras.layers.Dense(ENCODING_DIM*10, activation='relu')(decoded3)
decoded = tf.keras.layers.Dense(ENCODING_DIM*100, activation='relu')(decoded2)
output = tf.keras.layers.Dense(original_dim, activation='linear')(decoded)
autoencoder = tf.keras.models.Model(input, output)
print(autoencoder.summary())
optimizer = tf.keras.optimizers.Adam(lr=lr)
autoencoder.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['accuracy'])
encoder = tf.keras.models.Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoder').output)
# with added regularization, the model is less likely to overfit and can be trained
# longer. The model ends with a train loss of 0.11 and test loss of 0.10. The difference is mostly due to
# the regularization term being added to the loss during training
autoencoder.fit(x_train, x_train, epochs=5000, batch_size=64, shuffle=True, validation_data=(x_test, x_test))
encoded_audio = encoder.predict(x_test)
print(encoded_audio)
decoded_audio = autoencoder.predict(x_train)
plot_results(test=x_train, decoded=decoded_audio, n=12)
autoencoder.save('./saved_models/crazy_bird_max.h5')