-
Notifications
You must be signed in to change notification settings - Fork 38
/
vggish.py
118 lines (87 loc) · 3.98 KB
/
vggish.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""VGGish model for Keras. A VGG-like model for audio classification
# Reference
- [CNN Architectures for Large-Scale Audio Classification](ICASSP 2017)
"""
from __future__ import print_function
from __future__ import absolute_import
import sys
sys.path.append('/home/hudi/anaconda2/lib/python2.7/site-packages/h5py')
sys.path.append('/home/hudi/anaconda2/lib/python2.7/site-packages/Keras-2.0.6-py2.7.egg')
from keras.models import Model
from keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.engine.topology import get_source_inputs
from keras import backend as K
import vggish_params as params
# Weight files for the 'audioset' pre-trained model.
# WEIGHTS_PATH is loaded when include_top=False; WEIGHTS_PATH_TOP is loaded
# when include_top=True.
WEIGHTS_PATH = '/home/brain/Documents/git/VGGish/vggish_audioset_weights_without_fc2.h5'
# NOTE(review): VGGish() referenced WEIGHTS_PATH_TOP without it ever being
# defined, so the default call (include_top=True, load_weights=True) raised
# NameError. The file name below is assumed to sit next to WEIGHTS_PATH --
# confirm it matches the full-model weight file on this machine.
WEIGHTS_PATH_TOP = '/home/brain/Documents/git/VGGish/vggish_audioset_weights.h5'


def VGGish(load_weights=True, weights='audioset',
           input_tensor=None, input_shape=None,
           out_dim=None, include_top=True, pooling='avg'):
    """Build the VGGish network and optionally load pre-trained weights.

    The network is four convolutional blocks (64, 128, 256x2, 512x2 filters,
    each followed by 2x2 max pooling), followed either by the fully-connected
    head or by a global pooling layer.

    :param load_weights: if True, try to load weights after building the model.
    :param weights: ``'audioset'`` to use the pre-trained weight files, or
        ``None`` for random initialization. With ``load_weights=True`` and
        ``weights=None`` nothing is loaded and a message is printed.
    :param input_tensor: optional tensor to use as the model input.
    :param input_shape: shape of the input; defaults to
        ``(params.NUM_FRAMES, params.NUM_BANDS, 1)``.
    :param out_dim: size of the final dense layer; defaults to
        ``params.EMBEDDING_SIZE``. Only used when ``include_top`` is True.
    :param include_top: whether to include the 3 fully-connected layers at
        the top of the network.
    :param pooling: global pooling applied when ``include_top`` is False:
        ``'avg'`` or ``'max'``. Any other value leaves the output of the last
        pooling block unchanged.
    :return: A Keras model instance.
    :raises ValueError: if ``weights`` is neither ``'audioset'`` nor ``None``.
    """
    # Validate before touching Keras so bad arguments fail fast.
    if weights not in {'audioset', None}:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization) or `audioset` '
                         '(pre-training on audioset).')

    if out_dim is None:
        out_dim = params.EMBEDDING_SIZE

    # input shape
    if input_shape is None:
        input_shape = (params.NUM_FRAMES, params.NUM_BANDS, 1)

    if input_tensor is None:
        aud_input = Input(shape=input_shape, name='input_1')
    elif not K.is_keras_tensor(input_tensor):
        # Wrap a raw backend tensor in a Keras Input layer.
        aud_input = Input(tensor=input_tensor, shape=input_shape, name='input_1')
    else:
        aud_input = input_tensor

    # Block 1
    x = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv1')(aud_input)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)

    # Block 2
    x = Conv2D(128, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)

    # Block 3
    x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv3/conv3_1')(x)
    x = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv3/conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)

    # Block 4
    x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv4/conv4_1')(x)
    x = Conv2D(512, (3, 3), strides=(1, 1), activation='relu', padding='same', name='conv4/conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

    if include_top:
        # FC block
        x = Flatten(name='flatten_')(x)
        x = Dense(4096, activation='relu', name='vggish_fc1/fc1_1')(x)
        x = Dense(4096, activation='relu', name='vggish_fc1/fc1_2')(x)
        x = Dense(out_dim, activation='relu', name='vggish_fc2')(x)
    else:
        if pooling == 'avg':
            x = GlobalAveragePooling2D()(x)
        elif pooling == 'max':
            x = GlobalMaxPooling2D()(x)

    # When the caller supplied a tensor, the model input must be the source
    # input(s) of that tensor rather than the tensor itself.
    if input_tensor is not None:
        inputs = get_source_inputs(input_tensor)
    else:
        inputs = aud_input

    # Create model.
    model = Model(inputs, x, name='VGGish')

    # load weights
    if load_weights:
        if weights == 'audioset':
            if include_top:
                model.load_weights(WEIGHTS_PATH_TOP)
            else:
                model.load_weights(WEIGHTS_PATH)
        else:
            print("failed to load weights")

    return model