# ecapa_tdnn.py (forked from lawlict/ECAPA-TDNN)
import torch
import torch.nn as nn
import torch.nn.functional as F


''' Res2Conv1d + BatchNorm1d + ReLU
'''

class Res2Conv1dReluBn(nn.Module):
    '''
    in_channels == out_channels == channels
    '''
    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        self.convs = []
        self.bns = []
        for i in range(self.nums):
            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
            self.bns.append(nn.BatchNorm1d(self.width))
        self.convs = nn.ModuleList(self.convs)
        self.bns = nn.ModuleList(self.bns)

    def forward(self, x):
        out = []
        spx = torch.split(x, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            # Order: conv -> relu -> bn
            sp = self.convs[i](sp)
            sp = self.bns[i](F.relu(sp))
            out.append(sp)
        if self.scale != 1:
            out.append(spx[self.nums])
        out = torch.cat(out, dim=1)
        return out
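
# Shape sketch for the configuration used by ECAPA_TDNN below (channels=512,
# scale=8, so each split has width 64): the first scale - 1 splits go through
# conv -> relu -> bn with a hierarchical residual, the last split is copied
# through unchanged, and the concatenation restores the full channel count while
# the padding/dilation choice preserves the sequence length. Illustrative only:
#
#   >>> blk = Res2Conv1dReluBn(512, kernel_size=3, padding=2, dilation=2, scale=8)
#   >>> blk(torch.randn(2, 512, 200)).shape
#   torch.Size([2, 512, 200])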


''' Conv1d + BatchNorm1d + ReLU
'''

class Conv1dReluBn(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        return self.bn(F.relu(self.conv(x)))


''' Squeeze-and-Excitation (SE) connection for the 1D case.
'''

class SE_Connect(nn.Module):
    def __init__(self, channels, s=2):
        super().__init__()
        assert channels % s == 0, "{} % {} != 0".format(channels, s)
        self.linear1 = nn.Linear(channels, channels // s)
        self.linear2 = nn.Linear(channels // s, channels)

    def forward(self, x):
        out = x.mean(dim=2)
        out = F.relu(self.linear1(out))
        out = torch.sigmoid(self.linear2(out))
        out = x * out.unsqueeze(2)
        return out
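
# Shape sketch: SE_Connect squeezes over time (mean over dim=2), produces one
# sigmoid gate per channel, and rescales the input, so shapes are preserved.
# Illustrative only:
#
#   >>> se = SE_Connect(512)
#   >>> se(torch.randn(2, 512, 200)).shape
#   torch.Size([2, 512, 200])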


''' SE-Res2Block.
    Note: the residual connection is implemented in the ECAPA_TDNN model, not here.
'''

def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
    return nn.Sequential(
        Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
        Res2Conv1dReluBn(channels, kernel_size, stride, padding, dilation, scale=scale),
        Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
        SE_Connect(channels)
    )


''' Attentive weighted mean and standard deviation pooling.
'''

class AttentiveStatsPool(nn.Module):
    def __init__(self, in_dim, bottleneck_dim):
        super().__init__()
        # Use Conv1d with stride == 1 rather than Linear, so we don't need to transpose the input.
        self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)   # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)   # equals V and k in the paper

    def forward(self, x):
        # DON'T use ReLU here! In experiments, ReLU made training hard to converge.
        alpha = torch.tanh(self.linear1(x))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        residuals = torch.sum(alpha * x ** 2, dim=2) - mean ** 2
        std = torch.sqrt(residuals.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
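
# The pooled statistics follow the usual attentive-statistics form: for each
# channel c, mu_c = sum_t alpha_{c,t} * x_{c,t} and
# sigma_c = sqrt(sum_t alpha_{c,t} * x_{c,t}^2 - mu_c^2), and [mu; sigma] is
# returned, doubling the channel dimension. Illustrative only:
#
#   >>> pool = AttentiveStatsPool(1536, 128)
#   >>> pool(torch.randn(2, 1536, 200)).shape
#   torch.Size([2, 3072])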


''' Implementation of
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification".

    Note that we DON'T concatenate the last frame-wise layer with the non-weighted mean and standard deviation,
    because it brings little improvement but significantly increases the number of model parameters.
    As a result, this implementation basically corresponds to configuration A.2 in Table 2 of the paper.
'''

class ECAPA_TDNN(nn.Module):
    def __init__(self, in_channels=80, channels=512, embd_dim=192):
        super().__init__()
        self.layer1 = Conv1dReluBn(in_channels, channels, kernel_size=5, padding=2)
        self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
        self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
        self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)

        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, 1536, kernel_size=1)
        self.pooling = AttentiveStatsPool(1536, 128)
        self.bn1 = nn.BatchNorm1d(3072)
        self.linear = nn.Linear(3072, embd_dim)
        self.bn2 = nn.BatchNorm1d(embd_dim)

    def forward(self, x):
        x = x.transpose(1, 2)   # (batch, time, feat) -> (batch, feat, time)
        out1 = self.layer1(x)
        out2 = self.layer2(out1) + out1
        out3 = self.layer3(out1 + out2) + out1 + out2
        out4 = self.layer4(out1 + out2 + out3) + out1 + out2 + out3

        out = torch.cat([out2, out3, out4], dim=1)
        out = F.relu(self.conv(out))
        out = self.bn1(self.pooling(out))
        out = self.bn2(self.linear(out))
        return out


if __name__ == '__main__':
    # Input size: batch_size * seq_len * feat_dim
    x = torch.zeros(2, 200, 80)
    model = ECAPA_TDNN(in_channels=80, channels=512, embd_dim=192)
    out = model(x)

    print(model)
    print(out.shape)    # should be [2, 192]
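
    # Extra sanity checks (illustrative only; the shapes follow from the default
    # channels=512 / 1536-dim pooling configuration above).
    frame_feats = torch.randn(2, 512, 200)
    print(model.layer2(frame_feats).shape)                   # expected: [2, 512, 200]
    print(model.pooling(torch.randn(2, 1536, 200)).shape)    # expected: [2, 3072]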