This repository has been archived by the owner on Feb 23, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 55
/
SpeechEmbedding.py
89 lines (60 loc) · 1.7 KB
/
SpeechEmbedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# coding: utf-8
# In[2]:
import sys
import torch
import torch.nn as nn
from torch.autograd import Variable
import librosa
import torch.nn.functional as F
from Modules.SpectralProcessing import PreNet
from Modules.Conv1dGLU import Conv1dGLU
from Modules.Encoder import Attention
# In[ ]:
# Default training batch size; Encoder.forward reshapes per-batch tensors.
batch_size = 64
# Number of spectrogram windows (samples) grouped per batch element;
# presumably utterance segments per speaker — verify against the data loader.
# Also fixes the feature size of Encoder's BatchNorm1d layer.
N_samples = 23
# In[55]:
class Encoder(nn.Module):
    """Speech-embedding encoder.

    Runs a PreNet and a gated 1-D conv stack over each spectrogram window,
    mean-pools over time, attends over the window (sample) axis, and returns
    one 512-dim embedding per batch element as an attention-weighted sum of
    residual-projected window vectors.
    """

    def __init__(self):
        super(Encoder, self).__init__()
        self.prenet = PreNet()
        self.conv = Conv1dGLU()
        self.attention = Attention(128)
        # Projects each 128-dim window vector to a scalar attention logit.
        self.prohead = nn.Linear(128, 1)
        # Residual projection into the final 512-dim embedding space.
        self.residual_conv = nn.Linear(128, 512)
        # Normalizes attention scores across the fixed N_samples axis,
        # so forward() requires exactly N_samples windows per batch element.
        self.bn = nn.BatchNorm1d(N_samples)

    def forward(self, x):
        """Encode a batch of grouped spectrogram windows.

        Args:
            x: 4-D tensor, assumed (batch, N_samples, frames, mel_bins)
               — TODO confirm against PreNet's expected input layout.

        Returns:
            (batch, 512) embedding tensor.
        """
        # Derive sizes from the input rather than the module-level globals:
        # the original hard-coded batch_size=64 and crashed on any batch of
        # a different size (e.g. the last partial batch of an epoch).
        B, S = x.size(0), x.size(1)
        x = self.prenet(x)
        # Fold (batch, samples) into one dim so the 1-D conv sees 3-D input.
        x = x.view(B * S, x.size(2), x.size(3)).transpose(1, 2)
        x = self.conv(x)
        # Bug fix: Tensor.contiguous() is NOT in-place — the original
        # discarded its result, so the following view() could raise on a
        # non-contiguous tensor. Assign the result instead.
        x = x.transpose(1, 2).contiguous()
        x = x.view(B, S, x.size(1), x.size(2))
        # Mean-pool over the time (frame) axis -> (B, S, 128).
        x = x.mean(dim=2)
        conv_out = self.residual_conv(x)  # (B, S, 512) residual branch
        x = self.attention(x)
        x = self.prohead(x)  # (B, S, 1) attention logits
        # Bug fix: squeeze only the known singleton dim; a bare squeeze()
        # would also drop the batch dim whenever B == 1.
        x = torch.squeeze(x, dim=2)
        x = F.softsign(x)
        x = self.bn(x)  # requires S == N_samples (BatchNorm1d feature size)
        x = torch.unsqueeze(x, dim=2)  # (B, S, 1) attention weights
        # Attention-weighted sum over windows: (B, 1, S) @ (B, S, 512).
        x = torch.bmm(x.transpose(1, 2), conv_out)
        return torch.squeeze(x, dim=1)  # (B, 512)
# In[56]:
# enc = Encoder()
# In[57]:
# z = torch.randn(25,20,100,80)
# In[58]:
# out = enc(Variable(z))
# out
# In[63]:
#def Temp_Masking(x):
#Create function for temporal masking. Use librosa.decompose.hpss. Split and concatinate dimensions to make it 2D.