attention.py
import torch
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """Dot-product attention over an encoder context, followed by a learned
    linear projection of the concatenated [attended context; decoder output]."""

    def __init__(self, dim):
        super(Attention, self).__init__()
        self.linear_out = nn.Linear(dim * 2, dim)
        self.mask = None
        self.dropout = nn.Dropout(p=0.2)

    def set_mask(self, mask):
        """Set a boolean mask of shape (batch, out_len, in_len); True positions
        are excluded from the attention distribution."""
        self.mask = mask

    def forward(self, output, context):
        context = self.dropout(context)
        batch_size = output.size(0)
        hidden_size = output.size(2)
        input_size = context.size(1)

        # (batch, out_len, dim) x (batch, in_len, dim) -> (batch, out_len, in_len)
        attn = torch.bmm(output, context.transpose(1, 2))
        if self.mask is not None:
            # Use masked_fill on the tensor itself rather than .data so autograd
            # still tracks the operation.
            attn = attn.masked_fill(self.mask, -float('inf'))
        attn = F.softmax(attn.view(-1, input_size), dim=1).view(batch_size, -1, input_size)

        # (batch, out_len, in_len) x (batch, in_len, dim) -> (batch, out_len, dim)
        mix = torch.bmm(attn, context)

        # concat -> (batch, out_len, 2*dim)
        combined = torch.cat((mix, output), dim=2)

        # project back to (batch, out_len, dim); torch.tanh replaces the
        # deprecated F.tanh
        output = torch.tanh(
            self.linear_out(combined.view(-1, 2 * hidden_size))
        ).view(batch_size, -1, hidden_size)

        return output, attn
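

# A minimal usage sketch, not part of the original file. The tensor names and
# sizes below are illustrative assumptions; any decoder output of shape
# (batch, out_len, dim) and encoder context of shape (batch, in_len, dim) with
# the same hidden size would work.
if __name__ == "__main__":
    dim = 16                              # shared hidden size
    batch_size, out_len, in_len = 2, 5, 7

    attention = Attention(dim)
    decoder_output = torch.randn(batch_size, out_len, dim)
    encoder_context = torch.randn(batch_size, in_len, dim)

    # Optional boolean mask: True entries receive no attention weight.
    # All-False here, so nothing is masked.
    attention.set_mask(torch.zeros(batch_size, out_len, in_len, dtype=torch.bool))

    output, attn = attention(decoder_output, encoder_context)
    print(output.shape)  # torch.Size([2, 5, 16])
    print(attn.shape)    # torch.Size([2, 5, 7])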