Hi everyone,

For several days I have been trying to implement a self-attention mechanism for a BiLSTM. Based on some resources I found on the web, I wrote the following attention code:

```
class Attention(nn.Module):
    """Pool a padded batch of sequences into one vector per sequence.

    Every time step is scored by a dot product with a single learned vector.
    The scores go through ReLU and a softmax over the time axis, padded
    positions are zeroed out, and the surviving weights are renormalised so
    that each row sums to one.

    Args:
        hidden_size: Size of the last (feature) dimension of the inputs.
        batch_first: Stored on the module but not consulted by ``forward``,
            which always indexes ``inputs`` as
            ``(batch_size, max_len, hidden_size)``.
    """

    def __init__(self, hidden_size, batch_first=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.batch_first = batch_first
        # torch.empty makes the "uninitialised until the next line" intent
        # explicit (the legacy torch.Tensor(...) constructor did the same).
        self.att_weights = nn.Parameter(torch.empty(1, hidden_size), requires_grad=True)
        stdv = 1.0 / (hidden_size ** 0.5)
        nn.init.uniform_(self.att_weights, -stdv, stdv)

    def forward(self, inputs, lengths):
        """Compute attention-pooled representations.

        Args:
            inputs: Tensor indexed as ``(batch_size, max_len, hidden_size)``.
            lengths: Valid length of each sequence (tensor or sequence of
                ints, one entry per batch element). Positions at or beyond
                the length receive zero attention.

        Returns:
            A tuple ``(representations, attentions)`` of shapes
            ``(batch_size, hidden_size)`` and ``(batch_size, max_len)``.
            A sequence of length 0 has no valid position to normalise over,
            so its row comes out as NaN.
        """
        max_len = inputs.size(1)

        # (batch, max_len, hidden) @ (hidden, 1) -> (batch, max_len, 1).
        # matmul broadcasts the weight, so no per-batch repeat is needed.
        scores = torch.matmul(inputs, self.att_weights.t()).squeeze(-1)
        attentions = torch.softmax(F.relu(scores), dim=-1)

        # Build the padding mask on the same device as the inputs instead of
        # unconditionally moving it to CUDA, so the module also runs on CPU.
        lengths = torch.as_tensor(lengths, device=inputs.device)
        positions = torch.arange(max_len, device=inputs.device)
        mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).to(attentions.dtype)

        # Zero the padded positions, then renormalise each row to sum to one.
        masked = attentions * mask
        attentions = masked / masked.sum(dim=-1, keepdim=True)

        # Weighted sum over time gives one fixed-size vector per sequence.
        representations = (inputs * attentions.unsqueeze(-1)).sum(dim=1)
        return representations, attentions
class MyLSTM(nn.Module):
    """LSTM encoder followed by attention pooling, re-expanded over time.

    Args:
        input_size: Feature size of each input time step.
        hidden_size: Hidden size of the LSTM (per direction).
        num_layers: Number of stacked LSTM layers.
        batch_first: Forwarded to ``nn.LSTM`` and ``Attention``. ``forward``
            itself always packs and unpacks with ``batch_first=True``, so
            ``x`` must be laid out as ``(batch_size, seq_len, input_size)``.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability, passed to ``nn.LSTM`` and to the
            (currently unused) ``self.dropout`` layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, batch_first, bidirectional, dropout):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.lstm1 = nn.LSTM(input_size=input_size,
                             hidden_size=hidden_size,
                             num_layers=num_layers,
                             batch_first=batch_first,
                             bidirectional=bidirectional,
                             dropout=dropout)
        # The LSTM output width is hidden_size per direction; size the
        # attention accordingly instead of always assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.atten1 = Attention(hidden_size * num_directions, batch_first=batch_first)

    def forward(self, x, x_len):
        """Encode ``x`` and return an attention-scaled tensor over time.

        Args:
            x: Padded batch, ``(batch_size, seq_len, input_size)``.
            x_len: Valid length of every sequence in the batch.

        Returns:
            Tensor of shape ``(batch_size, max(x_len), lstm_output_size)``.
        """
        # pack_padded_sequence needs the lengths on the CPU.
        if torch.is_tensor(x_len):
            x_len = x_len.cpu()
        # enforce_sorted=False accepts batches in any order; the original
        # order is restored by pad_packed_sequence below.
        packed = nn.utils.rnn.pack_padded_sequence(
            x, x_len, batch_first=True, enforce_sorted=False
        )
        out1, _ = self.lstm1(packed)
        x, lengths = nn.utils.rnn.pad_packed_sequence(out1, batch_first=True)
        x, att1 = self.atten1(x, lengths)  # skip connect
        # Batched outer product of the pooled vector and the attention
        # weights: (batch, features, 1) @ (batch, 1, time).
        # NOTE(review): the result is rank-1 per sample -- every time step is
        # the same pooled vector scaled by that step's attention weight, so
        # per-token information from the LSTM is no longer present here.
        tmp1 = torch.bmm(x.unsqueeze(2), att1.unsqueeze(1))
        tmpp1 = tmp1.transpose(1, 2)
        return tmpp1
```

What I get after the attention are two tensors of shape `(batch_size, n_lstm_unit)` and `(batch_size, sentence_len)`, respectively. What I actually need is a single tensor of shape `(batch_size, sentence_len, n_lstm_unit)`.

As suggested by a forum user, what I could do is something like this:

```
# Shapes as stated in the surrounding text:
#   x: (batch_size, n_lstm_unit), att1: (batch_size, sentence_len)
# Batched outer product:
#   (batch_size, n_lstm_unit, 1) @ (batch_size, 1, sentence_len)
#   -> (batch_size, n_lstm_unit, sentence_len)
tmp1 = torch.bmm(x.unsqueeze(2), att1.unsqueeze(1))
# Swap the last two axes -> (batch_size, sentence_len, n_lstm_unit)
tmpp1 = tmp1.transpose(1, 2)
```

The problem is that, done this way, the performance of my parser seems to deteriorate once the attention is inserted.

I wanted to know whether the performance hit could be caused by this `torch.bmm(x.unsqueeze(2), att1.unsqueeze(1))` operation, which I use to get a single tensor of that shape, or whether it is a problem with the attention implementation itself. Thanks a lot, everyone.