I have a simple model for text classification: an RNN followed by an attention layer that computes a weighted average of the RNN's hidden states. I sort each batch by length and use pack_padded_sequence so that the padded timesteps are never computed. The model works, but I also want to apply masking to the attention scores/weights.
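For context, this is roughly how I prepare a batch and run the RNN (a standalone sketch with dummy tensors and a GRU, not my actual pipeline):

```python
# Rough sketch of the setup: dummy data, batch already sorted by length (descending).
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

batch_size, max_len, emb_dim, hidden_size = 3, 6, 8, 16
lengths = torch.tensor([6, 4, 2])                      # true lengths per example
embedded = torch.randn(batch_size, max_len, emb_dim)   # padded, embedded batch

rnn = nn.GRU(emb_dim, hidden_size, batch_first=True)
packed = pack_padded_sequence(embedded, lengths, batch_first=True)
outputs, hidden = rnn(packed)   # outputs is a PackedSequence; this goes into the attention layer
```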
Here is my layer:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter, init
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence


class SelfAttention(nn.Module):
    def __init__(self, hidden_size, batch_first=False):
        super(SelfAttention, self).__init__()

        self.hidden_size = hidden_size
        self.batch_first = batch_first

        self.att_weights = Parameter(torch.Tensor(1, hidden_size), requires_grad=True)
        init.xavier_uniform(self.att_weights.data)

    def get_mask(self):
        pass

    def forward(self, inputs):
        if isinstance(inputs, PackedSequence):
            # unpack output
            inputs, lengths = pad_packed_sequence(inputs, batch_first=self.batch_first)

        if self.batch_first:
            batch_size, max_len = inputs.size()[:2]
        else:
            max_len, batch_size = inputs.size()[:2]

        # att = torch.mul(inputs, self.att_weights.expand_as(inputs))
        # att = att.sum(-1)
        weights = torch.bmm(inputs,
                            self.att_weights           # (1, hidden_size)
                            .permute(1, 0)             # (hidden_size, 1)
                            .unsqueeze(0)              # (1, hidden_size, 1)
                            .repeat(batch_size, 1, 1)  # (batch_size, hidden_size, 1)
                            )

        attentions = F.softmax(F.relu(weights.squeeze()))

        # apply weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()

        return representations, attentions
```
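Continuing the dummy setup above, this is roughly how I call the layer on the packed RNN output (placeholder names, just for illustration):

```python
# Hypothetical usage sketch, reusing `hidden_size` and `outputs` from the snippet above.
attention = SelfAttention(hidden_size, batch_first=True)
representations, attentions = attention(outputs)
# representations: (batch_size, hidden_size) sentence vectors
# attentions:      (batch_size, max_len) attention weights, not masked yet
```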
I tried adding the masking step below, but it fails with this error:

```
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
```
```python
        ...
        attentions = F.softmax(F.relu(weights.squeeze()))

        # apply masking based on the sentence lengths
        for i, l in enumerate(lengths[1:], 1):  # skip the first sentence
            attentions[i, l:] = 0

        # apply weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))
        ...
```
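As far as I understand, the problem is that softmax saves its output for the backward pass, so writing into attentions in place invalidates what autograd needs. A minimal standalone repro (dummy tensors, unrelated to the model) hits the same error:

```python
# Minimal illustration of the in-place restriction (hypothetical tensors, not the model).
import torch
import torch.nn.functional as F

x = torch.randn(2, 5, requires_grad=True)
att = F.softmax(x, dim=-1)
att[1, 3:] = 0           # in-place write on a tensor that softmax's backward still needs
att.sum().backward()     # RuntimeError: ... modified by an inplace operation
```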
How can I mask the attention weights?