NaN values when I pass key_padding_mask to nn.MultiheadAttention

My input data looks like this:

tensor([[[ 18.6752, -11.8400,   5.1000,   1.4900,  -3.6898,   0.5981,   1.6500,
            2.0000],
         [ 20.6315,   3.3100,   3.5000,   1.8200, -18.5913,  -0.1733,  -3.1300,
            2.0000],
         [ 20.8761,   6.5100,  10.2500,   2.4100,  -4.9920,   0.0291,   3.1200,
            1.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
            0.0000]]])

where each all-zero row [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000] is padding.
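To make the layout explicit: the tensor above has shape (1, 20, 8), where the first three rows are real sensor readings and the remaining seventeen are padding. A minimal sketch of that padding convention (stand-in values, not the real data):

import torch

x = torch.zeros(1, 20, 8)          # same layout as the dump above
x[0, :3] = 1.0                     # stand-in for the three real sensor rows

pad_rows = (x == 0).all(dim=-1)    # True where an entire 8-value row is zero
print(pad_rows.sum().item())       # 17 of the 20 rows are padding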

My forward pass looks like this

x = sensor_data[i].to('cuda')
mask = (x == 0)                     # Create padding mask
mask = mask.any(dim=2)              # collapse the feature dim: True where a feature is zero
mask = torch.transpose(mask, 0, 1)  # swap to (batch_size, seq_length) for key_padding_mask
x = self.posEmbed(x)
x = self.inputEmbed(x)
encoded_sensor = self.encoder(x, mask)
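As far as I understand the docs, nn.MultiheadAttention with the default batch_first=False takes query/key/value as (seq_length, batch_size, embed_dim) and key_padding_mask as (batch_size, seq_length), with True marking keys to ignore; that is why I transpose the mask. A small shape check with made-up sizes:

import torch

seq_length, batch_size, n_features = 20, 4, 8     # made-up sizes
x = torch.randn(seq_length, batch_size, n_features)

mask = (x == 0).any(dim=2)                         # (seq_length, batch_size)
mask = torch.transpose(mask, 0, 1)                 # (batch_size, seq_length)
print(mask.shape)                                  # torch.Size([4, 20])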

And the encoder layer looks like this

import torch.nn as nn


class EncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, feedforward_dim, dropout_prob=0.1):
        super(EncoderLayer, self).__init__()
        # batch_first defaults to False, so inputs are (seq_length, batch_size, embed_dim)
        self.self_attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout_prob)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, embed_dim),
        )
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, key_padding_mask):
        # x: (seq_length, batch_size, embed_dim)
        # key_padding_mask: (batch_size, seq_length), True = position is padding
        # self-attention
        attn_output, _ = self.self_attention(x, x, x, key_padding_mask=key_padding_mask)
        attn_output = self.dropout(attn_output)
        x = self.layer_norm1(x + attn_output)

        # feed-forward
        ff_output = self.feedforward(x)
        ff_output = self.dropout(ff_output)
        x = self.layer_norm2(x + ff_output)

        return x
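For context, this is roughly how the layer gets used on its own (made-up hyperparameters; the real posEmbed/inputEmbed are left out):

import torch

embed_dim, num_heads, feedforward_dim = 16, 2, 64    # made-up hyperparameters
layer = EncoderLayer(embed_dim, num_heads, feedforward_dim)

x = torch.randn(20, 1, embed_dim)                    # (seq_length=20, batch_size=1, embed_dim)
mask = torch.zeros(1, 20, dtype=torch.bool)          # (batch_size, seq_length)
mask[:, 3:] = True                                   # last 17 positions are padding

out = layer(x, mask)
print(out.shape)                                     # torch.Size([20, 1, 16])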

After the forward pass through the encoder, the positions that correspond to padding come out as NaN, but when I remove the key_padding_mask argument everything works fine. Please help me with this asap, thanks :slight_smile:
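One possibly related fact (I am not sure it is the cause here): if the mask ends up marking every key of some batch element as padding, the softmax inside the attention has no valid entries left for that element, and even a bare nn.MultiheadAttention returns NaN for it. A minimal sketch with made-up sizes:

import torch
import torch.nn as nn

seq_length, batch_size, embed_dim, num_heads = 5, 2, 8, 2   # made-up sizes
mha = nn.MultiheadAttention(embed_dim, num_heads)

x = torch.randn(seq_length, batch_size, embed_dim)          # (seq_length, batch_size, embed_dim)
mask = torch.zeros(batch_size, seq_length, dtype=torch.bool)
mask[1] = True                                              # every key of element 1 is "padding"

out, _ = mha(x, x, x, key_padding_mask=mask)
print(torch.isnan(out[:, 0]).any().item())                  # False: element 0 has real keys
print(torch.isnan(out[:, 1]).any().item())                  # True: fully masked element is NaN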