My input data looks like this:
tensor([[[ 18.6752, -11.8400,   5.1000,   1.4900,  -3.6898,   0.5981,   1.6500,   2.0000],
         [ 20.6315,   3.3100,   3.5000,   1.8200, -18.5913,  -0.1733,  -3.1300,   2.0000],
         [ 20.8761,   6.5100,  10.2500,   2.4100,  -4.9920,   0.0291,   3.1200,   1.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000]]])
where [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000] is padding.
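(For context, the zero rows are just variable-length sequences padded to a fixed length, the kind of batch you get from torch.nn.utils.rnn.pad_sequence with padding_value=0.0. The snippet below is only an illustration with made-up lengths, not my actual preprocessing.)

import torch
from torch.nn.utils.rnn import pad_sequence

# Two variable-length sequences, 8 features per timestep (made-up data).
seqs = [torch.randn(3, 8), torch.randn(20, 8)]

# Pad to the longest sequence with zeros -> shape (batch, max_len, 8).
batch = pad_sequence(seqs, batch_first=True, padding_value=0.0)
print(batch.shape)   # torch.Size([2, 20, 8])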
My forward pass looks like this:
x = sensor_data[i].to('cuda')
mask = (x == 0)                        # Create padding mask: elementwise zero check
mask = mask.any(dim=2)                 # True wherever any feature in a timestep is zero
mask = torch.transpose(mask, 0, 1)     # swap the first two dimensions
x = self.posEmbed(x)                   # positional embedding
x = self.inputEmbed(x)                 # input embedding
encoded_sensor = self.encoder(x, mask)
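To be concrete about the shapes, here is what the mask construction produces on a stand-in tensor with the same layout as the data above (posEmbed / inputEmbed left out, since they shouldn't affect the mask):

import torch

# Stand-in for sensor_data[i]: batch of 1, 20 timesteps, 8 features,
# with rows 3..19 all zeros (padding), like the printed tensor above.
x = torch.zeros(1, 20, 8)
x[:, :3, :] = torch.randn(1, 3, 8)

mask = (x == 0)                      # (1, 20, 8)  elementwise zero test
mask = mask.any(dim=2)               # (1, 20)     True where any feature is zero
mask = torch.transpose(mask, 0, 1)   # (20, 1)

print(x.shape, mask.shape)           # torch.Size([1, 20, 8]) torch.Size([20, 1])

# For reference, nn.MultiheadAttention with batch_first=False (the default) documents:
#   query / key / value : (seq_len, batch, embed_dim)
#   key_padding_mask    : (batch, seq_len), with True marking key positions to ignore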
And the encoder layer looks like this:
import torch
import torch.nn as nn

class EncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, feedforward_dim, dropout_prob=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout_prob)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, embed_dim),
        )
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, key_padding_mask):
        # x: (seq_length, batch_size, embed_dim)
        # self-attention block with residual connection and layer norm
        attn_output, _ = self.self_attention(x, x, x, key_padding_mask=key_padding_mask)
        attn_output = self.dropout(attn_output)
        x = self.layer_norm1(x + attn_output)
        # feed-forward block with residual connection and layer norm
        ff_output = self.feedforward(x)
        ff_output = self.dropout(ff_output)
        x = self.layer_norm2(x + ff_output)
        return x
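To make this easier to reproduce, here is a self-contained version of the call with the same shapes as my data, using the EncoderLayer above (embed_dim=8, num_heads=2, feedforward_dim=16 are just placeholder values, and I've dropped posEmbed / inputEmbed and the .to('cuda') to keep it short):

import torch
torch.manual_seed(0)

# Same layout as the printed tensor: batch of 1, 20 timesteps, 8 features,
# rows 3..19 all zeros (padding).
x = torch.zeros(1, 20, 8)
x[:, :3, :] = torch.randn(1, 3, 8)

mask = (x == 0).any(dim=2)            # (1, 20)
mask = torch.transpose(mask, 0, 1)    # (20, 1)

layer = EncoderLayer(embed_dim=8, num_heads=2, feedforward_dim=16)
out = layer(x, key_padding_mask=mask)

print(torch.isnan(out).any())         # tensor(True)
print(out[0, 3])                      # one of the padded positions -> all nan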
After the forward pass through the encoder, the positions that correspond to padding become NaN. But when I remove the key_padding_mask argument, everything works fine. Can anyone help me figure out what is going wrong? Thanks.