Scaled_dot_product_attention with padding mask output

Hi,

I tried to use scaled_dot_product_attention to compute attention over a padded sequence. The query, key, value, and padding mask are:

# Query:
tensor([[[[-0.8440,  1.0418,  0.5483, -0.7435],
          [ 0.4904, -0.1895, -0.7027, -0.3302],
          [ 0.7492, -0.6846,  0.0066,  0.7599],
          [ 0.3074, -0.8886,  0.2787,  1.2103],
          [ 0.8353, -1.0342, -0.5520,  0.7324],
          [-0.1721,  0.7180, -0.3956, -1.1090],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000]]]], grad_fn=<ViewBackward0>)
# Key:
tensor([[[[ 0.1548,  0.7600,  0.3218,  0.9137],
          [-1.3435, -0.0584, -1.0111, -0.8258],
          [ 0.5090, -0.1811, -0.2807, -0.5397],
          [ 1.4941, -0.7430,  0.8974,  0.0333],
          [-0.1672, -0.7598, -0.3205, -0.9097],
          [-1.5701,  0.6056, -0.9636, -0.1972],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000]]]], grad_fn=<ViewBackward0>)
# Value:
tensor([[[[ 0.8525,  0.0043, -1.1249,  0.5562],
          [-0.3646, -0.3514,  1.2640, -1.0007],
          [-1.4638,  0.0612,  0.6503,  0.7319],
          [-0.4793,  0.4287, -0.3456,  0.6454],
          [-0.8312, -0.0062,  1.1207, -0.5732],
          [ 0.3769, -0.4384,  0.5523, -0.8017],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000]]]], grad_fn=<ViewBackward0>)
# Padding mask:
tensor([[[[0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf]]]])
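
For reference, the additive mask above was built roughly like this (a minimal sketch; valid_len and seq_len are just the names I use here):

import torch

valid_len, seq_len = 6, 20
padded = torch.arange(seq_len) >= valid_len       # True at the padded key positions
attn_mask = torch.zeros(1, 1, 1, seq_len)
attn_mask.masked_fill_(padded, float("-inf"))     # 0 for real tokens, -inf for padding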

The padding positions use the embedding [0.0000, 0.0000, 0.0000, 0.0000]. I expected the padded positions in the attention output to be zero as well, but instead I got:

# Attention output:
tensor([[[[-1.0690e-01, -2.5148e-01,  4.8336e-01, -3.1793e-01],
          [ 2.8167e-02,  1.4441e-01, -3.6452e-01,  3.3902e-01],
          [ 3.4306e-02, -5.0814e-02,  7.4292e-02, -9.6214e-02],
          [-5.4292e-01,  1.7309e-01,  3.2902e-01,  2.0812e-02],
          [-4.2501e-01,  2.2591e-01,  4.8074e-02,  2.2316e-01],
          [-3.8819e-01, -1.2892e-01,  8.0144e-01, -5.3697e-01],
          [-7.9914e-01,  1.5077e-02,  6.5923e-01,  7.7907e-04],
          [ 1.2565e-01, -1.4613e-01,  1.8411e-01, -2.6724e-01],
          [-6.0949e-01, -9.6750e-02,  6.3809e-01, -8.9613e-02],
          [-5.2208e-01,  1.7143e-02,  2.8566e-01,  1.9183e-01],
          [ 1.3274e-01, -1.4676e-01,  1.8270e-01, -2.7289e-01],
          [-5.5838e-01,  2.3688e-02,  6.7969e-01, -3.0950e-01],
          [-4.8085e-01,  1.9761e-02,  2.1536e-01,  2.3830e-01],
          [ 1.2565e-01, -1.4613e-01,  1.8411e-01, -2.6724e-01],
          [-1.5141e-01, -1.4820e-01,  5.5768e-01, -4.5829e-01],
          [-1.5978e-01,  1.4289e-01, -1.1521e-01,  2.1512e-01],
          [-2.3792e-01,  1.8583e-02, -8.9319e-02,  3.7724e-01],
          [-4.7675e-01, -2.4351e-01,  8.2079e-01, -3.6250e-01],
          [-2.7296e-01, -2.6534e-01,  9.7901e-01, -7.9186e-01],
          [-2.7031e-02, -3.8693e-03,  6.7497e-02, -5.7770e-02]]]],
       grad_fn=<UnsafeViewBackward0>)
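
For context, the call looks roughly like this (a minimal sketch; random tensors with the same shapes as the printouts above stand in for my actual projections):

import torch
import torch.nn.functional as F

B, H, L, E = 1, 1, 20, 4                      # batch, heads, sequence length, head dim
query = torch.randn(B, H, L, E)
key = torch.randn(B, H, L, E)
value = torch.randn(B, H, L, E)

valid_len = 6
attn_mask = torch.zeros(B, 1, 1, L)
attn_mask[..., valid_len:] = float("-inf")    # additive padding mask, as printed above

out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)
print(out.shape)                              # torch.Size([1, 1, 20, 4])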

I read the Python reference implementation in the scaled_dot_product_attention documentation, which contains:

if attn_mask is not None:
    if attn_mask.dtype == torch.bool:
        attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
    else:
        attn_bias += attn_mask

Here attn_bias += attn_mask seems to require attn_bias and attn_mask to have the same shape. However, my attn_mask is torch.Size([1, 1, 1, 20]), while attn_bias is created as torch.zeros(L, S, dtype=query.dtype), which in this case is torch.Size([20, 20]). How can these two be added?
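
To make the shape question concrete, this is what I am comparing (a sketch; I only check what happens when tensors of these two shapes are added out of place):

import torch

L, S = 20, 20
attn_bias = torch.zeros(L, S)                 # torch.Size([20, 20]), as in the reference code
attn_mask = torch.zeros(1, 1, 1, S)           # torch.Size([1, 1, 1, 20]), like my padding mask
attn_mask[..., 6:] = float("-inf")

print((attn_bias + attn_mask).shape)          # torch.Size([1, 1, 20, 20]) after broadcasting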

Thanks!