No gradient flows into the bias

I wrote an EMA-style gating layer for the attention values (v), but the gradient does not seem to pass through the bias and I don’t understand what the problem is — the weights receive gradients just fine.
Can someone tell me what the problem is?

class ValueResidualLayer(nn.Module):
    """EMA-style gated blend of per-head attention values with a residual stream.

    A sigmoid gate (one scalar per head, per token) is computed from the token
    stream ``x``; the output is ``gate * v + (1 - gate) * res``.
    """

    def __init__(self, dim, n_heads):
        super().__init__()
        # One scalar gate per head, predicted from the model-dim input.
        self.to_gates = nn.Linear(dim, n_heads, bias=True)
        self.act = nn.Sigmoid()
        self.h = n_heads

    def forward(self, x, v, res):
        """Blend ``v`` and ``res`` with learned per-head gates.

        Args:
            x: (batch, seq, dim) token stream driving the gates.
            v: per-head values — assumed broadcastable against
               (batch, heads, seq, 1); TODO confirm caller's layout.
            res: residual values, same shape expectations as ``v``.

        Returns:
            ``gate * v + (1 - gate) * res`` with gates in (0, 1).
        """
        gates = self.act(self.to_gates(x))  # (b, n, h)
        # (b, n, h) -> (b, h, n, 1). Equivalent to the einops
        # rearrange('b n (h d) -> b h n d', h=self.h) of the original —
        # the linear layer emits exactly n_heads features, so d == 1 and
        # the grouping is an identity; plain torch ops drop the einops
        # dependency.
        gates = gates.transpose(1, 2).unsqueeze(-1)

        out = v * gates + (1 - gates) * res

        return out

I don’t know which inputs you are using, but testing with arbitrary random tensors works for me:

# Quick repro: random inputs, one backward pass, then inspect the grads.
layer = ValueResidualLayer(10, 10)
tokens = torch.randn(1, 10, 10)
values = torch.randn_like(tokens)
residual = torch.randn_like(tokens)

layer(tokens, values, residual).mean().backward()

# Every registered parameter should now carry a gradient.
print({name: param.grad for name, param in layer.named_parameters()})
# {'to_gates.weight': tensor([[ 3.3208e-03,  1.7235e-03, -9.4052e-03,  2.4368e-03, -2.5074e-03,
#          -1.6132e-03,  3.9941e-03, -5.1434e-03,  3.4118e-03, -3.5914e-03],
#         [ 3.3302e-03,  2.2675e-03, -1.0479e-02,  2.4576e-03, -1.9038e-03,
#          -5.3038e-04,  3.9044e-03, -4.5693e-03,  4.7626e-03, -3.5518e-03],
#         [ 2.9834e-03,  2.3057e-03, -9.5416e-03,  2.0493e-03, -2.2837e-03,
#          -7.6640e-04,  3.9544e-03, -4.7697e-03,  4.6088e-03, -3.2332e-03],
#         [ 2.1757e-03,  2.4342e-03, -8.1161e-03,  1.5353e-03, -2.5703e-03,
#          -2.7348e-04,  3.4181e-03, -4.1390e-03,  5.2883e-03, -2.5688e-03],
#         [ 2.2552e-03,  2.4213e-03, -8.6571e-03,  1.8172e-03, -2.0862e-03,
#          -4.0808e-04,  3.5837e-03, -4.1661e-03,  5.4443e-03, -2.3937e-03],
#         [ 3.3070e-03,  2.4589e-03, -1.0083e-02,  2.0671e-03, -2.2453e-03,
#          -1.7378e-04,  4.0691e-03, -4.6217e-03,  4.6735e-03, -3.8617e-03],
#         [ 2.7600e-03,  3.3665e-03, -9.4790e-03,  2.0118e-03, -3.1184e-03,
#           1.3336e-04,  4.0494e-03, -4.4689e-03,  4.7932e-03, -4.1376e-03],
#         [ 3.3488e-03,  2.2973e-03, -9.7965e-03,  2.0508e-03, -2.2581e-03,
#          -5.0712e-04,  4.0553e-03, -4.8201e-03,  4.1354e-03, -3.8284e-03],
#         [ 3.3092e-03,  2.2325e-03, -9.7376e-03,  1.8958e-03, -1.5461e-03,
#           1.4049e-04,  4.0094e-03, -4.3248e-03,  4.7798e-03, -3.5617e-03],
#         [ 3.4868e-03,  1.9710e-03, -9.9802e-03,  2.1578e-03, -8.9903e-04,
#           8.1139e-05,  3.9524e-03, -3.9340e-03,  4.4010e-03, -3.4717e-03]]), 'to_gates.bias': tensor([0.0014, 0.0006, 0.0014, 0.0010, 0.0015, 0.0006, 0.0005, 0.0011, 0.0007,
#         0.0002])}

and it shows valid gradients for both the weight and the bias.