Hi there,
I’ve seen similar posts, but they don’t address the following issue.
I’m experiencing None gradients in a custom layer I have written, which is used as part of a bigger model. I have checked the gradients of the parameters right after initialization (with PyCharm’s debugger) and also during the forward pass; in both cases .requires_grad is True, but .grad is None.
I’m new to writing custom layers, so I apologize in advance if this question is trivial.
Should more information be needed to answer, please let me know.
The layer I have written:
import numpy as np
import torch
import torch.nn.functional as F
class HardSelfAttentionLayer(torch.nn.Module):
    """Hard self-attention over k "key" slots scored against q "query" slots.

    Each key slot gets a scalar attention weight w3 in (0, 1); slots whose
    weight falls below the learnable threshold ``att_th`` have ALL of their
    entries overwritten with -1.0 before the keys are reshaped back to
    matrices and returned.

    Args:
        Q_shape: ``[q, d1, d2]`` — shape of one query sample (no batch dim).
        K_shape: ``[k, d1, d2]`` — shape of one key sample (no batch dim).
            The trailing two dims must match ``Q_shape``'s.

    NOTE(review): ``w1``, ``w2`` and ``att_th`` influence the output ONLY
    through the boolean mask ``w3 < att_th``.  A comparison has no gradient,
    so autograd never populates ``.grad`` for these parameters — this is the
    likely cause of the reported ``None`` gradients.  (Also note ``.grad`` is
    always ``None`` until ``backward()`` has actually run.)  A soft gate
    (e.g. multiplying K by w3) would restore gradient flow, but that changes
    the layer's semantics, so it is only flagged here, not applied.
    """

    def __init__(self, Q_shape, K_shape):
        super().__init__()
        assert len(Q_shape) == 3, f'Unsupported Q tensor dim {len(Q_shape)}'
        assert len(K_shape) == 3, f'Unsupported K tensor dim {len(K_shape)}'
        assert Q_shape[1:] == K_shape[1:], '1st and 2nd Dimensions must match'
        self.Q_shape = Q_shape
        self.K_shape = K_shape
        self.k = self.K_shape[0]
        self.q = self.Q_shape[0]
        # n: flattened per-slot feature size (d1 * d2)
        self.n = self.Q_shape[1] * self.Q_shape[2]
        # nn.Parameter already sets requires_grad=True on its data, so
        # passing requires_grad=True to randn was redundant (and harmless).
        self.w1 = torch.nn.Parameter(torch.randn(size=[self.n]))
        self.w2 = torch.nn.Parameter(torch.randn(size=[self.q]))
        self.att_th = torch.nn.Parameter(torch.tensor(0.5))
        self.sigmoid = torch.nn.Sigmoid()

    def mats2vecs(self, inp):
        """Flatten each slot matrix to a row vector.

        Assumes inp is (BS, d1, m, d2) and returns (BS, m, d1*d2), per the
        author's shape comments — TODO confirm against callers.
        """
        return inp.transpose(1, 2).flatten(start_dim=2, end_dim=3)

    def vecs2mats(self, inp):
        """Inverse of :meth:`mats2vecs` for K: (BS, k, d1*d2) -> (BS, d1, k, d2)."""
        return torch.stack(inp.split(self.K_shape[1], dim=2), dim=1)

    def forward(self, Q, K):
        """Accept query/key tensors, return the hard-attended key tensor.

        Returns a tensor with the same layout as K in which every key slot
        whose attention weight is below ``att_th`` is filled with -1.0.
        """
        # Flatten the 4-D inputs to batches of slot vectors.
        Q = self.mats2vecs(Q)  # (BS, q, n)
        K = self.mats2vecs(K)  # (BS, k, n)
        # D: diagonal matrix carrying the learned per-feature weights w1.
        D = self.w1.diag_embed()  # (n, n)
        # Bilinear similarity Q * D * K^T, scaled by sqrt(n).
        QDK = Q.matmul(D).matmul(K.transpose(1, 2))  # (BS, q, k)
        S = QDK.div(np.sqrt(self.n))  # (BS, q, k)
        # ReLU activation (the original comment said "softmax" — it is ReLU).
        act_S = F.relu(S)  # (BS, q, k)
        # Importance of each key slot: queries pooled by the learned w2.
        I = (act_S.transpose(1, 2)).matmul(self.w2).div(np.sqrt(self.q))  # (BS, k)
        # Sigmoid squashes importances to (0, 1) attention weights.
        w3 = self.sigmoid(I).reshape(-1, self.k, 1)  # (BS, k, 1)
        # BUG FIX: the original in-place assignment
        #     K[torch.broadcast_tensors(w3 < self.att_th, K)[0]] = -1.0
        # mutated K, whose values matmul's backward still needs (K feeds the
        # QDK product above), so backward() raises "a variable needed for
        # gradient computation has been modified by an inplace operation".
        # masked_fill is out-of-place and forward-identical: the (BS, k, 1)
        # mask broadcasts across the last dimension.
        K = K.masked_fill(w3 < self.att_th, -1.0)  # (BS, k, n)
        return self.vecs2mats(K)