Hi,
I am trying to run the following code and it gives self.weight_mask.grad
and self.bias_mask.grad
is None
I have checked many posts related to this grad is None
problem. From un-differentiable function
to is_leaf
and retain_grad()
. None of them works in this case.
class FC_DropConnect(nn.Module):
def __init__(self, dim, mlp_hidden_dim):
super().__init__()
'''
drop connections in MLP by multiply MLP.weight to a binary mask
'''
self.fc = nn.Linear(dim, mlp_hidden_dim)
self.weight_mask = nn.Parameter(torch.rand(self.fc.weight.shape))
torch.nn.init.normal_(self.weight_mask, std=.02)
self.bias_mask = nn.Parameter(torch.rand(self.fc.bias.shape))
torch.nn.init.normal_(self.bias_mask, std=.02)
self.binary = StraightThroughEstimator()
# forward for StraightThroughEstimator()
def forward(self, x):
self.fc.weight.data *= self.binary(self.weight_mask) #weight_mask
self.fc.bias.data *= self.binary(self.bias_mask) # bias_mask
x = self.fc(x)
return x
The Straight Through Estimator will act as a differentiable binary activation layer
.
class STEFunction(Function):
'''
https://discuss.pytorch.org/t/binary-activation-function-with-pytorch/56674/4
'''
@staticmethod
def forward(ctx, input):
return (input > 0).float()
@staticmethod
def backward(ctx, grad_output):
return F.hardtanh(grad_output)
class StraightThroughEstimator(nn.Module):
'''
https://discuss.pytorch.org/t/binary-activation-function-with-pytorch/56674/4
'''
def __init__(self):
super(StraightThroughEstimator, self).__init__()
def forward(self, x):
x = STEFunction.apply(x)
return x
In a minimal code that tests the Straight Through Estimator. The grad
of nn.Paremeter
is not None
##### 1st code: check grad StraightThroughEstimator ##########
x = nn.Parameter(torch.randn(5,3))
estimator = StraightThroughEstimator()
b = estimator(x) # b consists of 0 and 1
y = torch.randn(5,3)
y[1:3] = 1
b.backward(y)
print('y', y)
print('x.requires_grad',x.requires_grad) # True
print('x.grad', x.grad) # not None
##########################
x.requires_grad True
x.grad tensor([[ 0.3226, 1.0000, -0.0620],
[ 1.0000, 1.0000, 1.0000],
[ 1.0000, 1.0000, 1.0000],
[-0.7098, -0.1727, -0.2121],
[-1.0000, -0.7743, -0.1660]])
However, when I check grad
with the class FC_DropConnect
, the self.weight_mask.grad
is always None
no matter what I tried. The fc.weight_mask.grad_fn is None
##### 2nd code: check grad FC_DropConnect ##########
fc = FC_DropConnect(5, 3)
x = torch.randn(5)
y = torch.randn(3)
fc.zero_grad()
x_hat = fc(x)
print('fc.weight_mask.is_leaf', fc.weight_mask.is_leaf) # True
fc.weight_mask.retain_grad()
x_hat.backward(y, retain_graph=True)
print('fc.weight_mask', fc.weight_mask)
print('fc.weight_mask.requires_grad', fc.weight_mask.requires_grad) # True
print('fc.weight_mask.grad', fc.weight_mask.grad) # None
print('fc.weight_mask.grad_fn ', fc.weight_mask.grad_fn) # None
###################################
fc.weight_mask Parameter containing:
tensor([[-0.0390, 0.0063, 0.0014, -0.0140, 0.0211],
[-0.0297, -0.0519, 0.0291, 0.0076, -0.0237],
[-0.0107, 0.0175, -0.0200, 0.0033, 0.0198]], requires_grad=True)
fc.weight_mask.requires_grad True
fc.weight_mask.grad None
fc.weight_mask.grad_fn None
I don’t understand why the 1st code
the nn.Parameters grad is not None and the 2nd code
is None.
Thank you