Hi,

I am trying to run the following code and it gives `self.weight_mask.grad`

and `self.bias_mask.grad`

is `None`

I have checked many posts related to this `grad is None`

problem. From `un-differentiable function`

to `is_leaf`

and `retain_grad()`

. None of them works in this case.

```
class FC_DropConnect(nn.Module):
def __init__(self, dim, mlp_hidden_dim):
super().__init__()
'''
drop connections in MLP by multiply MLP.weight to a binary mask
'''
self.fc = nn.Linear(dim, mlp_hidden_dim)
self.weight_mask = nn.Parameter(torch.rand(self.fc.weight.shape))
torch.nn.init.normal_(self.weight_mask, std=.02)
self.bias_mask = nn.Parameter(torch.rand(self.fc.bias.shape))
torch.nn.init.normal_(self.bias_mask, std=.02)
self.binary = StraightThroughEstimator()
# forward for StraightThroughEstimator()
def forward(self, x):
self.fc.weight.data *= self.binary(self.weight_mask) #weight_mask
self.fc.bias.data *= self.binary(self.bias_mask) # bias_mask
x = self.fc(x)
return x
```

The Straight Through Estimator will act as a differentiable `binary activation layer`

.

```
class STEFunction(Function):
'''
https://discuss.pytorch.org/t/binary-activation-function-with-pytorch/56674/4
'''
@staticmethod
def forward(ctx, input):
return (input > 0).float()
@staticmethod
def backward(ctx, grad_output):
return F.hardtanh(grad_output)
class StraightThroughEstimator(nn.Module):
'''
https://discuss.pytorch.org/t/binary-activation-function-with-pytorch/56674/4
'''
def __init__(self):
super(StraightThroughEstimator, self).__init__()
def forward(self, x):
x = STEFunction.apply(x)
return x
```

In a minimal code that tests the Straight Through Estimator. The `grad`

of `nn.Paremeter`

is `not None`

```
##### 1st code: check grad StraightThroughEstimator ##########
x = nn.Parameter(torch.randn(5,3))
estimator = StraightThroughEstimator()
b = estimator(x) # b consists of 0 and 1
y = torch.randn(5,3)
y[1:3] = 1
b.backward(y)
print('y', y)
print('x.requires_grad',x.requires_grad) # True
print('x.grad', x.grad) # not None
##########################
x.requires_grad True
x.grad tensor([[ 0.3226, 1.0000, -0.0620],
[ 1.0000, 1.0000, 1.0000],
[ 1.0000, 1.0000, 1.0000],
[-0.7098, -0.1727, -0.2121],
[-1.0000, -0.7743, -0.1660]])
```

However, when I check `grad`

with the class `FC_DropConnect`

, the `self.weight_mask.grad`

is always `None`

no matter what I tried. The `fc.weight_mask.grad_fn is None`

```
##### 2nd code: check grad FC_DropConnect ##########
fc = FC_DropConnect(5, 3)
x = torch.randn(5)
y = torch.randn(3)
fc.zero_grad()
x_hat = fc(x)
print('fc.weight_mask.is_leaf', fc.weight_mask.is_leaf) # True
fc.weight_mask.retain_grad()
x_hat.backward(y, retain_graph=True)
print('fc.weight_mask', fc.weight_mask)
print('fc.weight_mask.requires_grad', fc.weight_mask.requires_grad) # True
print('fc.weight_mask.grad', fc.weight_mask.grad) # None
print('fc.weight_mask.grad_fn ', fc.weight_mask.grad_fn) # None
###################################
fc.weight_mask Parameter containing:
tensor([[-0.0390, 0.0063, 0.0014, -0.0140, 0.0211],
[-0.0297, -0.0519, 0.0291, 0.0076, -0.0237],
[-0.0107, 0.0175, -0.0200, 0.0033, 0.0198]], requires_grad=True)
fc.weight_mask.requires_grad True
fc.weight_mask.grad None
fc.weight_mask.grad_fn None
```

I don’t understand why the `1st code`

the nn.Parameters grad is not None and the `2nd code`

is None.

Thank you