I have this simple class but dW is None, even when the parameter is still being updated. After calling x.require_grad_(True), the hook is printing dW. This behavior is confusing

```
import torch.nn as nn
import torch.nn.functional as F
import torch
def hook_fn(module, grad_input, grad_output):
print(f"{module} has dW {grad_input[1]} and scaler value {module.scaler}")
def factor_balance(mid_blksz, out_blksz):
total = mid_blksz * out_blksz
class Scaler(nn.Module):
def __init__(self, out_features):
super().__init__()
self.scaler = nn.Parameter(torch.zeros(1))
def forward(self, x):
# x.requires_grad_(True)
x = self.scaler * x
# x = F.layer_norm(x, x.shape[1:])
# layernorm to avoid vanishing gradient
return x
x = torch.ones(100, 100, dtype=torch.float32)
y = torch.full((100, 100), 2, dtype=torch.float32)
model = Scaler(100)
model.register_backward_hook(hook_fn)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
for i in range(100):
loss = F.mse_loss(model(x), y)
loss.backward()
optimizer.step()
```

Thanks in advance!