A tensor produced by an operation on tensors taken from a running PyTorch model has no grad_fn, so backward() cannot run on it. I call .clone().detach() on the input and cast tensors of the compute_grad function shown below, so they should be disconnected from the model's graph. Nevertheless, the grad_fn of y is None and dy.sum().backward() fails.
Here is my code.
# All necessary imports at the beginning
import torch
import torchvision
import torch.nn as nn
from torch import Tensor
from torch.utils.data import DataLoader
# compute_grad is a custom gradient computation function:
#   'input' is the input of a specific layer in a PyTorch model,
#   'layer' is that layer itself (always nn.Conv2d or nn.Linear in my case),
#   'cast'  is the ratio grad_output / output, where grad_output is the partial
#   derivative of the loss w.r.t. the layer's output.
def compute_grad(input: Tensor, layer: nn.Linear, cast: Tensor):
    x = input.clone().detach()
    cast = cast.clone().detach()
    x.requires_grad = True
    # Create an identical linear layer that is detached from the original model's graph
    func = nn.Linear(in_features=layer.in_features, out_features=layer.out_features, bias=True)
    # Copy the original layer's parameters
    func.load_state_dict(layer.state_dict())
    y = func(x)
    # The anomaly: y.grad_fn is None, yet y.is_leaf is True
    print(y.grad_fn)
    # out: None
    print(y.is_leaf)
    # out: True
    dy = y * cast
    dy.sum().backward()
    dw = func.weight.grad
    print(dw)
    return dw
def register_for_hook(model):
    for _, i in model.named_modules():
        if isinstance(i, nn.Linear):
            i.register_forward_hook(forward_hook)
            i.register_backward_hook(backward_hook)
def forward_hook(module, input, output):
    # The forward_hook records the inputs, outputs and layers of all linear
    # layers while the model runs. The body is omitted for brevity
    # (a hypothetical sketch follows below).
    return
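# Purely for illustration, a minimal sketch of what forward_hook does
# (`records` is a hypothetical name, not my real code; my real hook also
# encrypts the stored values):
records = {}
def forward_hook_sketch(module, input, output):
    # input is a tuple for module hooks; keep this layer's input and output
    # so the backward hook can reuse them later.
    records[module] = (input[0].detach(), output.detach())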
def backward_hook(module, grad_in, grad_out):
    # The backward_hook:
    # 1. Retrieves the input, output and layer recorded in the forward_hook.
    # 2. Decrypts the input and output (they were stored encrypted).
    # 3. Passes the input, layer and grad_out / output to compute_grad:
    #    new_dw = compute_grad(input=input, layer=module, cast=grad_out/output)
    # 4. Returns new_dw to replace the original weight gradient.
    # The body is omitted for brevity (a hypothetical sketch follows below).
    return
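# Likewise, a hypothetical sketch of backward_hook (decryption omitted;
# `records` comes from the sketch above, and grad_out is a tuple):
def backward_hook_sketch(module, grad_in, grad_out):
    saved_input, saved_output = records[module]
    new_dw = compute_grad(input=saved_input, layer=module,
                          cast=grad_out[0] / saved_output)
    # new_dw would then replace the original weight gradient.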
Epoch=3
Batch_Size=50
LR=0.0001
num_classes=10
net = torchvision.models.vgg16(pretrained=True)
# register hook for the net
register_for_hook(net)
# datasets
trainData = torchvision.datasets.MNIST(
    root="/home/lpx/codes/hook/data",
    train=True,
    transform=torchvision.transforms.ToTensor(),
    download=True)
train_loader = DataLoader(dataset=trainData, batch_size=Batch_Size, shuffle=True)
test_data = torchvision.datasets.MNIST(root="/home/lpx/codes/hook/data", train=False, download=True)
def Train(model):
    # Run the regular training process; the loss goes through the backward
    # pass and triggers the backward hooks. The body is omitted for brevity
    # (a hypothetical sketch follows below).
    # loss.backward()
    return
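# A hypothetical sketch of the training loop (criterion/optimizer are
# placeholder choices, not my real code); the loss.backward() call is what
# fires the backward hooks:
def train_sketch(model):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    for epoch in range(Epoch):
        for images, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()  # triggers backward_hook on every Linear layer
            optimizer.step()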
Train(net)
The error message is:
'''
Traceback (most recent call last):
File "vgg.py", line 159, in <module>
Train(net)
File "vgg.py", line 158, in Train
loss.backward()
File "/home/lpx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 195, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/lpx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/autograd/__init__.py", line 97, in backward
Variable._execution_engine.run_backward(
File "vgg.py", line 132, in backward_hook
new_dw = compute_grad(input, layer, cast)
File "vgg.py", line 114, in self_grad
dy.sum().backward()
File "/home/lpx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/tensor.py", line 195, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/lpx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/autograd/__init__.py", line 97, in backward
Variable._execution_engine.run_backward(
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
'''
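As far as I can tell, the failure reduces to running a fresh forward pass inside a backward hook. The following minimal sketch (toy shapes and hypothetical names, distilled from the code above) isolates just that pattern; printing torch.is_grad_enabled() inside the hook should show whether grad mode is even active at that point:
import torch
import torch.nn as nn
lin = nn.Linear(4, 2)
def hook(module, grad_in, grad_out):
    # a fresh forward pass inside the backward hook, like compute_grad does
    x = torch.randn(1, 4, requires_grad=True)
    y = module(x)
    print(torch.is_grad_enabled(), y.grad_fn)
lin.register_backward_hook(hook)
lin(torch.randn(1, 4)).sum().backward()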
By contrast, if the input, layer and output are not taken from a running PyTorch model but are created by the user instead, the same code runs without error, as you can see below:
import torch
from torch import nn, Tensor
def compute_grad(input: Tensor, layer, cast):
    x = input.clone().detach()
    cast = cast.clone().detach()
    x.requires_grad = True
    func = nn.Linear(in_features=layer.in_features, out_features=layer.out_features, bias=True)
    func.load_state_dict(layer.state_dict())
    y = func(x)
    print(y.grad_fn)
    # out: <AddmmBackward object at 0x7f5028a50fa0>
    y = y.requires_grad_()
    dy = y * cast
    dy = dy.requires_grad_()
    dy.sum().backward()
    dw = func.weight.grad.data
    return dw
input = torch.rand([50, 512], requires_grad=True)
m = nn.Linear(in_features=512, out_features=10, bias=True)
output = m(input)
grad_out = torch.randn_like(output)
# Simulate a real model's backward pass
output.sum().backward()
h = grad_out / output
new_dw = compute_grad(input, m, h)
print(new_dw)
# out: Tensor
The background of this problem is that we want to encrypt intermediate values while running a PyTorch model, which I think is very valuable. I have tried many PyTorch versions and hit the same problem every time, so I would be very grateful for any help with this suspected bug.