# Backward got grad of None

Hello, I am using `backward()` to compute the gradient; my demo code is as follows:

``````
import torch

def angle2matrix(angles, device=torch.device('cpu')):
    angles = angles / 180 * 3.1415926
    x = angles[0]
    y = angles[1]
    z = angles[2]

    # x
    Rx = torch.tensor([[1, 0, 0],
                       [0, torch.cos(x), -torch.sin(x)],
                       [0, torch.sin(x), torch.cos(x)]]).to(device)
    # y
    Ry = torch.tensor([[torch.cos(y), 0, torch.sin(y)],
                       [0, 1, 0],
                       [-torch.sin(y), 0, torch.cos(y)]]).to(device)
    # z
    Rz = torch.tensor([[torch.cos(z), -torch.sin(z), 0],
                       [torch.sin(z), torch.cos(z), 0],
                       [0, 0, 1]]).to(device)

    R = Rz.mm(Ry.mm(Rx))
    return R

x = torch.tensor([10., 20., 30.], requires_grad=True)  # example input; the definition was missing from the post
y = torch.pow(angle2matrix(x), 2)
y.sum().backward()
``````

Then I got the following error:

``````
Traceback (most recent call last):
  File "app/transform.py", line 110, in <module>
    y.sum().backward()
  File "/root/anaconda3/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
  File "/root/anaconda3/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
``````

Then I changed `angle2matrix` as follows:

``````
def angle2matrix(angles, device=torch.device('cpu')):
    angles = angles / 180 * 3.1415926
    x = angles[0]
    y = angles[1]
    z = angles[2]

    # x (requires_grad=True added -- this makes the RuntimeError go away)
    Rx = torch.tensor([[1, 0, 0],
                       [0, torch.cos(x), -torch.sin(x)],
                       [0, torch.sin(x), torch.cos(x)]], requires_grad=True).to(device)
    # y
    Ry = torch.tensor([[torch.cos(y), 0, torch.sin(y)],
                       [0, 1, 0],
                       [-torch.sin(y), 0, torch.cos(y)]], requires_grad=True).to(device)
    # z
    Rz = torch.tensor([[torch.cos(z), -torch.sin(z), 0],
                       [torch.sin(z), torch.cos(z), 0],
                       [0, 0, 1]], requires_grad=True).to(device)

    R = Rz.mm(Ry.mm(Rx))
    return R

y = torch.pow(angle2matrix(x), 2)
y.sum().backward()
``````

but this time the grad of `x` is `None`.

Looking forward to your help!

``````
x = torch.tensor([1., 2.], requires_grad=True)
y = torch.tensor([x[0], x[1]], requires_grad=True)  # torch.tensor copies the values into a new leaf
y.sum().backward()
``````

Now when you backprop through `y`, `y` is a leaf (because `torch.tensor` creates a new leaf tensor), so there is no path to propagate the gradients back to `x`; that is why `x.grad` is `None`.
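You can verify the cut directly; a quick check (not part of the original snippet):

``````
print(y.grad_fn)  # None -- y is a leaf, the graph back to x was cut
print(y.is_leaf)  # True
``````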

One way to solve this problem is to do something like the following:

``````
x = torch.tensor([1., 2.], requires_grad=True)
y = [x[0], x[1]]
(y[0] + y[1]).backward()

# Now gradients will be propagated to x
``````

I don’t know if there is an easier way, but you need to build Rx, Ry, Rz from lists of tensor entries and then work from there for your code to work; see the sketch below.
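For reference, here is one way to rebuild `angle2matrix` along those lines with `torch.stack`, so the graph back to `angles` is preserved. This is only a sketch, not code from the original post (the input values are made up):

``````
import torch

def angle2matrix(angles, device=torch.device('cpu')):
    angles = angles / 180 * 3.1415926
    x, y, z = angles[0], angles[1], angles[2]
    one = torch.ones_like(x)
    zero = torch.zeros_like(x)

    # Rows are stacked from 0-dim tensors that keep their grad_fn,
    # so the rotation matrices stay connected to `angles`.
    Rx = torch.stack([torch.stack([one, zero, zero]),
                      torch.stack([zero, torch.cos(x), -torch.sin(x)]),
                      torch.stack([zero, torch.sin(x), torch.cos(x)])]).to(device)
    Ry = torch.stack([torch.stack([torch.cos(y), zero, torch.sin(y)]),
                      torch.stack([zero, one, zero]),
                      torch.stack([-torch.sin(y), zero, torch.cos(y)])]).to(device)
    Rz = torch.stack([torch.stack([torch.cos(z), -torch.sin(z), zero]),
                      torch.stack([torch.sin(z), torch.cos(z), zero]),
                      torch.stack([zero, zero, one])]).to(device)
    return Rz.mm(Ry.mm(Rx))

angles = torch.tensor([10., 20., 30.], requires_grad=True)  # made-up example input
out = torch.pow(angle2matrix(angles), 2)
out.sum().backward()
print(angles.grad)  # now populated instead of None
``````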

I’m facing a similar issue… I’m basically trying to pass two parameters, one for each of two matrices, then calculate the loss as the MSE between `y` and the predicted output, and backpropagate to the parameters. This is my code:

``````
import torch

def get_device(gpu_no):
    if torch.cuda.is_available():
        return torch.device('cuda', gpu_no)
    else:
        return torch.device('cpu')

device = get_device(0)

x_gate = torch.tensor([[1., 0.], [0., 1.]]).to(device)
y_gate = torch.tensor(([[0, -1j], [1j, 0]])).to(device)

def rx(theta):
    # co = torch.cos(theta / 2)
    # si = torch.sin(theta / 2)
    Rx_gate = torch.exp(-1j * (theta / 2) * x_gate).to(device).to(torch.cfloat).requires_grad_()
    return Rx_gate

def ry(theta):
    # co = torch.cos(theta / 2)
    # si = torch.sin(theta / 2)
    Ry_gate = torch.exp(-1j * (theta / 2) * y_gate).to(device).to(torch.cfloat).requires_grad_()
    return Ry_gate

x = torch.tensor([1., 0.]).to(device).to(torch.cfloat)
y = torch.tensor([0., 1.]).to(device).to(torch.cfloat)

def pred(params):
    out = rx(params[0]) @ x
    out = ry(params[1]) @ out
    return out

params = torch.tensor(([[0.011], [0.012]]), requires_grad=True).to(device).to(torch.cfloat)

print("params        :", params)
print("prediction    :", pred(params))

loss = torch.pow((y - pred(params)), 2).sum()
print("loss          :", loss)

loss.backward()
``````

The output I get is:

``````
params        : tensor([[0.0110+0.j],
prediction    : tensor([1.9940-0.0055j, 2.0060-0.0055j], device='cuda:0',
``````

I tried defining `rx()` and `ry()` in different ways, like this:

``````
def rx(theta):
    co = torch.cos(theta / 2)
    si = torch.sin(theta / 2)
    Rx_gate = torch.stack([torch.cat([co, -si], dim=-1),
                           torch.cat([si, co], dim=-1)])  # second row assumed; it was cut off in the post
    # Rx_gate = torch.exp(-1j * (theta / 2) * x_gate).to(device).to(torch.cfloat).requires_grad_()
    return Rx_gate

def ry(theta):
    co = torch.cos(theta / 2)
    si = torch.sin(theta / 2)
    Ry_gate = torch.stack([torch.cat([co, -si]),
                           torch.cat([si, co])])
    # Ry_gate = torch.exp(-1j * (theta / 2) * y_gate).to(device).to(torch.cfloat).requires_grad_()
    return Ry_gate
``````

and like this:

``````
def rx(theta):
    # co = torch.cos(theta / 2)
    # si = torch.sin(theta / 2)
    # Rx_gate = torch.stack([torch.cat([co, -si], dim=-1),
    # Rx_gate = torch.exp(-1j * (theta / 2) * x_gate).to(device).to(torch.cfloat).requires_grad_()
    Rx_gate = torch.tensor(([[torch.cos(theta / 2), -torch.sin(theta / 2)],
                             [torch.sin(theta / 2), torch.cos(theta / 2)]]), requires_grad=True).to(device).to(torch.cfloat)
    return Rx_gate

def ry(theta):
    # co = torch.cos(theta / 2)
    # si = torch.sin(theta / 2)
    # Ry_gate = torch.stack([torch.cat([co, -si]),
    # Ry_gate = torch.exp(-1j * (theta / 2) * y_gate).to(device).to(torch.cfloat).requires_grad_()
    Ry_gate = torch.tensor(([[torch.cos(theta / 2), -torch.sin(theta / 2)],
                             [torch.sin(theta / 2), torch.cos(theta / 2)]]), requires_grad=True).to(device).to(torch.cfloat)
    return Ry_gate
``````

All these give the same result. Can someone please explain why I’m not getting the grad of `params`, even though the output shows a `grad_fn`?

``````
params = torch.tensor(([[0.011], [0.012]]), requires_grad=True).to(device).to(torch.cfloat)
``````

will create a non-leaf tensor, since you are calling `to()` on it, which is differentiable.
Specify the `device` and `dtype` in the tensor constructor or create a new leaf tensor via `a = params.detach().requires_grad_()`.
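A minimal sketch of both options (reusing the `device` from the code above):

``````
# option 1: create the leaf directly with the right device and dtype
params = torch.tensor([[0.011], [0.012]], dtype=torch.cfloat, device=device,
                      requires_grad=True)
print(params.is_leaf)  # True -- grads will now be accumulated in params.grad

# option 2: detach the non-leaf result and re-enable grad on the copy
params = params.detach().requires_grad_()
``````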

I don’t know how you are checking the gradient of the loss itself, but by default it won’t be set, since you are passing the gradient for the loss (implicitly as `1.`, or explicitly) into the `backward()` call, and grads of non-leaf tensors are not retained by default.
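If you do want the loss’s own gradient populated, you can retain it; a tiny real-valued example (not from this thread):

``````
import torch

a = torch.tensor(2.0, requires_grad=True)
loss = a * 3
loss.retain_grad()   # keep the grad of this non-leaf tensor
loss.backward()
print(loss.grad)     # tensor(1.) -- the implicit gradient passed to backward()
print(a.grad)        # tensor(3.)
``````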