```
import torch
import torch.nn as nn
# Leaf tensors: x is the "parameter" being optimized, and w is the learnable
# step size of the hand-written optimizer (so updates are differentiable w.r.t. w).
x = torch.ones([1], requires_grad=True)
w = torch.tensor([0.2], requires_grad=True)
print('x====: {}'.format(x))
print('w====: {}'.format(w))
def f(x):
    """Quadratic loss: the sum of element-wise squares of ``x``.

    Moves ``x`` to the GPU only when one is available; the original
    unconditional ``x.cuda()`` crashes on CPU-only machines.
    """
    if torch.cuda.is_available():
        x = x.cuda()
    return torch.pow(x, 2).sum()
def SGD(grad, lr=0.2):
    """Plain gradient-descent step: return the update ``-lr * grad``."""
    scaled = lr * grad
    return -scaled
def optimizer(grad):
    """Learned-optimizer step: the update is ``-w * grad``.

    ``w`` is the global learnable step size, so the returned update stays
    differentiable with respect to ``w``.
    """
    return -w * grad
# Unroll two inner optimization steps on x, then backprop the accumulated
# loss through the unrolled graph to obtain a gradient for w.
sum_losses = 0
for step in range(2):
    loss = f(x)
    sum_losses += loss
    # retain_graph=True: the same graph is traversed again below by
    # sum_losses.backward(), so it must not be freed here.
    loss.backward(torch.ones_like(loss), retain_graph=True)
    print('x.grad: {}'.format(x.grad))
    print('w1.grad: {}'.format(w.grad))
    update = optimizer(x.grad)
    # Rebinding makes x a non-leaf tensor; .grad stays None unless retained.
    x = x + update
    print('x-:{}'.format(x))
    print('x-.grad: {}'.format(x.grad))
    x.retain_grad()
    update.retain_grad()
sum_losses.backward()
print('w.grad: {}'.format(w.grad))
w_update = SGD(w.grad, lr=0.1)
w = w + w_update
print('w====: {}'.format(w))
```

PyTorch 0.4.1 prints the following:

```
x====: tensor([1.], requires_grad=True)
w====: tensor([0.2000], requires_grad=True)
x.grad: tensor([2.])
w1.grad: None
x-:tensor([0.6000], grad_fn=<ThAddBackward>)
x-.grad: None
x.grad: tensor([1.2000])
w1.grad: tensor([-3.8400])
x-:tensor([0.3600], grad_fn=<ThAddBackward>)
x-.grad: None
w.grad: tensor([-7.6800])
w====: tensor([0.9680], grad_fn=<ThAddBackward>)
```

PyTorch 0.4.0 prints the following:

```
x====: tensor([ 1.])
w====: tensor([ 0.2000])
x.grad: tensor([ 2.])
w1.grad: None
x-:tensor([ 0.6000])
x-.grad: None
x.grad: tensor([ 1.2000])
w1.grad: tensor([-2.4000])
x-:tensor([ 0.3600])
x-.grad: None
w.grad: tensor([-6.2400])
w====: tensor([ 0.8240])
```

I changed the `optimizer` function as follows and the problem was solved, but I am still confused about why the two forms behave differently.

```
def optimizer(grad):
    """Same update written as ``w * (-grad)``.

    Numerically identical to ``-w * grad``; the author reports this form
    avoids the version-dependent ``w.grad`` discrepancy — to be confirmed.
    """
    neg_grad = -grad
    return w * neg_grad
```