import torch
import torch.nn as nn

dtype = torch.float
device = torch.device("cpu")  # fixed: original had curly quotes, a syntax error

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 1, 1, 1, 1

learning_rate = 1e-6


def _const_weight(rows, cols, value):
    """Return a (rows, cols) leaf tensor pinned to `value` (uniform_(a, a) sets every entry to a)."""
    w = torch.randn(rows, cols, device=device, dtype=dtype, requires_grad=True)
    nn.init.uniform_(w, value, value)
    return w


def train(steps=500, verbose=True):
    """Train two tiny one-hidden-unit networks against each other with manual SGD.

    Each network computes a dueling-style head: an advantage term
    tanh(x @ w1) @ w2 centered by its mean, plus a value term x @ w1 @ w3.
    The loss is the squared difference between the two networks' max outputs.

    Args:
        steps: number of gradient-descent iterations.
        verbose: print the loss every 100 steps when True.

    Returns:
        Tuple (w1, w2, w3, w1_, w2_, w3_) of the trained weight tensors.
    """
    # Single deterministic input sample pinned to a large constant.
    x = torch.randn(N, D_in, device=device, dtype=dtype)
    # NOTE(review): with x = 1000 and lr = 1e-6 each update is O(1) in
    # magnitude, so the weights oscillate and diverge quickly — likely the
    # other reason the printed values surprised the author.
    x[0, 0] = 1000

    # "Online" network weights.
    w1 = _const_weight(D_in, H, 0.5)
    w2 = _const_weight(H, D_out, 1.0)
    w3 = _const_weight(H, 1, 1.0)
    # "Target" network weights (same architecture, different init).
    w1_ = _const_weight(D_in, H, 1.0)
    w2_ = _const_weight(H, D_out, 1.0)
    w3_ = _const_weight(H, 1, 1.0)

    weights = (w1, w2, w3, w1_, w2_, w3_)

    for t in range(steps):
        # Forward pass, online network.
        p = torch.tanh(x.mm(w1))
        y_pred = p.mm(w2)
        # Mean-center the advantage term, then add the value term.
        # (With N = D_out = 1 the centered term is identically zero, so
        # no gradient reaches w2 at all — another source of the
        # "my hand calculation differs" confusion.)
        y_pred = y_pred - y_pred.mean() + x.mm(w1).mm(w3)
        best = y_pred.max(1, keepdim=True)[1]
        y0 = y_pred.gather(1, best)

        # Forward pass, target network (identical structure).
        y_pred_ = torch.tanh(x.mm(w1_)).mm(w2_)
        y_pred_ = y_pred_ - y_pred_.mean() + x.mm(w1_).mm(w3_)
        best_ = y_pred_.max(1, keepdim=True)[1]
        y1 = y_pred_.gather(1, best_)

        # Squared error between the two networks' selected outputs.
        loss = (y0 - y1).pow(2).sum()
        if verbose and t % 100 == 99:
            print(t, loss.item())

        # Autograd computes d loss / d w for all six weight tensors.
        loss.backward()

        # Manual SGD step; wrapped in no_grad so the updates are not tracked.
        with torch.no_grad():
            for w in weights:
                w -= learning_rate * w.grad
                # BUG FIX: the original zeroed only w1/w2/w3. The target
                # network's gradients (w1_, w2_, w3_) were never cleared, so
                # .backward() kept ACCUMULATING into them every iteration —
                # the updates grew without bound and the weights became NaN.
                w.grad.zero_()

    return weights


if __name__ == "__main__":
    w1, w2, w3, w1_, w2_, w3_ = train()
    print(w1)
    print(w2)
    print(w3)
    print("tgt")  # fixed: original had curly quotes, a syntax error
    print(w1_)
    print(w2_)
    print(w3_)

I tried to reproduce these results by hand with a calculator (working through the backpropagation manually), but my numbers differ from what this code prints. Since there are two networks here, how exactly are the gradients computed — they never match my calculation? Worse, the weights eventually become NaN. Why does that happen?