Simple test; I really don't get how autograd calculates these gradients:

import torch
import torch.nn as nn

dtype = torch.float
device = torch.device("cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 1, 1, 1, 1

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
x[0, 0] = 1000  # overwrite the random input with a single large fixed value

# Create random Tensors for weights.
# Note: uniform_ with equal bounds just fills the tensor with that constant.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
nn.init.uniform_(w1, 0.5, 0.5)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
nn.init.uniform_(w2, 1, 1)
w3 = torch.randn(H, 1, device=device, dtype=dtype, requires_grad=True)
nn.init.uniform_(w3, 1, 1)

w1_ = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
nn.init.uniform_(w1_, 1, 1)
w2_ = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
nn.init.uniform_(w2_, 1, 1)
w3_ = torch.randn(H, 1, device=device, dtype=dtype, requires_grad=True)
nn.init.uniform_(w3_, 1, 1)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors.
    p = torch.tanh(x.mm(w1))
    y_pred = p.mm(w2)
    y_pred -= y_pred.mean()
    m = x.mm(w1).mm(w3)
    y_pred += m
    y = y_pred.max(1, keepdim=True)[1]  # index of the max prediction
    y0 = y_pred.gather(1, y)            # value of the max prediction

    y_pred_ = torch.tanh(x.mm(w1_)).mm(w2_)
    y_pred_ -= y_pred_.mean()
    y_pred_ += x.mm(w1_).mm(w3_)
    y_ = y_pred_.max(1, keepdim=True)[1]
    y1 = y_pred_.gather(1, y_)

    # Compute and print loss using operations on Tensors.
    loss = (y0 - y1).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Manually update weights using gradient descent, wrapped in torch.no_grad()
    # so the updates themselves are not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w3 -= learning_rate * w3.grad

        w1_ -= learning_rate * w1_.grad
        w2_ -= learning_rate * w2_.grad
        w3_ -= learning_rate * w3_.grad

        # Manually zero the gradients after updating weights.
        w1.grad.zero_()
        w2.grad.zero_()
        w3.grad.zero_()

        # The second network's gradients must be zeroed too, otherwise
        # they accumulate across iterations.
        w1_.grad.zero_()
        w2_.grad.zero_()
        w3_.grad.zero_()

print(w1)
print(w2)
print(w3)

print("tgt")

print(w1_)
print(w2_)
print(w3_)

I tried to check the numbers with my calculator, but they differ from what PyTorch computes here (I did the backprop by hand and got different results). Since there are two networks feeding one loss, how does autograd calculate the gradients for both? They are always different from my calculation, and worst of all the loss becomes NaN. Why?
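To make the comparison concrete, here is a minimal scalar sketch of the kind of check I mean (hypothetical values, not the ones from the script above): it collapses one branch of the forward pass, y = tanh(x*w1)*w2 + x*w1*w3, derives the gradients by the chain rule, and prints them next to what .backward() writes into .grad:

import torch

x = torch.tensor(2.0)  # hypothetical small input, not the 1000 used above
w1 = torch.tensor(0.5, requires_grad=True)
w2 = torch.tensor(1.0, requires_grad=True)
w3 = torch.tensor(1.0, requires_grad=True)

# Same shape of forward pass as one branch of the script above
y = torch.tanh(x * w1) * w2 + x * w1 * w3
loss = y.pow(2)
loss.backward()

# Hand-derived chain rule: dloss/dw = 2 * y * dy/dw
with torch.no_grad():
    t = torch.tanh(x * w1)
    dy_dw1 = x * (1 - t ** 2) * w2 + x * w3
    dy_dw2 = t
    dy_dw3 = x * w1
    print(w1.grad, 2 * y * dy_dw1)  # the two numbers in each line should match
    print(w2.grad, 2 * y * dy_dw2)
    print(w3.grad, 2 * y * dy_dw3)

On the NaN: I suspect that with x = 1000 the gradients are of order 2*(y0 - y1)*1000, so even with learning_rate = 1e-6 the weights can diverge until the loss overflows to inf and then NaN; enabling torch.autograd.set_detect_anomaly(True) before the loop reports the first backward operation that produces a NaN.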