# Simple test, I really didn't get how autograd calculates these grads

import torch
import torch.nn as nn

dtype = torch.float
device = torch.device("cpu")  # was typographic quotes (“cpu”) -> SyntaxError

# N: batch size; D_in: input dim; H: hidden dim; D_out: output dim.
# Everything is 1x1 so each tensor holds a single scalar.
N, D_in, H, D_out = 1, 1, 1, 1

# Input tensor: no grad needed, overwritten with one large constant value
# so the tanh in the forward pass saturates.
x = torch.randn(N, D_in, device=device, dtype=dtype)
x[0, 0] = 1000

# Trainable weights for the first network. The original used
# nn.init.uniform_(w, a, a), which is just a constant fill with a;
# nn.init.constant_ says that directly and produces identical values.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
nn.init.constant_(w1, 0.5)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
nn.init.constant_(w2, 1)
w3 = torch.randn(H, 1, device=device, dtype=dtype, requires_grad=True)
nn.init.constant_(w3, 1)

# Trainable weights for the second (comparison) network, all set to 1.
w1_ = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
nn.init.constant_(w1_, 1)
w2_ = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
nn.init.constant_(w2_, 1)
w3_ = torch.randn(H, 1, device=device, dtype=dtype, requires_grad=True)
nn.init.constant_(w3_, 1)

learning_rate = 1e-6

for t in range(500):
    # Forward pass, network 1: tanh hidden layer plus a linear skip term.
    # (Stray ``` backticks and the lost loop-body indentation from the
    # original paste are repaired here.)
    p = torch.tanh(x.mm(w1))
    y_pred = p.mm(w2)
    y_pred = y_pred - y_pred.mean()      # center the prediction
    y_pred = y_pred + x.mm(w1).mm(w3)    # linear skip connection
    # BUG FIX: Tensor.max(dim, keepdim=True) returns a (values, indices)
    # tuple; gather requires the *indices* tensor, so take element [1].
    idx = y_pred.max(1, keepdim=True)[1]
    y0 = y_pred.gather(1, idx)

    # Forward pass, network 2: same architecture with the w*_ weights.
    y_pred_ = torch.tanh(x.mm(w1_)).mm(w2_)
    y_pred_ = y_pred_ - y_pred_.mean()
    y_pred_ = y_pred_ + x.mm(w1_).mm(w3_)
    idx_ = y_pred_.max(1, keepdim=True)[1]  # same values-vs-indices fix
    y1 = y_pred_.gather(1, idx_)

    # Squared-error loss between the two selected (argmax) outputs.
    loss = (y0 - y1).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Backward pass: autograd populates w*.grad with d(loss)/d(w*).
    # NOTE(review): grads accumulate across iterations unless zeroed;
    # the weight update + zeroing presumably follows below this view
    # (the source is truncated here) — confirm it runs inside the loop.
    loss.backward()

    # Manually zero the gradients after updating weights