I’m trying to implement a simple network (single linear layer, without activation function), and with SGD for training. The network input is also very simple, only 1 scalar for each iteration of training. With same network architecture and same learning rate, both using float64 type, my Numpy implementation is different with my torch implementation.
Can anyone help me? Wonder if I am using torch with mistakes. Thanks.
#!/usr/bin/env python
#coding: utf-8
import torch
import numpy as np
torch.set_default_tensor_type(torch.DoubleTensor)
class Net(torch.nn.Module):
def __init__(self, n_feature, n_hidden):
super(Net, self).__init__()
self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True)
def forward(self, x):
x = self.fc(x)
return x
def print_net_param(net):
"""
print net parameters
"""
for name, param in net.named_parameters():
if param.requires_grad:
print(name, param.data)
# weight = torch.nn.Parameter(torch.ones_like(net.weight))
# print(weight)
def get_train_data():
X = np.linspace(-1.0, 1.0, 3)
Y = 2 * X + 3
return X, Y
def train_torch():
net = Net(n_feature=1, n_hidden=1)
with torch.no_grad():
net.fc.weight[0, 0] = 0.3
net.fc.bias[0] = 0.1
print_net_param(net)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
loss_func = torch.nn.MSELoss()
X, Y = get_train_data()
for i in range(len(X)):
x = X[i]
y = Y[i]
gt = torch.unsqueeze(torch.tensor([y]), dim=1)
net_input = torch.unsqueeze(torch.tensor([x]), dim=1)
net_output = net(net_input)
loss = loss_func(net_output, gt)
optimizer.zero_grad()
loss.backward()
optimizer.step()
with torch.no_grad():
w = net.fc.weight.data.detach().numpy()
b = net.fc.bias.data.detach().numpy()
loss_value = loss.data.detach().numpy()
print("input:", net_input.data.detach().numpy()[0,0], "pred:", net_output.data.detach().numpy()[0, 0], "loss:", loss_value, "w:", w, "b:", b)
class Neuron:
def __init__(self, lr=0.1):
self.w = np.ndarray((1, 1))
self.b = np.ndarray((1, 1))
self.lr = lr
def forward(self, x):
self.x = x
y = x * self.w + self.b
return y
def backward(self, grad_y):
# dL/dw = dL/dy * dy/dw
# dL/db = dL/dy * dy/db
#
# dy/dw = x
# dy/db = 1
#
grad_w = grad_y * self.x
grad_b = grad_y
self.w -= self.lr * grad_w
self.b -= self.lr * grad_b
def train_numpy():
neuron = Neuron(lr=0.1)
neuron.w[0, 0] = 0.3
neuron.b[0] = 0.1
print("fc.weight:", neuron.w)
print("fc.bias:", neuron.b)
X, Y = get_train_data()
for i in range(len(X)):
x = X[i]
gt = Y[i]
output = neuron.forward(x)
loss = (output - gt)**2
grad_y = output - gt
neuron.backward(grad_y)
print("input:", x, "gt:", "pred:", output, "loss:", loss, "w:", neuron.w, "b:", neuron.b)
if __name__ == '__main__':
print(">>> torch:")
train_torch()
print(">>> numpy:")
train_numpy()
The output:
>>> torch:
fc.weight tensor([[0.3000]])
fc.bias tensor([0.1000])
input: -1.0 pred: -0.19999999999999998 loss: 1.44 w: [[0.06]] b: [0.34]
input: 0.0 pred: 0.34 loss: 7.0756000000000006 w: [[0.06]] b: [0.872]
input: 1.0 pred: 0.932 loss: 16.548623999999997 w: [[0.8736]] b: [1.6856]
>>> numpy:
fc.weight: [[0.3]]
fc.bias: [[0.1]]
input: -1.0 gt: pred: [[-0.2]] loss: [[1.44]] w: [[0.18]] b: [[0.22]]
input: 0.0 gt: pred: [[0.22]] loss: [[7.7284]] w: [[0.18]] b: [[0.498]]
input: 1.0 gt: pred: [[0.678]] loss: [[18.679684]] w: [[0.6122]] b: [[0.9302]]