Single Linear layer network, train with SGD, different result with Numpy implementation?

zchrissirhcz · September 9, 2021, 6:11am

I’m trying to implement a simple network (single linear layer, without activation function), and with SGD for training. The network input is also very simple, only 1 scalar for each iteration of training. With same network architecture and same learning rate, both using float64 type, my Numpy implementation is different with my torch implementation.

Can anyone help me? Wonder if I am using torch with mistakes. Thanks.

#!/usr/bin/env python
#coding: utf-8


import torch
import numpy as np

torch.set_default_tensor_type(torch.DoubleTensor)

class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden):
        super(Net, self).__init__()
        self.fc = torch.nn.Linear(n_feature, n_hidden, bias=True)

    def forward(self, x):
            x = self.fc(x)
            return x

def print_net_param(net):
    """
    print net parameters
    """
    for name, param in net.named_parameters():
        if param.requires_grad:
            print(name, param.data)

# weight = torch.nn.Parameter(torch.ones_like(net.weight))
# print(weight)

def get_train_data():
    X = np.linspace(-1.0, 1.0, 3)
    Y = 2 * X + 3
    return X, Y

def train_torch():
    net = Net(n_feature=1, n_hidden=1)
    with torch.no_grad():
        net.fc.weight[0, 0] = 0.3
        net.fc.bias[0] = 0.1
    print_net_param(net)
    optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
    loss_func = torch.nn.MSELoss()

    X, Y = get_train_data()

    for i in range(len(X)):
        x = X[i]
        y = Y[i]
        gt = torch.unsqueeze(torch.tensor([y]), dim=1)
        net_input = torch.unsqueeze(torch.tensor([x]), dim=1)
        net_output = net(net_input)
        loss = loss_func(net_output, gt)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            w = net.fc.weight.data.detach().numpy()
            b = net.fc.bias.data.detach().numpy()
            loss_value = loss.data.detach().numpy()

        print("input:", net_input.data.detach().numpy()[0,0], "pred:", net_output.data.detach().numpy()[0, 0], "loss:", loss_value, "w:", w, "b:", b)

class Neuron:
    def __init__(self, lr=0.1):
        self.w = np.ndarray((1, 1))
        self.b = np.ndarray((1, 1))
        self.lr = lr
    
    def forward(self, x):
        self.x = x
        y = x * self.w + self.b
        return y

    def backward(self, grad_y):
        # dL/dw = dL/dy * dy/dw
        # dL/db = dL/dy * dy/db
        #
        # dy/dw = x
        # dy/db = 1
        #
        grad_w = grad_y * self.x
        grad_b = grad_y

        self.w -= self.lr * grad_w
        self.b -= self.lr * grad_b

def train_numpy():
    neuron = Neuron(lr=0.1)
    neuron.w[0, 0] = 0.3
    neuron.b[0] = 0.1
    print("fc.weight:", neuron.w)
    print("fc.bias:", neuron.b)
    
    X, Y = get_train_data()
    for i in range(len(X)):
        x = X[i]
        gt = Y[i]
        output = neuron.forward(x)
        loss = (output - gt)**2
        grad_y = output - gt
        neuron.backward(grad_y)
        print("input:", x, "gt:", "pred:", output, "loss:", loss, "w:", neuron.w, "b:", neuron.b)

if __name__ == '__main__':
    print(">>> torch:")
    train_torch()
    print(">>> numpy:")
    train_numpy()

The output:

>>> torch:
fc.weight tensor([[0.3000]])
fc.bias tensor([0.1000])
input: -1.0 pred: -0.19999999999999998 loss: 1.44 w: [[0.06]] b: [0.34]
input: 0.0 pred: 0.34 loss: 7.0756000000000006 w: [[0.06]] b: [0.872]
input: 1.0 pred: 0.932 loss: 16.548623999999997 w: [[0.8736]] b: [1.6856]
>>> numpy:
fc.weight: [[0.3]]
fc.bias: [[0.1]]
input: -1.0 gt: pred: [[-0.2]] loss: [[1.44]] w: [[0.18]] b: [[0.22]]
input: 0.0 gt: pred: [[0.22]] loss: [[7.7284]] w: [[0.18]] b: [[0.498]]
input: 1.0 gt: pred: [[0.678]] loss: [[18.679684]] w: [[0.6122]] b: [[0.9302]]

huahuanZ · September 9, 2021, 10:00am

The numpy impl is wrong.

grad_y=2*(output-gt) # you miss the 2*

zchrissirhcz · September 9, 2021, 10:04am

Thank you. You’re right. Actually, in the very begging I use MSELoss = 0.5 * ( output - gt), then I remove 0.5 but didn’t update grad_y’s formula.