Differences in MSELoss calculations

Hello,
I’m trying to understand what happens during a single training step by reproducing the calculations for this simple network by hand.
Everything matches PyTorch’s results except the updated weight.
What am I doing wrong?

import torch
import torch.nn as nn
import torch.optim as optim

weight_0 = 0.25
bias_0 = 0.68
l_rate = 0.01

input_data = torch.Tensor([[2.2], [4.0]])
target_data = torch.Tensor([[4.1], [5.1]])

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        # single-input, single-output linear layer with hand-set starting parameters
        self.fc1 = nn.Linear(1, 1)
        self.fc1.weight.data = torch.tensor([[weight_0]])
        self.fc1.bias.data = torch.tensor([bias_0])

    def forward(self, x):
        x = self.fc1(x)
        return x

net = Net()

loss_f = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr = l_rate)

#net.train()
# one training step: forward pass, loss, backward pass, parameter update
net_out = net(input_data)
loss = loss_f(net_out, target_data)
optimizer.zero_grad()
loss.backward()
optimizer.step()
net.eval()

print("loss:  ", loss.data) # 9.9666
print("weight:", net.fc1.weight.data, net.fc1.weight.grad.data) # 0.4499, -19.9940
print("bias:  ", net.fc1.bias.data, net.fc1.bias.grad.data) # 0.7429, -6.2900

print("calculations:")

# mean of the two inputs
in_mean = (input_data[0] + input_data[1]) / 2

# forward pass for each sample
out_1 = (input_data[0] * weight_0 + bias_0)
out_2 = (input_data[1] * weight_0 + bias_0)

# squared error per sample, then the batch mean (what MSELoss computes)
loss_1 = (out_1 - target_data[0]) ** 2
loss_2 = (out_2 - target_data[1]) ** 2
loss_out = (loss_1 + loss_2) / 2

# derivative of each squared error w.r.t. the network output
loss_d_1 = (out_1 - target_data[0]) * 2
loss_d_2 = (out_2 - target_data[1]) * 2
loss_d_out = (loss_d_1 + loss_d_2) / 2

# manual SGD update
weight = weight_0 - loss_d_out * l_rate * in_mean
bias = bias_0 - loss_d_out * l_rate

print("loss:", loss_out) # 9.9666
print("loss_d:", loss_d_out) # -6.2900
print("weight:", weight) # 0.4450 (should be 0.4499)
print("bias:", bias) # 0.7429

print(net.fc1.weight.data, "-", weight, "=", net.fc1.weight.data-weight.data) # 0.4499 - 0.4450 = 0.0050

Thanks!

It’s because you need to keep the computation separate for each element of your batch!
The gradient of the MSE loss w.r.t. the weight is the batch average of 2 * (out_i - target_i) * input_i, so each residual has to be multiplied by its own input before averaging; averaging the residuals first and then multiplying by the mean input gives a different number.
Here, instead of:

loss_d_out = (loss_d_1 + loss_d_2) / 2
weight = weight_0 - l_rate * loss_d_out * in_mean
bias = bias_0 - l_rate * loss_d_out

it should rather be:

weight = weight_0 - l_rate * (loss_d_1 * input_data[0] + loss_d_2 * input_data[1]) / 2
bias = bias_0 - l_rate * (loss_d_1 + loss_d_2) / 2

(Though it doesn’t change anything for the bias, since the input doesn’t appear in its gradient.)
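
If you want to double-check this against autograd, here is a minimal sketch (my own shorthand names, assuming the same starting values and data as in your script). It runs one backward pass, computes the per-element derivatives by hand, and compares them with what PyTorch stores in .grad:

import torch
import torch.nn as nn

w0, b0, lr = 0.25, 0.68, 0.01
x = torch.tensor([[2.2], [4.0]])
y = torch.tensor([[4.1], [5.1]])

fc = nn.Linear(1, 1)
with torch.no_grad():
    fc.weight.copy_(torch.tensor([[w0]]))
    fc.bias.copy_(torch.tensor([b0]))

loss = nn.MSELoss()(fc(x), y)
loss.backward()

# per-element derivative of the squared error w.r.t. the output: 2 * (out_i - y_i)
d = 2 * (x * w0 + b0 - y)

# weight gradient: average of (residual derivative * its own input)
grad_w = (d * x).mean()
# bias gradient: plain average of the residual derivatives
grad_b = d.mean()

print(grad_w.item(), fc.weight.grad.item())  # both ~ -19.9940
print(grad_b.item(), fc.bias.grad.item())    # both ~ -6.2900
print(w0 - lr * grad_w.item())               # ~ 0.4499, the weight SGD produces

The (d * x).mean() line is exactly the (loss_d_1 * input_data[0] + loss_d_2 * input_data[1]) / 2 term above.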

It works. Many thanks!