Always similar outputs within a batch when batch_size is greater than 1

I recently ran into a problem where, when training with a batch size greater than 1, my loss would not decrease much: the outputs within a mini-batch always end up very similar to each other, even though the labels are quite different. I reproduced this with a simple prototype model and got the same behavior. Does anyone know what is wrong with my code, or should we really always train with identical labels within one batch? Thanks!

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class TestDataSet(Dataset):
    def __init__(self):
        # six random 1-channel 4x4 inputs paired with scalar targets 0, 10, ..., 50
        self.data_list = torch.rand((6, 1, 4, 4), dtype=torch.float32)
        self.groundtruth_list = torch.arange(6, dtype=torch.float32) * 10
    
    def __len__(self):
        return 6
    
    def __getitem__(self, idx):
        data = self.data_list[idx]
        groundtruth = self.groundtruth_list[idx]        
        return data, groundtruth

trainset = TestDataSet()
trainloader = DataLoader(dataset=trainset, batch_size=3, shuffle=False, num_workers=0)
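
# Peek at the first batch (shuffle=False, so batches are deterministic):
# the three labels within a batch clearly differ from each other
xb, yb = next(iter(trainloader))
print(xb.shape, yb)   # torch.Size([3, 1, 4, 4]) tensor([ 0., 10., 20.])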

class TestNet(nn.Module):
    def __init__(self):
        super(TestNet, self).__init__()
        
        # six 3x3 convolutions that keep the 4x4 spatial size, followed by
        # a linear head mapping the flattened 32*4*4 features to one scalar
        self.feature = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(inplace=True))
        self.linear = nn.Linear(32*4*4, 1)
        
        self.__initialize()
        
    def __initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
        
    def forward(self, x):
        x = self.feature(x)
        x = x.view(-1, 32*4*4)   # flatten to (batch, 512)
        x = self.linear(x)
        return x
    
net = TestNet()
print(net)
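
# Sanity-check the forward pass: a (3, 1, 4, 4) batch maps to a (3, 1)
# output column, while the labels from the loader are 1-D with shape (3,)
with torch.no_grad():
    xb, yb = next(iter(trainloader))
    print(net(xb).shape, yb.shape)   # torch.Size([3, 1]) torch.Size([3])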

import torch.optim as optim
criterion = nn.L1Loss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

num_epochs = 1000

for epoch in range(num_epochs):
    
    running_loss = 0.0

    for batch_idx, (inputs, labels) in enumerate(trainloader):
        optimizer.zero_grad()
    
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        
        
        if epoch % 100 == 99:
            print("epoch {} output = {}".format(epoch, outputs))
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

    running_loss = running_loss / 2   # average over the 2 mini-batches per epoch
       
    if epoch % 100 == 99:
        print("epoch {} running loss = {:.4f}".format(epoch, running_loss))

What I get is:

epoch 99 output = tensor([[23.8904],
        [28.7085],
        [20.6641]], grad_fn=<AddmmBackward>)
epoch 99 output = tensor([[30.4899],
        [22.9630],
        [27.6583]], grad_fn=<AddmmBackward>)
epoch 99 running loss = 9.1643
epoch 199 output = tensor([[13.0002],
        [11.1635],
        [11.0956]], grad_fn=<AddmmBackward>)
epoch 199 output = tensor([[41.1716],
        [40.6116],
        [41.1826]], grad_fn=<AddmmBackward>)
epoch 199 running loss = 4.7491
epoch 299 output = tensor([[ 9.9920],
        [10.1905],
        [10.2471]], grad_fn=<AddmmBackward>)
epoch 299 output = tensor([[40.6160],
        [40.2038],
        [40.5667]], grad_fn=<AddmmBackward>)
epoch 299 running loss = 4.5123
epoch 399 output = tensor([[10.2249],
        [10.2265],
        [10.1220]], grad_fn=<AddmmBackward>)
epoch 399 output = tensor([[39.8608],
        [40.2258],
        [40.0353]], grad_fn=<AddmmBackward>)
epoch 399 running loss = 4.4805
epoch 499 output = tensor([[8.8487],
        [8.5307],
        [8.9612]], grad_fn=<AddmmBackward>)
epoch 499 output = tensor([[35.7484],
        [36.2212],
        [36.3786]], grad_fn=<AddmmBackward>)
epoch 499 running loss = 5.0115
epoch 599 output = tensor([[8.9839],
        [8.8247],
        [9.0632]], grad_fn=<AddmmBackward>)
epoch 599 output = tensor([[39.2804],
        [38.3255],
        [38.4358]], grad_fn=<AddmmBackward>)
epoch 599 running loss = 4.7069
epoch 699 output = tensor([[9.7793],
        [9.4635],
        [9.6627]], grad_fn=<AddmmBackward>)
epoch 699 output = tensor([[38.4204],
        [38.4527],
        [38.7851]], grad_fn=<AddmmBackward>)
epoch 699 running loss = 4.6458
epoch 799 output = tensor([[9.9077],
        [9.6871],
        [9.7271]], grad_fn=<AddmmBackward>)
epoch 799 output = tensor([[38.8115],
        [39.2431],
        [39.2638]], grad_fn=<AddmmBackward>)
epoch 799 running loss = 4.5689
epoch 899 output = tensor([[10.0550],
        [ 9.9876],
        [10.0641]], grad_fn=<AddmmBackward>)
epoch 899 output = tensor([[41.8334],
        [41.6715],
        [41.7007]], grad_fn=<AddmmBackward>)
epoch 899 running loss = 4.6421
epoch 999 output = tensor([[9.9938],
        [9.9625],
        [9.9120]], grad_fn=<AddmmBackward>)
epoch 999 output = tensor([[40.6939],
        [40.7212],
        [40.7685]], grad_fn=<AddmmBackward>)
epoch 999 running loss = 4.5302
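
One thing I noticed: the outputs converge to roughly 10 and 40, which are exactly the means of the labels in the two batches ({0, 10, 20} and {30, 40, 50}), as if each batch were being fit by a single value. A quick check of those means (same labels as in the dataset above):

labels = torch.arange(6, dtype=torch.float32) * 10
print(labels[:3].mean().item(), labels[3:].mean().item())   # 10.0 40.0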

For comparison, I wrote some Keras code, which gets the correct result:

import numpy as np
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers import Conv2D, Dense, Flatten

x_train = np.random.random((6, 4, 4, 1))
y_train = np.arange(6)

model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(4, 4, 1), padding='same'))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(Flatten())
model.add(Dense(1))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_squared_error', optimizer=sgd)
model.fit(x_train, y_train, batch_size=3, epochs=10)
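
To check that the per-sample predictions actually spread out toward the targets, instead of collapsing to one value per batch as in the PyTorch run, one can also inspect the fitted outputs; a minimal sketch, exact values vary from run to run:

# predictions should approach y_train = [0, 1, 2, 3, 4, 5] per sample
print(model.predict(x_train).ravel())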

Result:

Epoch 1/10
6/6 [==============================] - 5s 837ms/step - loss: 7.9785
Epoch 2/10
6/6 [==============================] - 0s 1ms/step - loss: 2.5833
Epoch 3/10
6/6 [==============================] - 0s 1ms/step - loss: 5.8242
Epoch 4/10
6/6 [==============================] - 0s 983us/step - loss: 5.4813
Epoch 5/10
6/6 [==============================] - 0s 951us/step - loss: 3.0968
Epoch 6/10
6/6 [==============================] - 0s 1ms/step - loss: 2.9960
Epoch 7/10
6/6 [==============================] - 0s 986us/step - loss: 1.4687
Epoch 8/10
6/6 [==============================] - 0s 976us/step - loss: 1.2033
Epoch 9/10
6/6 [==============================] - 0s 1ms/step - loss: 0.5037
Epoch 10/10
6/6 [==============================] - 0s 943us/step - loss: 0.2996