When creating a Model i came across the effect that the Model would always converge to a state, where every sample in a batch would have the same output for the Model independent of the actual label. But with a batchsize of 1 the Model does also not converge/overfit.
Example of what i mean with batch_size = 4 after 783 epochs with only 10 samples (so the Model should overfit in my opinion):
epoch: 783 step: 5/5 loss = 0.701137363910675
Correct Labels:
tensor([[0., 1.],
[1., 0.]])
Outputs of Model:
tensor([[-0.0208, 0.2322],
[-0.0208, 0.2322]], grad_fn=<AddmmBackward0>)
What i already tried:
- Increased/Decreased number of Layers
- Increased/Decreased Learning rate
- Changed from CrossEntropyLoss to BCE (thats why in the below example i only have two different labels, in my Original Model i had more Labels, thats why i used CrossEntropyLoss)
- verified that the dataset works correctly (in the minimal Example below its just example data but still creates the Problem i have) (Original data is also much more complex and divers
- reduced Feature_size (again in the minimal Example below are already reduced to only Feature_size of 10 and still has the issue)
- eliminated some other complexities not present in the minimalistic Version below
- reduced samples so the Model should overfit (again see Version below)
I’m pretty new to Pytorch, so its very likely i made some simple mistake.
Here i have the most minimalistic Version i could create of my Problem, what am i missing?
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import math
class Dataset(Dataset):
def __getitem__(self, index):
if index == 0:
feature_vector = [178.0500, 178.3200, 177.7100, 178.2300, 177.2700, 176.5800, 177.8900,
177.4400, 177.3900, 177.2600]
label = [1., 0.]
if index == 1:
feature_vector = [178.3200, 177.7100, 178.2300, 177.2700, 176.5800, 177.8900, 177.4400,
177.3900, 177.2600, 178.4900]
label = [0., 1.]
if index == 2:
feature_vector = [177.7100, 178.2300, 177.2700, 176.5800, 177.8900, 177.4400, 177.3900,
177.2600, 178.4900, 178.6800]
label = [0., 1.]
if index == 3:
feature_vector = [178.2300, 177.2700, 176.5800, 177.8900, 177.4400, 177.3900, 177.2600,
178.4900, 178.6800, 178.4900]
label = [0., 1.]
if index == 4:
feature_vector = [177.2700, 176.5800, 177.8900, 177.4400, 177.3900, 177.2600, 178.4900,
178.6800, 178.4900, 178.5100]
label = [0., 1.]
if index == 5:
feature_vector = [176.5800, 177.8900, 177.4400, 177.3900, 177.2600, 178.4900, 178.6800,
178.4900, 178.5100, 175.9700]
label = [0., 1.]
if index == 6:
feature_vector = [177.8900, 177.4400, 177.3900, 177.2600, 178.4900, 178.6800, 178.4900,
178.5100, 175.9700, 176.7300]
label = [1., 0.]
if index == 7:
feature_vector = [177.4400, 177.3900, 177.2600, 178.4900, 178.6800, 178.4900, 178.5100,
175.9700, 176.7300, 176.8200]
label = [0., 1.]
if index == 8:
feature_vector = [177.3900, 177.2600, 178.4900, 178.6800, 178.4900, 178.5100, 175.9700,
176.7300, 176.8200, 177.4800]
label = [0., 1.]
if index == 9:
feature_vector = [177.2600, 178.4900, 178.6800, 178.4900, 178.5100, 175.9700, 176.7300,
176.8200, 177.4800, 175.4200]
label = [1., 0.]
feature_vector = torch.tensor(feature_vector, dtype=torch.float32, device="cpu")
label = torch.tensor(label, dtype=torch.float32, device="cpu")
return feature_vector, label
def __len__(self):
length = 10
return length
class LinNet(nn.Module):
def __init__(self):
super(LinNet, self).__init__()
self.lin1 = nn.Linear(10, 10)
self.lin2 = nn.Linear(10, 10)
self.lin3 = nn.Linear(10, 10)
self.lin4 = nn.Linear(10, 10)
self.lin5 = nn.Linear(10, 10)
self.lin6 = nn.Linear(10, 10)
self.lin7 = nn.Linear(10, 10)
self.lin8 = nn.Linear(10, 10)
self.lin9 = nn.Linear(10, 10)
self.lin10 = nn.Linear(10, 2)
def forward(self, x):
x = torch.relu(self.lin1(x))
x = torch.relu(self.lin2(x))
x = torch.relu(self.lin3(x))
x = torch.relu(self.lin4(x))
x = torch.relu(self.lin5(x))
x = torch.relu(self.lin6(x))
x = torch.relu(self.lin7(x))
x = torch.relu(self.lin8(x))
x = torch.relu(self.lin9(x))
x = self.lin10(x)
return x
if __name__ == '__main__':
batch_size = 4
dataset = Dataset()
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, num_workers=1)
model = LinNet()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
num_epochs = 10000
total_samples = len(dataset)
total_iterations = math.ceil(total_samples / batch_size)
for epoch in range(num_epochs):
for i, data in enumerate(dataloader):
# get the inputs
inputs, labels = data
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
# for name, param in model.named_parameters():
# print(name, param.grad.abs().sum())
optimizer.step()
# print statistics
print("Labels:")
print(labels)
print("Outputs:")
print(outputs)
print(f"epoch: {epoch} step: {i + 1}/{total_iterations} loss = {loss.item()}")