Training and Validation Loss Too High and Not Converging

I am trying to calculate the training and validation loss; however, I am getting an extremely high value that is not converging.

# Preprocessing pipeline: ToTensor followed by Normalize(mean=0.5, std=0.5).
# NOTE(review): `transform` is defined here but the dataset below is built
# with a bare ToTensor(), so the Normalize step is never applied.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

batch_size = 32

cifar10 = torchvision.datasets.CIFAR10(root='./data', download=True, transform=torchvision.transforms.ToTensor())
# Index at which the sample list is cut into training / validation parts.
pivot = 40000
# Materialize the dataset as a list of (image, label) pairs ordered by label.
# NOTE: because the samples are ordered by label, the contiguous index split
# below assigns whole classes to one subset or the other instead of mixing
# them, so the validation subset contains classes the training subset lacks.
cifar10 = sorted(cifar10, key=lambda x: x[1])
train_set = torch.utils.data.Subset(cifar10, range(pivot))
val_set = torch.utils.data.Subset(cifar10, range(pivot, len(cifar10)))
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True)

class Network(nn.Module):
    """Small CNN mapping 3-channel 32x32 images to 10 class logits.

    Two conv + ReLU + max-pool stages are followed by three fully
    connected layers. The final layer returns raw logits (no softmax),
    which is the input ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self):
        super().__init__()
        # A 5x5 kernel with padding=2 keeps the spatial size unchanged, so
        # only the pooling layers downsample.
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, padding=2)
        # torch.nn names this layer MaxPool2d (there is no MaxPooling2D).
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, padding=2)
        # 32x32 input halved twice by pooling -> 8x8 maps with 16 channels.
        self.fc1 = nn.Linear(8 * 8 * 16, 120)
        # NOTE(review): the 2-unit layer is a very narrow bottleneck before
        # the 10-way output; kept as in the original -- confirm it is intended.
        self.fc2 = nn.Linear(120, 2)
        self.fc3 = nn.Linear(2, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10) for input of shape (batch, 3, 32, 32)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension (torch.flatten,
        # lower-case; torch has no `Flatten` function).
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = Network()

# CrossEntropyLoss takes raw logits and integer class labels.
criterion = nn.CrossEntropyLoss()
# A learning rate of 1e3 (i.e. 1000) makes SGD overshoot on every step and the
# loss blow up instead of converging; 1e-3 is a conventional starting value
# for SGD with momentum.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

# Train for 5 epochs, logging the mean training and validation loss of each
# epoch to the Weights & Biases run.
with wandb.init(project='Tier-1-Test', save_code=True) as run:
    for epoch in range(5):
        # ---- training pass ----
        current_loss = 0.0

        model.train()

        for i, data in enumerate(train_loader):
            images, labels = data

            # Gradients accumulate across backward() calls, so they must be
            # cleared before each step; otherwise every update uses the sum
            # of all gradients seen so far.
            optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            # .item() yields a plain Python float, so the running total does
            # not keep each batch's autograd graph alive.
            current_loss += loss.item()

        # i is the last batch index, so i + 1 is the number of batches.
        run.log({'train_loss': current_loss / (i + 1)})

        # ---- validation pass ----
        model.eval()

        current_loss = 0.0

        # No parameter updates happen here, so skip gradient tracking.
        with torch.no_grad():
            for i, data in enumerate(val_loader):
                images, labels = data
                outputs = model(images)

                loss = criterion(outputs, labels)

                current_loss += loss.item()

        run.log({'val_loss': current_loss / (i + 1)})

You are sorting the labels in:

# Orders the (image, label) samples by their label, x[1].
cifar10 = sorted(cifar10, key=lambda x: x[1])

and then splitting this sorted dataset into training and validation Subsets, which moves classes 0-7 into the train_set and classes 8-9 into the val_set based on the pivot.
Your model thus has never seen any classes present in the validation split and I would expect it to fail.

1 Like

Oh alright, yeah, that makes sense! This is probably a dumb question because I am missing something, but I changed the range of the val_set by combining it with the first row, and I do not think it picked up the classes still. Is there a better way to do this?

# Order all samples by their label, x[1].
cifar10 = sorted(cifar10, key=lambda x: x[1])
# The first `pivot` samples of the sorted list form the training set.
train_set = torch.utils.data.Subset(cifar10, range(pivot))
# range(0) yields no indices, so this subset contains no samples.
val_set_1 = torch.utils.data.Subset(cifar10, range(0))
# The remaining samples, from `pivot` to the end of the sorted list.
val_set_2 = torch.utils.data.Subset(cifar10, range(pivot, len(cifar10)))
# The validation set is the concatenation of the two subsets above.
val_set = torch.utils.data.ConcatDataset([val_set_1, val_set_2])

range(0) is an empty range, so val_set_1 is also empty and your code snippet doesn’t change anything.
You could either remove the sorting, split the train and val indices manually by making sure both sets see all classes, or you could also use e.g. sklearn.model_selection.train_test_split with the stratify argument.