Network (VGG11) doesn't train on CIFAR10

Hello,

I’m new to PyTorch and am trying to train a model. The network needs to be defined with nn.Sequential, and I want to train it on CIFAR10. Unfortunately, something isn’t working correctly: the loss and accuracy don’t improve. I’ll walk you through the code step by step to make it easier to follow.

First, I set a seed and load the data:

import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

s = 127
np.random.seed(s)
torch.manual_seed(s)
torch.cuda.manual_seed(s)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

BATCH_SIZE_TRAIN_CIFAR10 = 128
BATCH_SIZE_TEST_CIFAR10 = 128

transform_base = [transforms.ToTensor()]

transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4, padding_mode='reflect'),
    ] + transform_base)

transform_test = transforms.Compose(transform_base)
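# randomly pick, per image, either the augmented pipeline or the plain ToTensor one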
transform_train = transforms.RandomChoice([transform_train, transform_test])

CIFAR10_trainset = torchvision.datasets.CIFAR10(root='~/data/cifar10', train=True, download=True, transform=transform_train)
CIFAR10_train_loader = torch.utils.data.DataLoader(CIFAR10_trainset, batch_size=BATCH_SIZE_TRAIN_CIFAR10, shuffle=True, num_workers=2)

CIFAR10_testset = torchvision.datasets.CIFAR10(root='~/data/cifar10', train=False, download=True, transform=transform_test)
CIFAR10_test_loader = torch.utils.data.DataLoader(CIFAR10_testset, batch_size=BATCH_SIZE_TEST_CIFAR10, shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
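
For reference, inspecting a single batch (just a quick sketch, not part of my training script) shows what the loaders produce; since I only apply ToTensor, the inputs stay in [0, 1] without normalization:

# sketch: inspect one training batch
images, labels = next(iter(CIFAR10_train_loader))
print(images.shape)                              # torch.Size([128, 3, 32, 32])
print(images.min().item(), images.max().item())  # values lie in [0, 1] (ToTensor only, no Normalize)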

Now I define the model, which needs to be built with nn.Sequential because of a package I will use after the network is trained …:


def VGG11(num_classes=10):
    model = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
        nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
        nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
        nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
        nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
        nn.Flatten(),
        nn.Dropout(p=0.5, inplace=False),
        nn.Linear(in_features=512, out_features=512, bias=True),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5, inplace=False),
        nn.Linear(in_features=512, out_features=512, bias=True),
        nn.ReLU(inplace=True),
        nn.Linear(in_features=512, out_features=num_classes, bias=True)
    )
    return model
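
As a quick sanity check (a minimal sketch), the classifier head lines up with the conv stack: the five 2x2 max-pools reduce the 32x32 input to 1x1, so Flatten yields 512 features, matching in_features=512 of the first Linear layer:

# sketch: verify the output shape for a CIFAR10-sized input
dummy = torch.randn(1, 3, 32, 32)
print(VGG11(num_classes=10)(dummy).shape)  # expected: torch.Size([1, 10])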


device = 'cuda' if torch.cuda.is_available() else 'cpu'
cuda_status = torch.cuda.is_available()
CIFAR10_model = VGG11(num_classes=10).to(device)
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(CIFAR10_model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

Now I define the training and testing procedures:


def train(net, epoch, optimizer, trainloader, filename):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    print("train loss: ", train_loss)
    print("train accuracy: ", correct / total)
    print("saving model at: {}".format(filename))
    torch.save(net.state_dict(), filename)


def test(net, epoch, testloader, path, save=False):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        acc = correct / total
        if acc > best_acc and save:
            best_acc = acc
            print("saving model at: {}".format(path))
            torch.save(net.state_dict(), path)

        print("test loss: ", test_loss)
        print("current acc: {}; best acc: {}".format(acc, best_acc))

And in the end, I define a function that trains and tests the network, dividing the learning rate by 10 after each block of e epochs:

def train_all():
    CIFAR10_path = './checkpoint/ckpt_seed{}.pth'.format(s)
    CIFAR10_path_best = './checkpoint/best_ckpt_seed{}.pth'.format(s)
    lr = 0.1
    epoch = 0
    for e in [30, 50, 50]:
        print("current learning rate: ", lr)
        for _ in range(e):
            optimizer = optim.SGD(CIFAR10_model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
            train(CIFAR10_model, epoch, optimizer, CIFAR10_train_loader, CIFAR10_path)
            test(CIFAR10_model, epoch, CIFAR10_test_loader, save=True, path=CIFAR10_path_best)
            epoch += 1
        lr /= 10
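
(As an aside, I believe the same step schedule could also be written with a single optimizer and torch.optim.lr_scheduler.MultiStepLR; this is just a sketch of what I think is equivalent, not what I actually ran:)

# sketch: same schedule with one optimizer kept across all epochs
# lr drops by 10x after epoch 30 and epoch 80, matching the [30, 50, 50] blocks
optimizer = optim.SGD(CIFAR10_model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 80], gamma=0.1)
for epoch in range(130):
    train(CIFAR10_model, epoch, optimizer, CIFAR10_train_loader, CIFAR10_path)          # paths as in train_all
    test(CIFAR10_model, epoch, CIFAR10_test_loader, save=True, path=CIFAR10_path_best)
    scheduler.step()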

As you can see, I’m printing the loss during training and testing.
The output I get when running the code looks like this:

PyTorch version:  1.6.0
cuda available:  True
Files already downloaded and verified
Files already downloaded and verified
current learning rate:  0.1

Epoch: 0
train loss:  901.1207406520844
train accuracy:  0.09972
saving model at: ./checkpoint/ckpt_seed3.pth
saving model at: ./checkpoint/best_ckpt_seed3.pth
test loss:  182.16285347938538
current acc: 0.1; best acc: 0.1

Epoch: 1
train loss:  901.0276117324829
train accuracy:  0.1003
saving model at: ./checkpoint/ckpt_seed3.pth
test loss:  182.1284966468811
current acc: 0.1; best acc: 0.1

Epoch: 2
train loss:  900.8636813163757
train accuracy:  0.10024
saving model at: ./checkpoint/ckpt_seed3.pth
test loss:  181.99637913703918
current acc: 0.1; best acc: 0.1

...

...

Epoch: 128
train loss:  900.3234491348267
train accuracy:  0.0977
saving model at: ./checkpoint/ckpt_seed3.pth
test loss:  181.90408873558044
current acc: 0.1; best acc: 0.1

Epoch: 129
train loss:  900.322544336319
train accuracy:  0.09878
saving model at: ./checkpoint/ckpt_seed3.pth
test loss:  181.9042353630066
current acc: 0.1; best acc: 0.1

As you can see, the loss and accuracy don’t really change at all…
I also tried starting with a smaller learning rate (0.01), but got pretty much the same output.

Can someone see what I’m doing wrong or give me any tips?