Hello,
I’m new to PyTorch and try to train a model. The network needs to be defined in Sequential and I want to train it on CIFAR10. Unfortunately, something isn’t working correctly, since the Loss and Accuracy don’t improve. I will walk you through the code step by step to make it more comprehensible.
First I’m setting a seed and do the data gathering:
s = 127
np.random.seed(s)
torch.manual_seed(s)
torch.cuda.manual_seed(s)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
BATCH_SIZE_TRAIN_CIFAR10 = 128
BATCH_SIZE_TEST_CIFAR10 = 128
transform_base = [transforms.ToTensor()]
transform_train = transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, padding=4, padding_mode='reflect'),
] + transform_base)
transform_test = transforms.Compose(transform_base)
transform_train = transforms.RandomChoice([transform_train, transform_test])
#~/data/cifar10
CIFAR10_trainset = torchvision.datasets.CIFAR10(root='~/data/cifar10', train=True, download=True, transform=transform_train)
CIFAR10_train_loader = torch.utils.data.DataLoader(CIFAR10_trainset, batch_size=BATCH_SIZE_TRAIN_CIFAR10, shuffle=True, num_workers=2)
#~/data/cifar10
CIFAR10_testset = torchvision.datasets.CIFAR10(root='~/data/cifar10', train=False, download=True, transform=transform_test)
CIFAR10_test_loader = torch.utils.data.DataLoader(CIFAR10_testset, batch_size=BATCH_SIZE_TEST_CIFAR10, shuffle=False, num_workers=2)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
Now I define the model, which needs to be in Sequential, because of a package I will after the network is trained …:
def VGG11(num_classes=10):
features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
nn.Flatten(),
nn.Dropout(p=0.5, inplace=False),
nn.Linear(in_features=512, out_features=512, bias=True),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5, inplace=False),
nn.Linear(in_features=512, out_features=512, bias=True),
nn.ReLU(inplace=True),
nn.Linear(in_features=512, out_features=num_classes, bias=True)
)
return features
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cuda_status = torch.cuda.is_available()
CIFAR10_model = VGG11(num_classes=10).to(device)
best_acc = 0 # best test accuracy
start_epoch = 0 # start from epoch 0 or last checkpoint epoch
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(CIFAR10_model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
Now I define the training and testing procedure:
def train(net, epoch, optimizer, trainloader, filename):
print('\nEpoch: %d' % epoch)
net.train()
train_loss = 0
correct = 0
total = 0
for batch_idx, (inputs, targets) in enumerate(trainloader):
inputs, targets = inputs.to(device), targets.to(device)
optimizer.zero_grad()
outputs = net(inputs)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()
train_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()
print("train loss: ", train_loss)
print("train accuracy: ", correct / total)
print("saving model at: {}".format(filename))
torch.save(net.state_dict(), filename)
def test(net, epoch, testloader, path, save=False):
global best_acc
net.eval()
test_loss = 0
correct = 0
total = 0
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(testloader):
inputs, targets = inputs.to(device), targets.to(device)
outputs = net(inputs)
loss = criterion(outputs, targets)
test_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
correct += predicted.eq(targets).sum().item()
acc = correct / total
if acc > best_acc and save:
best_acc = acc
print("saving model at: {}".format(path))
torch.save(net.state_dict(), path)
print("test loss: ", test_loss)
print("current acc: {}; best acc: {}".format(acc, best_acc))
And in the end, I’m defining a method to train and test the network while changing the learning rate after e
epochs.
def train_all():
CIFAR10_path = './checkpoint/ckpt_seed{}.pth'.format(s)
CIFAR10_path_best = './checkpoint/best_ckpt_seed{}.pth'.format(s)
lr = 0.1
epoch = 0
for e in [30, 50, 50]:
print("current learning rate: ", lr)
for _ in range(e):
optimizer = optim.SGD(CIFAR10_model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
train(CIFAR10_model, epoch, optimizer, CIFAR10_train_loader, CIFAR10_path)
test(CIFAR10_model, epoch, CIFAR10_test_loader, save=True, path=CIFAR10_path_best)
epoch += 1
lr /= 10
as you can see I’m printing the loss during training and testing.
The outputs I get when running the code looks like this:
PyTorch version: 1.6.0
cuda available: True
Files already downloaded and verified
Files already downloaded and verified
current learning rate: 0.1
Epoch: 0
train loss: 901.1207406520844
train accuracy: 0.09972
saving model at: ./checkpoint/ckpt_seed3.pth
saving model at: ./checkpoint/best_ckpt_seed3.pth
test loss: 182.16285347938538
current acc: 0.1; best acc: 0.1
Epoch: 1
train loss: 901.0276117324829
train accuracy: 0.1003
saving model at: ./checkpoint/ckpt_seed3.pth
test loss: 182.1284966468811
current acc: 0.1; best acc: 0.1
Epoch: 2
train loss: 900.8636813163757
train accuracy: 0.10024
saving model at: ./checkpoint/ckpt_seed3.pth
test loss: 181.99637913703918
current acc: 0.1; best acc: 0.1
...
...
Epoch: 128
train loss: 900.3234491348267
train accuracy: 0.0977
saving model at: ./checkpoint/ckpt_seed3.pth
test loss: 181.90408873558044
current acc: 0.1; best acc: 0.1
Epoch: 129
train loss: 900.322544336319
train accuracy: 0.09878
saving model at: ./checkpoint/ckpt_seed3.pth
test loss: 181.9042353630066
current acc: 0.1; best acc: 0.1
As you can see the loss and accuracy don’t really change at all…
I also tried to begin with a smaller learning rate (0.01), but I got the same outputs pretty much.
Can someone see what I’m doing wrong or give me any tips?