I’ve been working on a notebook where I try to implement VGG-16 from scratch using PyTorch. As a sanity check I tried to train my model on CIFAR-10, and it was training very slowly (around 20 mins per epoch which I think is too long for CIFAR-10). I was running this on google colab with GPU enabled.
So I tried to train the data on a simple 3 layer CNN just to see if I screwed something up while implementing VGG. The result was that the validation loss decreased for the first few epochs and then increased and stayed around constant.
Here’s the loss for the small CNN
cuda:0
Training begin!
[1, 500] training loss: 2.160
[1, 1000] training loss: 2.090
[1, 1500] training loss: 2.063
[1, 2000] training loss: 2.053
[1, 2500] training loss: 2.043
Validating!
[1, 625] val loss: 3.293, val acc: 0.151
[2, 500] training loss: 2.026
[2, 1000] training loss: 2.001
[2, 1500] training loss: 2.011
[2, 2000] training loss: 2.009
[2, 2500] training loss: 2.006
Validating!
[2, 625] val loss: 3.229, val acc: 0.215
[3, 500] training loss: 1.989
[3, 1000] training loss: 1.974
[3, 1500] training loss: 1.980
[3, 2000] training loss: 1.977
[3, 2500] training loss: 1.975
Validating!
[3, 625] val loss: 3.185, val acc: 0.253
[4, 500] training loss: 1.961
[4, 1000] training loss: 1.977
[4, 1500] training loss: 1.960
[4, 2000] training loss: 1.957
[4, 2500] training loss: 1.955
Validating!
[4, 625] val loss: 3.219, val acc: 0.223
[5, 500] training loss: 1.953
[5, 1000] training loss: 1.951
[5, 1500] training loss: 1.952
[5, 2000] training loss: 1.938
[5, 2500] training loss: 1.947
Validating!
[5, 625] val loss: 3.269, val acc: 0.164
[6, 500] training loss: 1.944
[6, 1000] training loss: 1.943
[6, 1500] training loss: 1.942
[6, 2000] training loss: 1.931
[6, 2500] training loss: 1.939
Validating!
[6, 625] val loss: 3.294, val acc: 0.149
[7, 500] training loss: 1.929
[7, 1000] training loss: 1.927
[7, 1500] training loss: 1.937
[7, 2000] training loss: 1.924
[7, 2500] training loss: 1.927
Validating!
[7, 625] val loss: 3.286, val acc: 0.160
[8, 500] training loss: 1.919
[8, 1000] training loss: 1.931
[8, 1500] training loss: 1.924
[8, 2000] training loss: 1.921
[8, 2500] training loss: 1.924
Validating!
[8, 625] val loss: 3.307, val acc: 0.139
[9, 500] training loss: 1.915
[9, 1000] training loss: 1.919
[9, 1500] training loss: 1.913
[9, 2000] training loss: 1.912
[9, 2500] training loss: 1.918
Validating!
[9, 625] val loss: 3.283, val acc: 0.163
[10, 500] training loss: 1.908
[10, 1000] training loss: 1.907
[10, 1500] training loss: 1.906
[10, 2000] training loss: 1.917
[10, 2500] training loss: 1.912
Validating!
[10, 625] val loss: 3.244, val acc: 0.203
[11, 500] training loss: 1.905
[11, 1000] training loss: 1.908
[11, 1500] training loss: 1.904
[11, 2000] training loss: 1.910
[11, 2500] training loss: 1.916
Validating!
[11, 625] val loss: 3.356, val acc: 0.106
[12, 500] training loss: 1.914
[12, 1000] training loss: 1.900
[12, 1500] training loss: 1.903
[12, 2000] training loss: 1.895
[12, 2500] training loss: 1.894
Validating!
[12, 625] val loss: 3.315, val acc: 0.131
[13, 500] training loss: 1.902
[13, 1000] training loss: 1.898
[13, 1500] training loss: 1.895
[13, 2000] training loss: 1.904
[13, 2500] training loss: 1.904
Validating!
[13, 625] val loss: 3.337, val acc: 0.115
[14, 500] training loss: 1.896
[14, 1000] training loss: 1.905
[14, 1500] training loss: 1.904
[14, 2000] training loss: 1.891
[14, 2500] training loss: 1.889
Validating!
[14, 625] val loss: 3.284, val acc: 0.169
[15, 500] training loss: 1.898
[15, 1000] training loss: 1.896
[15, 1500] training loss: 1.889
[15, 2000] training loss: 1.892
[15, 2500] training loss: 1.890
Validating!
[15, 625] val loss: 3.356, val acc: 0.106
[16, 500] training loss: 1.891
[16, 1000] training loss: 1.887
[16, 1500] training loss: 1.887
[16, 2000] training loss: 1.899
[16, 2500] training loss: 1.886
Validating!
[16, 625] val loss: 3.350, val acc: 0.107
[17, 500] training loss: 1.891
[17, 1000] training loss: 1.883
[17, 1500] training loss: 1.884
[17, 2000] training loss: 1.889
[17, 2500] training loss: 1.885
Validating!
[17, 625] val loss: 3.332, val acc: 0.121
[18, 500] training loss: 1.878
[18, 1000] training loss: 1.886
[18, 1500] training loss: 1.886
[18, 2000] training loss: 1.884
[18, 2500] training loss: 1.880
Validating!
[18, 625] val loss: 3.341, val acc: 0.117
[19, 500] training loss: 1.873
[19, 1000] training loss: 1.877
[19, 1500] training loss: 1.883
[19, 2000] training loss: 1.887
[19, 2500] training loss: 1.883
Validating!
[19, 625] val loss: 3.351, val acc: 0.108
[20, 500] training loss: 1.876
[20, 1000] training loss: 1.881
[20, 1500] training loss: 1.882
[20, 2000] training loss: 1.879
[20, 2500] training loss: 1.885
Validating!
[20, 625] val loss: 3.333, val acc: 0.124
Here’s my random CNN I threw together to try and see if the reason was because VGG was too complex for CIFAR-10
class MyCNN(nn.Module):
def __init__(self):
super(MyCNN, self).__init__()
#nn.Conv2d(input_channels, output_channels, kernel_size, padding)
self.conv1_1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
self.conv1_2 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
self.conv1_3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm2d(32)
#nn.MaxPool2d(kernel_size, stride)
self.maxpool = nn.MaxPool2d(2, stride=2)
self.fc6 = nn.Linear(16*16*32, 128)
self.bn_fc = nn.BatchNorm1d(128)
self.fc7 = nn.Linear(128, 10)
self.dropout = nn.Dropout()
def forward(self, x):
#conv block 1
x = self.conv1_1(x)
x = F.relu(self.bn1(x))
x = self.conv1_2(x)
x = F.relu(self.bn1(x))
x = self.conv1_3(x)
x = F.relu(self.bn1(x))
x = self.maxpool(x)
#Now we need to flatten the tensor so that it'll fit into the FC layer
x = x.reshape(-1, 16 * 16 * 32)
#fc6
x = self.fc6(x)
x = F.relu(self.bn_fc(x))
x = self.dropout(x)
#output layer
x = self.fc7(x)
x = F.softmax(x, dim=1)
return x
Here’s my implementation of VGG-16
class MyVGG16(nn.Module):
def __init__(self):
super(MyVGG16, self).__init__()
#nn.Conv2d(input_channels, output_channels, kernel_size, padding)
self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
#VGG originally didn't have batch norm, since it was before batch norm
#was invented. But adding it provides additional performance, so might
#as well.
self.bn1 = nn.BatchNorm2d(64)
self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm2d(128)
self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.bn3 = nn.BatchNorm2d(256)
self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.bn4 = nn.BatchNorm2d(512)
self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.bn5 = nn.BatchNorm2d(512)
#nn.MaxPool2d(kernel_size, stride)
self.maxpool = nn.MaxPool2d(2, stride=2)
self.fc6 = nn.Linear(7*7*512, 4096)
self.bn_fc = nn.BatchNorm1d(4096)
self.fc7 = nn.Linear(4096, 4096)
#Here we change the final output from 1000 to 10. This is because the
#number of outputs here correspond to the number of classes. VGG was
#originally trained for ImageNet, which has 1000 classes. For our purposes,
#we only have 10 classes, so we will put 10 here.
self.fc8 = nn.Linear(4096, 10)
self.dropout = nn.Dropout()
def forward(self, x):
#conv block 1
x = self.conv1_1(x)
x = F.relu(self.bn1(x))
x = self.conv1_2(x)
x = F.relu(self.bn1(x))
x = self.maxpool(x)
#conv block 2
x = self.conv2_1(x)
x = F.relu(self.bn2(x))
x = self.conv2_2(x)
x = F.relu(self.bn2(x))
x = self.maxpool(x)
#conv block 3
x = self.conv3_1(x)
x = F.relu(self.bn3(x))
x = self.conv3_2(x)
x = F.relu(self.bn3(x))
x = self.conv3_3(x)
x = F.relu(self.bn3(x))
x = self.maxpool(x)
#conv block 4
x = self.conv4_1(x)
x = F.relu(self.bn4(x))
x = self.conv4_2(x)
x = F.relu(self.bn4(x))
x = self.conv4_3(x)
x = F.relu(self.bn4(x))
x = self.maxpool(x)
#conv block 5
x = self.conv5_1(x)
x = F.relu(self.bn5(x))
x = self.conv5_2(x)
x = F.relu(self.bn5(x))
x = self.conv5_3(x)
x = F.relu(self.bn5(x))
x = self.maxpool(x)
#Now we need to flatten the tensor so that it'll fit into the FC layer
x = x.reshape(-1, 7 * 7 * 512)
#fc6
x = self.fc6(x)
x = F.relu(self.bn_fc(x))
x = self.dropout(x)
#fc7
x = self.fc7(x)
x = F.relu(self.bn_fc(x))
x = self.dropout(x)
#output layer
x = self.fc8(x)
x = F.softmax(x, dim=1)
return x
Here’s my training code
def train(model, criterion, optimizer, epochs):
print("Training begin!\n")
for epoch in range(epochs): # loop over the dataset multiple times
running_loss = 0.0
model.train()
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 100 == 99: # print every 100 mini-batches
print('[%d, %5d] training loss: %.3f' %
(epoch + 1, i + 1, running_loss / 100))
running_loss = 0.0
print("Validating!\n")
val_loss = 0.0
val_total = 0
val_correct = 0
model.eval()
with torch.no_grad():
#validation step after every epoch
for i, data in enumerate(valloader, 0):
inputs, labels = data[0].to(device), data[1].to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
score, predictions = torch.max(outputs.data, 1)
val_total += labels.size(0)
val_correct += (predictions == labels).sum().item()
val_loss += loss.item()
print('[%d, %5d] val loss: %.3f, val acc: %.3f' %
(epoch + 1, i + 1, val_loss / i + 1, val_correct / val_total))
val_loss = 0.0
val_total = 0
val_correct = 0
print('Finished Training')
Here are my hyperparameters
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = MyVGG16()
model.to(device)
def weights_init(m):
if isinstance(m, nn.Conv2d):
torch.nn.init.xavier_uniform_(m.weight.data)
model.apply(weights_init)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum = 0.9)
train(model, criterion, optimizer, 20)
I’m not exactly sure where I went wrong, so any advice would be greatly appreciated.