Thank you so much for the swift response. Here is one more question pertaining to VGG16. I read through your comments and realized it may be wise to study each layer carefully, since one may need to perform model surgery on the layers, especially when fine-tuning. I hand-coded a VGG16 from scratch and compared it with the "proper version" in the torchvision source code for VGG16. I thought there wasn't any difference besides the fact that the source code uses Sequential and is neater (I would tidy mine up). However, when using the exact same seed (with a fairly robust seeding method), I get different results from my hand-coded version versus the one from the source code, both with pretrained=False.
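For context, my seeding routine looks roughly like the sketch below (reconstructed from memory; the helper name seed_all and the seed value are placeholders rather than the exact code I run):

import os
import random
import numpy as np
import torch

def seed_all(seed: int = 1992) -> None:
    # seed Python, NumPy and PyTorch (CPU and all GPUs)
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # make cuDNN behave deterministically
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False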
class VGG16(torch.nn.Module):
    def __init__(self, init_weights=True):
        super(VGG16, self).__init__()
        self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv2 = torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv3 = torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv4 = torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv5 = torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv6 = torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv7 = torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv8 = torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv9 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv10 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv11 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv12 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.conv13 = torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        # Linear (fully-connected) layers with affine operations y = Wx + b
        self.fc1 = torch.nn.Linear(in_features=25088, out_features=4096, bias=True)
        self.fc2 = torch.nn.Linear(in_features=4096, out_features=4096, bias=True)
        # last layer before softmax - usually called include_top in Keras
        self.fc3 = torch.nn.Linear(in_features=4096, out_features=1000, bias=True)
        # completed 16 weight layers, hence the name VGG16
        self.dropout = torch.nn.Dropout(p=0.5, inplace=False)
        self.activation = torch.nn.ReLU(inplace=True)
        self.avgpool = torch.nn.AdaptiveAvgPool2d((7, 7))
        if init_weights:
            self._initialize_weights()
    def forward(self, input_neurons: torch.Tensor) -> torch.Tensor:
        input_neurons = self.activation(self.conv1(input_neurons))
        input_neurons = self.activation(self.conv2(input_neurons))
        # note here we are using max pooling with stride 2 on the conv2 layer before we proceed to conv3
        input_neurons = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)(self.conv2(input_neurons))
        input_neurons = self.activation(self.conv3(input_neurons))
        input_neurons = self.activation(self.conv4(input_neurons))
        input_neurons = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)(self.conv4(input_neurons))
        input_neurons = self.activation(self.conv5(input_neurons))
        input_neurons = self.activation(self.conv6(input_neurons))
        input_neurons = self.activation(self.conv7(input_neurons))
        input_neurons = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)(self.conv7(input_neurons))
        input_neurons = self.activation(self.conv8(input_neurons))
        input_neurons = self.activation(self.conv9(input_neurons))
        input_neurons = self.activation(self.conv10(input_neurons))
        input_neurons = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)(self.conv10(input_neurons))
        input_neurons = self.activation(self.conv11(input_neurons))
        input_neurons = self.activation(self.conv12(input_neurons))
        input_neurons = self.activation(self.conv13(input_neurons))
        input_neurons = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)(self.conv13(input_neurons))
        # adaptive average pooling layer
        input_neurons = self.avgpool(input_neurons)
        # flatten
        input_neurons = torch.flatten(input_neurons, 1)
        # or equivalently:
        # input_neurons = input_neurons.view(input_neurons.size(0), -1)
        # fully connected layers below
        input_neurons = self.dropout(self.activation(self.fc1(input_neurons)))
        input_neurons = self.dropout(self.activation(self.fc2(input_neurons)))
        input_neurons = self.fc3(input_neurons)
        return input_neurons
    def _initialize_weights(self) -> None:
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d):
                torch.nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, torch.nn.BatchNorm2d):
                torch.nn.init.constant_(m.weight, 1)
                torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, torch.nn.Linear):
                torch.nn.init.normal_(m.weight, 0, 0.01)
                torch.nn.init.constant_(m.bias, 0)
Then I defined a dummy tensor
rand_tensor = torch.ones(8, 3, 64, 64, dtype=torch.float).to(device)
and compared both versions as follows:
vgg16hongnan = vgg(arch='vgg16', pretrained=False, progress=True)
vgg16hongnan = vgg16hongnan.to(device)
vgg16v1 = models.vgg16(pretrained=False)
vgg16v1 = vgg16v1.to(device)
vgg16hongnan(rand_tensor) gives a different answer from vgg16v1(rand_tensor). I reckon I made some layer error in between… But I checked a few times and thought it was fine. PS: I made sure to run both on the same GPU and to clear the cache every time to ensure deterministic results.
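For completeness, the comparison I run looks roughly like the sketch below (assuming the seed_all helper sketched above, and constructing my class directly instead of through the vgg builder just to keep it short); I re-seed before constructing each model so both consume the random number generator from the same state, and I compare the outputs with torch.allclose rather than by eye:

seed_all(1992)  # placeholder seed, as above
vgg16hongnan = VGG16(init_weights=True).to(device)

seed_all(1992)
vgg16v1 = models.vgg16(pretrained=False).to(device)

# put both models in eval mode so dropout does not add extra randomness to the forward pass
vgg16hongnan.eval()
vgg16v1.eval()

with torch.no_grad():
    out_a = vgg16hongnan(rand_tensor)
    out_b = vgg16v1(rand_tensor)

print(torch.allclose(out_a, out_b, atol=1e-6))  # prints False for me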