I’m seeing a consistent change in results when I make a trivial change to my code.
Original version:
import torch.nn as nn

class LeNet(nn.Module):
    def __init__(self, nfilters=32, nclasses=10, linear=128):
        super(LeNet, self).__init__()
        self.linear1 = nn.Linear(nfilters*5*5, linear)
        self.linear2 = nn.Linear(linear, nclasses)
        self.dropout = nn.Dropout()
        self.act = nn.ReLU(inplace=True)
        self.batch_norm = nn.BatchNorm1d(linear)
        self.first_layers = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=nfilters, kernel_size=5),
            nn.BatchNorm2d(nfilters),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            self.act,
            nn.Conv2d(in_channels=nfilters, out_channels=nfilters, kernel_size=5),
            nn.BatchNorm2d(nfilters),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            self.act,
        )

    def forward(self, x):
        x = self.first_layers(x)
        x = x.view(x.size(0), -1)
        x - self.dropout(x)
        x = self.linear1(x)
        x = self.batch_norm(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x
Modified version:
class LeNet(nn.Module):
    def __init__(self, nfilters=32, nclasses=10, linear=128):
        super(LeNet, self).__init__()
        self.linear1 = nn.Linear(nfilters*5*5, linear)
        self.linear2 = nn.Linear(linear, nclasses)
        self.dropout = nn.Dropout()
        self.act = nn.ReLU(inplace=True)
        self.batch_norm = nn.BatchNorm1d(linear)
        self.first_layers = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=nfilters, kernel_size=5),
            nn.BatchNorm2d(nfilters),
            nn.MaxPool2d(kernel_size=2, stride=2),
            self.act,
            nn.Conv2d(in_channels=nfilters, out_channels=nfilters, kernel_size=5),
            nn.BatchNorm2d(nfilters),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            self.act,
        )
        self.last_layers = nn.Sequential(
            self.dropout,
            self.linear1,
            self.batch_norm,
            self.act,
            self.dropout,
            self.linear2,
        )

    def forward(self, x):
        x = self.first_layers(x)
        x = x.view(x.size(0), -1)
        x = self.last_layers(x)
        return x
As you can see, the only change is wrapping the last layers in an nn.Sequential container. Here are the results:
First version:
Epoch 0 Train: Loss 0.29 Accuracy 93.96 Test: Loss 0.05 Accuracy 98.73
Epoch 1 Train: Loss 0.10 Accuracy 97.05 Test: Loss 0.03 Accuracy 98.90
Epoch 2 Train: Loss 0.08 Accuracy 97.85 Test: Loss 0.03 Accuracy 98.98
Second version (using self.last_layers):
Epoch 0 Train: Loss 0.41 Accuracy 90.10 Test: Loss 0.06 Accuracy 98.46
Epoch 1 Train: Loss 0.16 Accuracy 95.42 Test: Loss 0.04 Accuracy 98.69
Epoch 2 Train: Loss 0.12 Accuracy 96.28 Test: Loss 0.04 Accuracy 98.84
As you can see, the first version trains faster. The difference might look slight, but it’s consistent, whereas repeated runs of the same version produce almost identical results. I tracked this down to the dropout layers: if I comment out the dropout layers in self.last_layers and in forward, the difference disappears.
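For anyone who wants to reproduce this, here is a minimal sanity check I’d suggest, assuming the two classes above are renamed LeNetOriginal and LeNetSequential so they can coexist in one script (those names are mine, not from my training code). With identical weights and dropout disabled via eval(), the two versions should produce identical outputs, so the nn.Sequential wrapper by itself changes nothing:

import torch

a = LeNetOriginal()
b = LeNetSequential()
# strict=False because b's state dict has extra last_layers.* keys; those
# entries alias the very same tensors as linear1/linear2/batch_norm, since
# nn.Sequential here re-registers existing modules rather than copying them.
b.load_state_dict(a.state_dict(), strict=False)
print(b.linear1.weight is b.last_layers[1].weight)  # True: shared parameters

a.eval()  # eval() makes dropout a no-op and uses batch-norm running stats
b.eval()

x = torch.randn(8, 1, 32, 32)  # 32x32 input, which is what nfilters*5*5 in linear1 implies
with torch.no_grad():
    print(torch.allclose(a(x), b(x)))  # True: identical in eval mode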
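A quicker way to see the dropout effect without editing the model is to put both models in train() mode and seed the RNG identically before each forward pass. Even with shared weights and the same random state, the outputs still differ, consistent with the dropout layers being responsible (again a sketch; x, a, and b come from the snippet above):

a.train()
b.train()

torch.manual_seed(123)
ya = a(x)
torch.manual_seed(123)  # reset to the same RNG state for the second model
yb = b(x)
print(torch.allclose(ya, yb))  # False: in train mode the dropout layers behave differently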