Question about Batch Normalization

The same model without the two self.bn*.forward calls (self.bn1 and self.bn2) drops to random-guessing accuracy… does that make sense?

import torch.nn as nn
import torch.nn.functional as F


class WideNet(nn.Module):
    def __init__(self):
        super(WideNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
#         self.conv3 = nn.Conv2d(10, 20, kernel_size=2)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(5120, 500)
        self.fcmid = nn.Linear(500, 50)
        self.fc2 = nn.Linear(50, 10)
        self.bn1 = nn.BatchNorm2d(10)
        self.bn2 = nn.BatchNorm2d(20)

    def forward(self, x):
        x = F.leaky_relu(F.max_pool2d(self.conv1(x), 2))
        x = self.bn1.forward(x)
        x = F.upsample_bilinear(x, size=(16, 16))
        x = F.leaky_relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = self.bn2.forward(x)
        x = F.upsample_bilinear(x, size=(16, 16))
        x = x.view(-1, 5120)
        x = F.leaky_relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = F.leaky_relu(self.fcmid(x))
        x = F.dropout(x, training=self.training)
        x = F.leaky_relu(self.fc2(x))
        return F.log_softmax(x, dim=1)
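
For reference, a quick shape check (assuming 28x28 MNIST-style input, which is what fc1's 5120 input features suggest) confirms the tensor sizes through the network:

import torch

model = WideNet()
x = torch.randn(4, 1, 28, 28)   # batch of 4 single-channel 28x28 images
out = model(x)
print(out.shape)                # torch.Size([4, 10])
# conv1 (5x5): 28 -> 24, max_pool2d: 24 -> 12, upsample: 12 -> 16
# conv2 (5x5): 16 -> 12, max_pool2d: 12 -> 6,  upsample: 6 -> 16
# flatten: 20 channels * 16 * 16 = 5120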

That is what batch normalization is for. When you remove the BatchNorm2d layers, I'd guess your net suffers from vanishing gradients and saturation during training. At test time the input to the upsample layer has a different scale, so most neurons end up dead or saturated.
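
For intuition, BatchNorm2d pulls each channel back to roughly zero mean and unit variance, so downstream layers see a stable scale at both train and test time. A minimal illustration:

import torch
import torch.nn as nn

bn = nn.BatchNorm2d(10)
x = torch.randn(64, 10, 12, 12) * 5 + 3   # activations with a large scale and offset
y = bn(x)
print(x.mean().item(), x.std().item())    # roughly 3 and 5
print(y.mean().item(), y.std().item())    # roughly 0 and 1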

Not sure; leaky_relu doesn't seem to have that problem, and the bilinear upsample is just local interpolation, so I'm not sure that's the cause.
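
(For what it's worth, leaky_relu does keep a small gradient on the negative side, which is why it doesn't die the way plain ReLU can; a quick check:)

import torch
import torch.nn.functional as F

x = torch.tensor([-100.0, -1.0, 1.0], requires_grad=True)
F.leaky_relu(x).sum().backward()
print(x.grad)   # tensor([0.0100, 0.0100, 1.0000]) -- default negative_slope is 0.01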

The issue is that accuracy drops to 10%, i.e. chance level on 10 classes. The way I fixed it is:

    def forward(self, x):
        x = self.prelu_ac[0](self.bn[0](F.max_pool2d(self.conv1(x), 2)))
        x = F.upsample_bilinear(x, size=(16, 16))
        x = self.prelu_ac[1](self.bn[1](F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)))
        x = F.upsample_bilinear(x, size=(16, 16))
        x = x.view(-1, 5120)
        x = self.prelu_ac[2](self.bn[2](self.drop1(self.fc1(x))))
        x = F.dropout(x, training=self.training)
        x = self.prelu_ac[3](self.bn[3](self.drop2(self.fcmid(x))))
        x = F.dropout(x, training=self.training)
        x = self.prelu_ac[4](self.bn[4](self.fc2(x)))
        return F.log_softmax(x, dim=1)
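
The matching __init__ isn't shown above; here is one possible version that would fit this forward (the ModuleList layout and layer sizes are my assumptions, based on the earlier WideNet):

    def __init__(self):
        super(WideNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(5120, 500)
        self.fcmid = nn.Linear(500, 50)
        self.fc2 = nn.Linear(50, 10)
        self.drop1 = nn.Dropout()   # assumed: plain dropout on fc1's output
        self.drop2 = nn.Dropout()   # assumed: plain dropout on fcmid's output
        # one BatchNorm per block, sized to match each layer's output
        self.bn = nn.ModuleList([
            nn.BatchNorm2d(10), nn.BatchNorm2d(20),
            nn.BatchNorm1d(500), nn.BatchNorm1d(50), nn.BatchNorm1d(10),
        ])
        # one learnable PReLU activation per block
        self.prelu_ac = nn.ModuleList([nn.PReLU() for _ in range(5)])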

Any tips for improvement? Should I drop the upsample? I think it helps the second conv layer.