I’m trying to implement batch normalization in PyTorch and apply it to a VGG16 network. Here’s my BatchNorm module:
import torch
import torch.nn as nn
from torch.distributions import uniform

class BatchNorm(nn.Module):
    def __init__(self, input, mode, momentum=0.9, epsilon=1e-05):
        '''
        input: number of channels; assumes 4D input (mini_batch_size, # channels, w, h)
        momentum: momentum for the exponential moving average
        '''
        super(BatchNorm, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.momentum = momentum
        self.run_mode = 0  # 0: training, 1: testing
        self.insize = input
        self.epsilon = epsilon
        # initialize weight (gamma), bias (beta), running mean and variance
        U = uniform.Uniform(torch.tensor([0.0]), torch.tensor([1.0]))
        self.weight = nn.Parameter(U.sample(torch.Size([self.insize])).view(self.insize))
        self.bias = nn.Parameter(torch.zeros(self.insize))
        self.running_mean = torch.zeros(self.insize)
        self.running_var = torch.ones(self.insize)
    def forward(self, input, mode):
        if mode == 0:  # training: normalize with batch statistics
            mean = input.mean([0, 2, 3])  # per-channel mean over batch, w, h
            var = input.var([0, 2, 3])
            # update running mean and var (exponential moving average)
            running_mean_current = self.momentum * self.running_mean
            running_mean_current = running_mean_current.to(self.device)
            self.running_mean = running_mean_current + (1.0 - self.momentum) * mean
            running_var_current = self.momentum * self.running_var
            running_var_current = running_var_current.to(self.device)
            # use the unbiased variance for the running estimate
            self.running_var = running_var_current + (1.0 - self.momentum) * (
                input.shape[0] / (input.shape[0] - 1) * var)
            # reshape to (1, C, 1, 1) so the statistics broadcast over the input
            current_mean = mean.view([1, self.insize, 1, 1]).expand_as(input)
            current_var = var.view([1, self.insize, 1, 1]).expand_as(input)
            current_weight = self.weight.view([1, self.insize, 1, 1]).expand_as(input)
            current_bias = self.bias.view([1, self.insize, 1, 1]).expand_as(input)
            # normalize, then scale and shift
            y = current_weight * (input - current_mean) / (
                current_var + self.epsilon).sqrt() + current_bias
        else:  # testing: normalize with the running statistics
            mean = self.running_mean
            var = self.running_var
            current_mean = mean.view([1, self.insize, 1, 1]).expand_as(input)
            current_var = var.view([1, self.insize, 1, 1]).expand_as(input)
            current_weight = self.weight.view([1, self.insize, 1, 1]).expand_as(input)
            current_bias = self.bias.view([1, self.insize, 1, 1]).expand_as(input)
            y = current_weight.data.cpu() * (input.data.cpu() - current_mean) / (
                current_var + self.epsilon).sqrt() + current_bias.data.cpu()
            y = y.cuda()
        return y
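As a sanity check, the module does run standalone on a dummy batch (a quick sketch; the shapes here are arbitrary and mode=0 selects the training path):

    bn = BatchNorm(4, mode=0)      # 4 channels, just for the test
    x = torch.randn(8, 4, 6, 6)    # (mini_batch_size, channels, w, h)
    y = bn(x, mode=0)              # training-mode forward pass
    print(y.shape)                 # torch.Size([8, 4, 6, 6])
    print(bn.running_mean.shape)   # torch.Size([4])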
And here is how the customized batch norm is called in the VGG16 network:
import torch.nn.functional as F
import batchnorm  # the file containing the BatchNorm class above

class VGG(nn.Module):
    def __init__(self):
        super(VGG, self).__init__()
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.bn1_1 = batchnorm.BatchNorm(64, mode=0)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.bn1_2 = batchnorm.BatchNorm(64, mode=0)
        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn2_1 = batchnorm.BatchNorm(128, mode=0)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn2_2 = batchnorm.BatchNorm(128, mode=0)
        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn3_1 = batchnorm.BatchNorm(256, mode=0)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.bn3_2 = batchnorm.BatchNorm(256, mode=0)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.bn3_3 = batchnorm.BatchNorm(256, mode=0)
        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.bn4_1 = batchnorm.BatchNorm(512, mode=0)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn4_2 = batchnorm.BatchNorm(512, mode=0)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn4_3 = batchnorm.BatchNorm(512, mode=0)
        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn5_1 = batchnorm.BatchNorm(512, mode=0)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn5_2 = batchnorm.BatchNorm(512, mode=0)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.bn5_3 = batchnorm.BatchNorm(512, mode=0)
        self.avgpool = nn.AvgPool2d(kernel_size=1, stride=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.classifier = nn.Linear(512, 10)
    def forward(self, x, mode):
        out = F.relu(self.bn1_1(self.conv1_1(x), mode))
        out = self.pool(F.relu(self.bn1_2(self.conv1_2(out), mode)))
        out = F.relu(self.bn2_1(self.conv2_1(out), mode))
        out = self.pool(F.relu(self.bn2_2(self.conv2_2(out), mode)))
        out = F.relu(self.bn3_1(self.conv3_1(out), mode))
        out = F.relu(self.bn3_2(self.conv3_2(out), mode))
        out = self.pool(F.relu(self.bn3_3(self.conv3_3(out), mode)))
        out = F.relu(self.bn4_1(self.conv4_1(out), mode))
        out = F.relu(self.bn4_2(self.conv4_2(out), mode))
        out = self.pool(F.relu(self.bn4_3(self.conv4_3(out), mode)))
        out = F.relu(self.bn5_1(self.conv5_1(out), mode))
        out = F.relu(self.bn5_2(self.conv5_2(out), mode))
        out = self.avgpool(self.pool(F.relu(self.bn5_3(self.conv5_3(out), mode))))
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out
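For context, my training step looks roughly like this (a sketch; device, train_loader, and the hyperparameters are placeholders for my actual setup):

    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    for x, target in train_loader:
        x, target = x.to(device), target.to(device)
        optimizer.zero_grad()
        out = model(x, mode=0)  # mode=0 -> batch stats + running-stat update
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()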
However, I found that no matter how long the network trains, running_mean and running_var stay at their initial values (all zeros and all ones, respectively).
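This is roughly how I inspect the statistics after training (taking bn1_1 as an example; the other layers behave the same way):

    print(model.bn1_1.running_mean)  # still all zeros
    print(model.bn1_1.running_var)   # still all ones

I can’t figure out which part I’m missing. Any help would be appreciated!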