I decided to make my own implementation of batch normalisation in the class below:
import torch
import torch.nn as nn


class MyBatchNorm1d(nn.Module):
    def __init__(self, features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.features = features
        self.eps = eps
        self.momentum = momentum
        self.weight = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))
        self.running_mean = torch.zeros(features)
        self.running_var = torch.ones(features)

    def forward(self, x):
        if self.training:
            # Normalise with the statistics of the current batch.
            batch_mean = x.mean(dim=0)
            batch_var = x.var(dim=0, correction=0)
            with torch.no_grad():
                # Update the running statistics as an exponential moving average.
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean.squeeze()
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var.squeeze()
            normalised = (x - batch_mean) / torch.sqrt(batch_var + self.eps)
        else:
            # Normalise with the accumulated running statistics.
            normalised = (x - self.running_mean) / torch.sqrt(self.running_var + self.eps)
        return normalised * self.weight + self.bias
Note that this implementation only handles 2D inputs of shape (batch_size, features).
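For example, it is intended to be applied roughly like this (a minimal sketch using the same shapes as the test below):

layer = MyBatchNorm1d(50)
x = torch.rand(16, 50)  # (batch_size, features)

layer.train()
out_train = layer(x)  # normalised with the statistics of this batch

layer.eval()
out_eval = layer(x)   # normalised with the accumulated running statistics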
I then wrote a unit test to check that my implementation behaves the same way as PyTorch's BatchNorm1d layer.
import unittest


class MyTestCase(unittest.TestCase):
    def test_bn1d(self):
        batch_norm_torch = nn.BatchNorm1d(50).double()
        batch_norm_mine = MyBatchNorm1d(50).double()
        batch_size = 16

        # Tests during training mode
        for i in range(10):
            inp_tensor = (torch.rand(batch_size, 50) * torch.rand(batch_size, 50) * 5
                          + torch.rand(batch_size, 50) * 10).double()
            left = batch_norm_torch(inp_tensor)
            right = batch_norm_mine(inp_tensor)
            self.assertTrue(torch.isclose(torch.norm(left - right), torch.tensor(0.0).double()))  # all of these pass

        # Tests during evaluation mode
        batch_norm_mine.eval()
        batch_norm_torch.eval()
        inp_tensor = (torch.rand(batch_size, 50) * torch.rand(batch_size, 50) * 5
                      + torch.rand(batch_size, 50) * 10).double()
        left = batch_norm_torch(inp_tensor)
        right = batch_norm_mine(inp_tensor)
        # Assertion error here for some reason. Maybe it's implementation differences, or just precision.
        self.assertTrue(torch.isclose(torch.norm(left - right), torch.tensor(0.0).double()))
My implementation clearly matches PyTorch's output in training mode, but after setting the layers to eval mode there is a significant difference between the outputs. I suspect this is caused by the running mean and variance computations. Is this the case, or is there something else at play that I haven't noticed?
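A check that might narrow this down (just a sketch, not part of the test above) would be to compare the running statistics of the two layers directly after the training-mode passes:

# Compare the accumulated running statistics after the training loop.
print(torch.norm(batch_norm_torch.running_mean - batch_norm_mine.running_mean))
print(torch.norm(batch_norm_torch.running_var - batch_norm_mine.running_var))

If the running means agree but the running variances do not, the mismatch would be in the variance update rather than in the normalisation itself.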
The full code can be found here: u7122029/batch-normalisation