Batch normalization: forward pass vs. computing it by hand

Hello.

I'm trying to translate a model from PyTorch to TensorFlow. I successfully converted the network architecture and weights from PyTorch to TF, but I found an inconsistency in the batch normalization layers: the outputs differ significantly between the two models even though they have exactly the same weights. After some investigation I found something I can't explain.

In PyTorch, when I compute the output of a BatchNorm layer directly from the equation, the result is very different from the result of the layer's forward pass.
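For reference, this is the standard inference-time batch norm equation I am computing (eps is PyTorch's default 1e-5, matching the torch.nn.BatchNorm2d documentation):

y = gamma * (x - running_mean) / sqrt(running_var + eps) + beta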

Can anyone explain this to me?

(Python 3.6, PyTorch 0.4.0)

import numpy as np
import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.module_list = nn.ModuleList()
        module = nn.Sequential()

        conv = nn.Conv2d(3, 32, 3, 1, 1, bias=False)
        module.add_module('conv_0', conv)

        bn = nn.BatchNorm2d(32)
        module.add_module('batch_norm_0', bn)

        # Fill the affine parameters and the running statistics with random
        # values so that the manual check below is non-trivial.
        gamma = torch.from_numpy(np.random.rand(32))
        bn.weight.data.copy_(gamma)

        beta = torch.from_numpy(np.random.rand(32))
        bn.bias.data.copy_(beta)

        mean = torch.from_numpy(np.random.rand(32))
        bn.running_mean.copy_(mean)

        var = torch.from_numpy(np.random.rand(32))
        bn.running_var.copy_(var)

        self.module_list.append(module)

    def forward(self, input):
        # Return both the conv output and the BN output so that the BN
        # computation can be checked by hand outside the model.
        conv = self.module_list[0][0](input)
        bn = self.module_list[0][1](conv)
        return conv, bn


if __name__ == '__main__':
    model = Model()
    bn = model.module_list[0][1]
    gamma = bn.weight.data.numpy().reshape(1, 32, 1, 1)
    beta = bn.bias.data.numpy().reshape(1, 32, 1, 1)
    mean = bn.running_mean.numpy().reshape(1, 32, 1, 1)
    var = bn.running_var.numpy().reshape(1, 32, 1, 1)

    x = torch.from_numpy(np.random.rand(1, 3, 64, 64)).float()
    conv_out, bn_out = model(x)

    conv_out = conv_out.data.numpy()
    # Apply the BN inference equation by hand (eps is PyTorch's default 1e-5).
    bn_manual = ((conv_out - mean) / np.sqrt(var + 1e-05)) * gamma + beta

    # I expect this mean absolute difference to be negligible, but it is ~0.5.
    print(np.sum(np.abs(bn_out.data.numpy() - bn_manual)) / np.prod(bn_manual.shape))

Is your model in eval mode?
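For anyone who lands here with the same problem: in training mode (the default), BatchNorm2d normalizes each batch with that batch's own mean and variance and merely updates running_mean/running_var; the stored running statistics are only used for normalization after switching to eval mode. A minimal sketch of the fix, reusing the Model class from the post above:

import numpy as np
import torch

model = Model()  # the Model class defined in the original post
model.eval()     # BatchNorm2d now normalizes with running_mean/running_var

x = torch.from_numpy(np.random.rand(1, 3, 64, 64)).float()
conv_out, bn_out = model(x)
# Repeating the manual computation from the original post now gives a
# difference at float precision instead of ~0.5.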


Thank you! Now it works just as I expected. 🙂

Thank you for the good solution!