Hello.

I am trying to translate a model from PyTorch to TensorFlow. I successfully converted the network architecture and weights from PyTorch to TF, but I found an inconsistency in the batch normalization layers. The results differ considerably between the two models even though they have exactly the same weights. After some investigation I found something I can’t explain.

In PyTorch, when I compute the output of the batch normalization layer directly from the equation, the result is very different from the result obtained from the forward pass.

Can anyone explain this to me?

(Python 3.6, PyTorch 0.4.0)

```
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
class Model(nn.Module):
    """Minimal conv -> batch-norm network used to inspect BatchNorm2d output.

    The network holds a single ``nn.Sequential`` stage (stored in
    ``module_list``) made of a bias-free 3x3 convolution (3 -> 32 channels,
    stride 1, padding 1) followed by ``BatchNorm2d(32)``.  The batch-norm
    scale, shift, running mean and running variance are overwritten with
    values drawn from ``np.random.rand(32)``.
    """

    def __init__(self):
        super(Model, self).__init__()

        # Build the layers first (convolution before batch norm).
        conv_layer = nn.Conv2d(3, 32, 3, 1, 1, bias=False)
        bn_layer = nn.BatchNorm2d(32)

        stage = nn.Sequential()
        stage.add_module('conv_0', conv_layer)
        stage.add_module('batch_norm_0', bn_layer)

        # Fill gamma, beta, running mean and running variance -- in that
        # order -- with fresh uniform samples; copy_ casts them to the
        # dtype of the destination tensor.
        bn_state = (
            bn_layer.weight,
            bn_layer.bias,
            bn_layer.running_mean,
            bn_layer.running_var,
        )
        for tensor in bn_state:
            tensor.data.copy_(torch.from_numpy(np.random.rand(32)))

        self.module_list = nn.ModuleList([stage])

    def forward(self, input):
        """Return ``(conv_output, batch_norm_output)`` for ``input``."""
        stage = self.module_list[0]
        conv_out = stage[0](input)
        return conv_out, stage[1](conv_out)
if __name__ == '__main__':
    # Compare BatchNorm2d's forward output with the inference-time formula
    #     y = (x - running_mean) / sqrt(running_var + eps) * gamma + beta
    model = Model()

    # A freshly constructed nn.Module is in training mode.  In that mode
    # BatchNorm2d normalizes with the mean/variance of the *current batch*
    # (and updates the running estimates), so its output cannot match a
    # formula built from running_mean/running_var.  Switching to eval mode
    # makes the layer use the stored running statistics.
    model.eval()

    bn = model.module_list[0][1]
    # Reshape the per-channel vectors so they broadcast over (N, C, H, W).
    gamma = bn.weight.data.numpy().reshape(1, 32, 1, 1)
    beta = bn.bias.data.numpy().reshape(1, 32, 1, 1)
    mean = bn.running_mean.numpy().reshape(1, 32, 1, 1)
    var = bn.running_var.numpy().reshape(1, 32, 1, 1)

    x = torch.from_numpy(np.random.rand(1, 3, 64, 64)).float()
    # Call the module (not .forward) and skip autograd bookkeeping.
    with torch.no_grad():
        conv_out, bn_out = model(x)
    conv_out = conv_out.numpy()

    # Use the layer's own epsilon rather than a hard-coded copy of it.
    expected = ((conv_out - mean) / np.sqrt(var + bn.eps)) * gamma + beta

    # Mean absolute difference; with eval mode this is at float32
    # round-off level instead of ~0.5.
    print(np.mean(np.abs(bn_out.numpy() - expected)))