Hello.
I am trying to translate a model from PyTorch to TensorFlow. I successfully converted the network architecture and weights from PyTorch to TF, but I found an inconsistency in the batch normalization layers: the outputs differ significantly between two models with exactly the same weights. After some investigation I found something I can't explain.
In PyTorch, when I compute the output of the batch norm layer directly from its equation, the result differs substantially from the output of a forward pass through the same layer.
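For reference, this is the inference-mode equation I am plugging the stored values into (with the nn.BatchNorm2d default $\epsilon = 10^{-5}$):

$$y = \gamma \cdot \frac{x - \hat{\mu}}{\sqrt{\hat{\sigma}^2 + \epsilon}} + \beta,$$

where $\hat{\mu}$ and $\hat{\sigma}^2$ are the layer's running_mean and running_var.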
Can anyone explain this to me?
(Python 3.6, PyTorch 0.4.0)
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.module_list = nn.ModuleList()
        module = nn.Sequential()
        conv = nn.Conv2d(3, 32, 3, 1, 1, bias=False)
        module.add_module('conv_0', conv)
        bn = nn.BatchNorm2d(32)
        module.add_module('batch_norm_0', bn)
        # Fill the affine parameters and running statistics with random
        # values so both frameworks start from identical weights.
        bn.weight.data.copy_(torch.from_numpy(np.random.rand(32)))
        bn.bias.data.copy_(torch.from_numpy(np.random.rand(32)))
        bn.running_mean.copy_(torch.from_numpy(np.random.rand(32)))
        bn.running_var.copy_(torch.from_numpy(np.random.rand(32)))
        self.module_list.append(module)

    def forward(self, input):
        conv = self.module_list[0][0](input)
        bn = self.module_list[0][1](conv)
        return conv, bn


if __name__ == '__main__':
    model = Model()
    bn = model.module_list[0][1]
    # Reshape to (1, C, 1, 1) so the per-channel values broadcast over NCHW.
    gamma = bn.weight.data.numpy().reshape(1, 32, 1, 1)
    beta = bn.bias.data.numpy().reshape(1, 32, 1, 1)
    mean = bn.running_mean.numpy().reshape(1, 32, 1, 1)
    var = bn.running_var.numpy().reshape(1, 32, 1, 1)

    x = Variable(torch.from_numpy(np.random.rand(1, 3, 64, 64)).float())
    conv_out, bn_out = model(x)
    conv_out = conv_out.data.numpy()

    # Batch norm computed by hand from the stored statistics.
    manual_bn = ((conv_out - mean) / np.sqrt(var + 1e-05)) * gamma + beta

    # Mean absolute difference -- I expect this to be negligible, but it is ~0.5.
    print(np.sum(np.abs(bn_out.data.numpy() - manual_bn)) / np.prod(manual_bn.shape))
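One check that might narrow this down, assuming the mismatch is a train/eval issue: a freshly constructed module is in training mode, where BatchNorm2d normalizes with the current batch's statistics and updates the running buffers in place, rather than using the stored ones. A minimal sketch of that check, run after the script above:

# Switch to eval mode, where BatchNorm2d uses the stored running
# statistics -- the same quantities the manual equation uses.
model.eval()
conv_out, bn_out = model(x)
conv_out = conv_out.data.numpy()

# Re-read the running statistics: the earlier training-mode forward
# pass has updated them in place.
mean = bn.running_mean.numpy().reshape(1, 32, 1, 1)
var = bn.running_var.numpy().reshape(1, 32, 1, 1)

manual_bn = ((conv_out - mean) / np.sqrt(var + 1e-05)) * gamma + beta
# If the train/eval assumption is right, this difference should drop
# to around float32 precision instead of ~0.5.
print(np.sum(np.abs(bn_out.data.numpy() - manual_bn)) / np.prod(manual_bn.shape))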