Hi,
I have a trained model in Keras (TensorFlow backend) and want to transfer its weights to a PyTorch model. When I do this, the PyTorch model does not perform as well as the Keras model; even the forward pass produces slightly different results. To nail the problem down, I created a small toy example to check whether the issue can be reproduced, and it can, with the following script:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import keras
import keras.backend as K
import torch
from torch import nn
import numpy as np
import random
import tensorflow as tf
np.set_printoptions(precision=12)
tf.set_random_seed(42)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
def keras_to_pyt(km, pm):
    # Build a dict of PyTorch-style parameter names -> numpy arrays, converting
    # Keras Conv2D kernels (H, W, in, out) to PyTorch layout (out, in, H, W)
    # and Dense kernels (in, out) to (out, in).
    weight_dict = dict()
    for layer in km.layers:
        if type(layer) is keras.layers.convolutional.Conv2D:
            weight_dict[layer.get_config()['name'] + '.weight'] = np.transpose(layer.get_weights()[0], (3, 2, 0, 1))
            weight_dict[layer.get_config()['name'] + '.bias'] = layer.get_weights()[1]
        elif type(layer) is keras.layers.Dense:
            weight_dict[layer.get_config()['name'] + '.weight'] = np.transpose(layer.get_weights()[0], (1, 0))
            weight_dict[layer.get_config()['name'] + '.bias'] = layer.get_weights()[1]
    # Copy the converted weights into the PyTorch model.
    pyt_state_dict = pm.state_dict()
    for key in pyt_state_dict.keys():
        pyt_state_dict[key] = torch.from_numpy(weight_dict[key])
    pm.load_state_dict(pyt_state_dict)
    return pm
inp = np.random.normal(size=(1, 1, 5, 6)).astype(dtype=np.float32)
inp_pyt = torch.autograd.Variable(torch.from_numpy(inp.copy()).float())
inp_keras = np.transpose(inp.copy(), (0, 2, 3, 1))
a = keras.Input(shape=(5, 6, 1), name='input')
b = keras.layers.Conv2D(2, (3, 4), activation='linear', padding='same', name='conv_1', bias_initializer='random_uniform')(a)
keras_model = keras.models.Model(inputs=a, outputs=b)
class PyNet(nn.Module):
    def __init__(self):
        super(PyNet, self).__init__()
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(3, 4), padding=0)

    def forward(self, x):
        # Explicit ZeroPad2d(left=1, right=2, top=1, bottom=1) reproduces
        # Keras 'same' padding for a (3, 4) kernel with stride 1.
        return self.conv_1(nn.ZeroPad2d((1, 2, 1, 1))(x))
pyt_model = PyNet()
keras_result = keras_model.predict(x=inp_keras, verbose=1)
pyt_model = keras_to_pyt(keras_model, pyt_model)
pyt_res = np.transpose(pyt_model(inp_pyt).data.numpy(), (0, 2, 3, 1))
for i in range(1):
    for j in range(5):
        for k in range(6):
            for l in range(2):
                print(keras_result[i, j, k, l], pyt_res[i, j, k, l])
The script prints the following (Keras value, PyTorch value) pairs:
(-0.044850744, -0.044850744)
(-0.006462127, -0.0064621493)
(0.017427735, 0.017427728)
(0.37132108, 0.37132108)
(-0.23686403, -0.23686403)
(-0.90041882, -0.90041882)
(-0.065025821, -0.065025814)
(0.23733595, 0.23733595)
(-0.11710706, -0.11710706)
(0.24237445, 0.24237445)
(-0.061176896, -0.061176896)
(-0.056127474, -0.056127474)
(0.11819147, 0.11819144)
(0.10230125, 0.10230123)
(0.60257965, 0.60257971)
(0.91219217, 0.91219229)
(-0.21988741, -0.21988741)
(-0.94492501, -0.94492507)
(-0.25544429, -0.25544426)
(0.80861402, 0.80861402)
(-0.32262391, -0.32262391)
(0.29116583, 0.2911658)
(-0.063009739, -0.063009739)
(-0.099875651, -0.099875651)
(0.34689391, 0.34689391)
(0.86204314, 0.86204308)
(-0.15171689, -0.1517169)
(0.54282308, 0.5428232)
(0.002491869, 0.0024918541)
(-0.43892303, -0.43892306)
(0.25317714, 0.25317714)
(-0.15906075, -0.15906072)
(-0.12131988, -0.12131988)
(0.27651906, 0.27651903)
(0.19103783, 0.19103783)
(-0.28911468, -0.28911468)
(-0.8152504, -0.8152504)
(0.62633103, 0.62633109)
(-0.70274156, -0.70274156)
(-0.22379526, -0.22379526)
(0.043730669, 0.043730669)
(-0.87990582, -0.87990582)
(-0.27177343, -0.27177346)
(-0.0016308948, -0.0016309395)
(0.48494944, 0.48494944)
(0.15391195, 0.15391195)
(-0.062737577, -0.062737577)
(-0.19160414, -0.19160414)
(0.13429669, 0.13429666)
(0.40462545, 0.40462548)
(0.65125835, 0.65125835)
(-0.49792019, -0.49792019)
(-0.1081684, -0.10816839)
(-0.20262283, -0.2026228)
(-0.37794596, -0.37794593)
(-0.21728748, -0.21728748)
(-0.33614561, -0.33614561)
(0.56259048, 0.56259054)
(0.090251423, 0.090251423)
(-0.32884693, -0.3288469)
As you can see, some of the values differ, with relative differences on the order of 10^-5 (a compact way to summarize the mismatch is sketched below). This could just be floating point error, but it shows up consistently with every model I try, and if a single layer already behaves like this, the discrepancy will only compound in a large model made up of many layers.
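A compact way to summarize the mismatch, using the keras_result and pyt_res arrays from the script above, would be something like:

# Largest element-wise gap between the two outputs, plus an allclose check.
diff = np.abs(keras_result - pyt_res)
print('max abs diff:', diff.max())
print('allclose at atol=1e-5:', np.allclose(keras_result, pyt_res, atol=1e-5))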
Is there any way to avoid losing floating point accuracy like this when transferring weights, or should I train the model from scratch in PyTorch?