I’ve tested it against TensorFlow/Keras outputs, and it does not work (the outputs differ). I’ve found the following conversion to work for a GRU layer (TensorFlow r2.4rc0 `tf.keras.layers.GRU` to PyTorch 1.9.0 `torch.nn.GRU`). A conversion is needed because the layouts differ: TF Keras concatenates the gates in (z, r, h) order and stores kernels as (input_size, 3 * units), while PyTorch uses (r, z, n) order and (3 * units, input_size); also, TF2's default `reset_after=True` keeps separate input and recurrent biases.
- Save the weights from the TensorFlow model (for example to an `.npz` file):
```python
import random

import numpy as np
import tensorflow as tf

# Fix all RNG seeds so the run is reproducible
SEED = 1995
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
gru = tf.keras.layers.GRU(
    units=5,
    return_sequences=True,
    kernel_initializer=tf.keras.initializers.GlorotUniform(seed=SEED),
    recurrent_initializer=tf.keras.initializers.Orthogonal(seed=SEED),
    bias_initializer=tf.keras.initializers.Zeros()
)  # note: reset_after=True by default in TF2, so the bias has shape (2, 3 * units)
y_tf = gru(tf.ones((1, 3, 5)), training=False)  # forward pass with ones

np.savez(
    'tf_model_weights.npz',
    gru_kernel=gru.weights[0].numpy(),            # (input_size, 3 * units)
    gru_recurrent_kernel=gru.weights[1].numpy(),  # (units, 3 * units)
    gru_bias=gru.weights[2].numpy()               # (2, 3 * units)
)
```
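As a quick optional sanity check (this snippet is my addition, assuming the filename and keys above, and TF2's default `reset_after=True`), the saved arrays should have these shapes:

```python
import numpy as np

w = np.load('tf_model_weights.npz')
# TF Keras GRU concatenates the gates in (z, r, h) order along the last axis
assert w['gru_kernel'].shape == (5, 15)            # (input_size, 3 * units)
assert w['gru_recurrent_kernel'].shape == (5, 15)  # (units, 3 * units)
assert w['gru_bias'].shape == (2, 15)              # input and recurrent biases (reset_after=True)
```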
- Load weights into PyTorch model:
```python
import random as r

import numpy as np
import torch

# Fix all RNG seeds and print with 8 decimals so outputs can be compared against TF
SEED = 1995
torch.set_printoptions(precision=8)
r.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

npz_weights = np.load('tf_model_weights.npz')
def convert_input_kernel(kernel):
    # TF kernel: (input_size, 3 * units) with gates in (z, r, h) order;
    # PyTorch weight_ih: (3 * units, input_size) with gates in (r, z, n) order
    kernel_z, kernel_r, kernel_h = np.hsplit(kernel, 3)
    return np.concatenate((kernel_r.T, kernel_z.T, kernel_h.T))

def convert_recurrent_kernel(kernel):
    # Same transpose-and-reorder for the recurrent kernel (units, 3 * units)
    kernel_z, kernel_r, kernel_h = np.hsplit(kernel, 3)
    return np.concatenate((kernel_r.T, kernel_z.T, kernel_h.T))

def convert_bias(bias):
    # TF bias with reset_after=True: (2, 3 * units) = (input bias, recurrent bias);
    # reorder each row's gates from (z, r, h) to (r, z, h)
    bias = bias.reshape(2, 3, -1)
    return bias[:, [1, 0, 2], :].reshape((2, -1))
gru = torch.nn.GRU(
    hidden_size=5,
    input_size=5,
    num_layers=1,
    bidirectional=False,
    batch_first=True
)
for pn, p in gru.named_parameters():
    if 'weight_ih' in pn:
        p.data = torch.from_numpy(convert_input_kernel(npz_weights['gru_kernel']))
    elif 'weight_hh' in pn:
        p.data = torch.from_numpy(convert_recurrent_kernel(npz_weights['gru_recurrent_kernel']))
    elif 'bias_ih' in pn:
        p.data = torch.from_numpy(convert_bias(npz_weights['gru_bias'])[0])  # input bias
    else:
        p.data = torch.from_numpy(convert_bias(npz_weights['gru_bias'])[1])  # recurrent bias
```
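If you'd rather verify the conversion numerically than by eye, one option is to also save the TF output in the `.npz` (say under a `y_tf` key, which the script above does not do) and compare with `np.allclose`:

```python
# Hypothetical check: assumes y_tf was saved alongside the weights, e.g.
#   np.savez('tf_model_weights.npz', ..., y_tf=y_tf.numpy())
gru.eval()
with torch.no_grad():
    y_pt, _ = gru(torch.ones(1, 3, 5))
print(np.allclose(npz_weights['y_tf'], y_pt.numpy(), atol=1e-6))  # should print True
```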
- Test output:
TensorFlow:
```python
>>> y_tf
<tf.Tensor: shape=(1, 3, 5), dtype=float32, numpy=
array([[[-0.36656722, -0.4693069 , -0.16722648,  0.36081928,  0.1643753 ],
        [-0.4628504 , -0.6815055 , -0.18605384,  0.58125013,  0.2494137 ],
        [-0.48067108, -0.7698146 , -0.16238967,  0.70518744,  0.3005259 ]]], dtype=float32)>
```
PyTorch:
```python
>>> gru.eval()
GRU(5, 5, batch_first=True)
>>> y_pt, _ = gru(torch.ones(1, 3, 5))
>>> y_pt
tensor([[[-0.36656719, -0.46930692, -0.16722649,  0.36081928,  0.16437532],
         [-0.46285039, -0.68150550, -0.18605389,  0.58125013,  0.24941370],
         [-0.48067111, -0.76981461, -0.16238970,  0.70518738,  0.30052590]]],
       grad_fn=<TransposeBackward1>)
```
The outputs match to within float32 precision. Hope someone finds this helpful.