Thank you so much!! It worked, I was spending days to find out what exactly is being reused . I just need one more small thing to be verified by you. I converted a TF code to pytorch. However, when I print the trainable model params, the pytorch code just gives exactly half of what TF code prints. Following are the codes:
sess = tf.Session()
# placeholders
encoder_inputs = tf.placeholder(shape=(None, input_length), dtype=tf.float32, name='encoder_inputs')
prior_K = tf.placeholder(shape=(None, None), dtype=tf.float32, name='prior_K')
# ----- ENCODER -----
We1 = tf.Variable(
tf.random_uniform((input_length, args.hidden_size), -1.0 / math.sqrt(input_length), 1.0 / math.sqrt(input_length)))
We2 = tf.Variable(tf.random_uniform((args.hidden_size, args.code_size), -1.0 / math.sqrt(args.hidden_size),
1.0 / math.sqrt(args.hidden_size)))
be1 = tf.Variable(tf.zeros([args.hidden_size]))
be2 = tf.Variable(tf.zeros([args.code_size]))
hidden_1 = tf.nn.tanh(tf.matmul(encoder_inputs, We1) + be1)
code = tf.nn.tanh(tf.matmul(hidden_1, We2) + be2)
# kernel on codes
code_K = tf.tensordot(code, tf.transpose(code), axes=1)
# ----- DECODER -----
if tied_weights:
Wd1 = tf.transpose(We2)
Wd2 = tf.transpose(We1)
else:
Wd1 = tf.Variable(tf.random_uniform((args.code_size, args.hidden_size), -1.0 / math.sqrt(args.code_size),
1.0 / math.sqrt(args.code_size)))
Wd2 = tf.Variable(tf.random_uniform((args.hidden_size, input_length), -1.0 / math.sqrt(args.hidden_size),
1.0 / math.sqrt(args.hidden_size)))
bd1 = tf.Variable(tf.zeros([args.hidden_size]))
bd2 = tf.Variable(tf.zeros([input_length]))
if lin_dec:
hidden_2 = tf.matmul(code, Wd1) + bd1
else:
hidden_2 = tf.nn.tanh(tf.matmul(code, Wd1) + bd1)
dec_out = tf.matmul(hidden_2, Wd2) + bd2
# ----- LOSS -----
# kernel alignment loss with normalized Frobenius norm
code_K_norm = code_K / tf.norm(code_K, ord='fro', axis=[-2, -1])
prior_K_norm = prior_K / tf.norm(prior_K, ord='fro', axis=[-2, -1])
k_loss = tf.norm(code_K_norm - prior_K_norm, ord='fro', axis=[-2,-1])
And my converted Pytorch code is:
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.We1 = torch.nn.Parameter(torch.Tensor(input_length, args.hidden_size).uniform_(-1.0 / math.sqrt(input_length), 1.0 / math.sqrt(input_length)))
self.We2 = torch.nn.Parameter(torch.Tensor(args.hidden_size, args.code_size).uniform_(-1.0 / math.sqrt(args.hidden_size), 1.0 / math.sqrt(args.hidden_size)))
self.be1 = torch.nn.Parameter(torch.zeros([args.hidden_size]))
self.be2 = torch.nn.Parameter(torch.zeros([args.code_size]))
def encoder(self, encoder_inputs):
hidden_1 = torch.tanh(torch.matmul(encoder_inputs.float(), self.We1) + self.be1)
code = torch.tanh(torch.matmul(hidden_1, self.We2) + self.be2)
return code
def decoder(self,encoder_inputs):
# hidden_1 = torch.tanh(torch.matmul(encoder_inputs.float(), self.We1) + self.be1)
# code = torch.tanh(torch.matmul(hidden_1, self.We2) + self.be2)
code = self.encoder(encoder_inputs)
# ----- DECODER -----
if tied_weights:
Wd1 = torch.transpose(We2)
Wd2 = torch.transpose(We1)
else:
Wd1 = torch.nn.Parameter(
torch.Tensor(args.code_size, args.hidden_size).uniform_(-1.0 / math.sqrt(args.code_size),
1.0 / math.sqrt(args.code_size)))
Wd2 = torch.nn.Parameter(
torch.Tensor(args.hidden_size, input_length).uniform_(-1.0 / math.sqrt(args.hidden_size),
1.0 / math.sqrt(args.hidden_size)))
bd1 = torch.nn.Parameter(torch.zeros([args.hidden_size]))
bd2 = torch.nn.Parameter(torch.zeros([input_length]))
if lin_dec:
hidden_2 = torch.matmul(code, Wd1) + bd1
else:
hidden_2 = torch.tanh(torch.matmul(code, Wd1) + bd1)
dec_out = torch.matmul(hidden_2, Wd2) + bd2
return dec_out
def kernel_loss(self,code, prior_K):
# kernel on codes
code_K = torch.mm(code, torch.t(code))
# ----- LOSS -----
# kernel alignment loss with normalized Frobenius norm
code_K_norm = code_K / torch.linalg.matrix_norm(code_K, ord='fro', dim=(- 2, - 1))
prior_K_norm = prior_K / torch.linalg.matrix_norm(prior_K, ord='fro', dim=(- 2, - 1))
k_loss = torch.linalg.matrix_norm(torch.sub(code_K_norm,prior_K_norm), ord='fro', dim=(- 2, - 1))
return k_loss
# Initialize model
model = Model()
Do you see anything seriously wrong here? I get exactly half training params and I guess this is affecting the gradients during backprop as well as I am not getting similar results.
Thanks a lot! Regards