I've implemented the same model in PyTorch as in Keras, but despite using the same initialization (Glorot), the same hyper-parameters, optimizer, loss, etc., I get very different results.
Both implementations use pretrained fastText embeddings.
I've read through the forum posts on similar cases (there are a few) and accordingly tried Glorot initialization, zero dropout, and so on.
The dataset is the sentence2phrase part of SemEval-2014 Task 3, which has 500 training rows.
Both models use the same pre-processing (essentially the same CSV file), with zero pre-padding to a fixed length, and both use the pretrained 300-dimensional wiki.en fastText embeddings.
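For reference, the pre-processing is along these lines. This is a simplified sketch rather than the exact script; the file name wiki.en.vec and the helper names (word_index, build_embedding_matrix, pre_pad) are assumptions for illustration:

import numpy as np

EMBEDDING_DIM = 300
MAX_SEQ_LEN = 32  # the fixed padded length (matches the (64, 32) batch shapes noted below)

def build_embedding_matrix(word_index, vec_path='wiki.en.vec'):
    # word_index maps each vocabulary word to an integer id > 0 (0 is reserved for padding)
    vectors = {}
    with open(vec_path, encoding='utf-8') as f:
        next(f)  # skip the fastText header line (vocab size, dimension)
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
    for word, idx in word_index.items():
        if word in vectors:
            matrix[idx] = vectors[word]
    return matrix

def pre_pad(token_ids, max_len=MAX_SEQ_LEN):
    # zero pre-padding to a fixed length (truncating from the front if longer)
    token_ids = list(token_ids)[-max_len:]
    return [0] * (max_len - len(token_ids)) + token_ids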
The evaluation metrics for this task are the Pearson and Spearman correlation coefficients.
With the Keras implementation I get:
pearsonr: 0.6144410047441521 spearman: 0.6066296951306432
With the PyTorch implementation I get:
pearsonr: 0.227 spearman: 0.214
(With various tweaks and modifications I was able to bring these up to ~0.38+, but only with settings that differ from the Keras implementation, such as taking the sum of the hidden states.)
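For reference, these correlations are computed in the usual way from the test-set predictions and the gold scores, e.g. with scipy (the evaluation snippet itself isn't included below):

from scipy.stats import pearsonr, spearmanr

# y_true: gold similarity scores, y_pred: model outputs, both 1-D arrays of the same length
p_corr, _ = pearsonr(y_true, y_pred)
s_corr, _ = spearmanr(y_true, y_pred)
print('pearsonr:', p_corr, 'spearman:', s_corr)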
I'll post the code that matters and try not to include more than is needed; if something is missing, please let me know and I'll update. Thank you!
PyTorch code:
Note: I tried both the framework's MSELoss and the custom implementation below; there was no significant change in loss or correlation.
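To illustrate that note, a quick standalone check (not part of the training code) showing that the custom loss below and nn.MSELoss agree on a 1-D batch:

import torch
import torch.nn as nn

y_pred, y_true = torch.rand(64), torch.rand(64)
custom = torch.mean(torch.pow(y_pred - y_true, 2), dim=-1, keepdim=True)  # shape [1]
framework = nn.MSELoss()(y_pred, y_true)                                  # scalar
assert torch.allclose(custom.squeeze(), framework)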
import torch
import torch.nn as nn

random_state = 0
torch.manual_seed(random_state)


class SiameseModel(nn.Module):
    def __init__(
        self,
        hidden_size: int = None,
        output_size: int = None,
        dropout_in: float = 0.0,
        vocab_size: int = None,
        embedding_dim: int = None,
        pretrained_embeddings=None,
        layers_no: int = 1,
        batch_size: int = None,
        bidi: bool = True
    ):
        super(SiameseModel, self).__init__()
        self.word_embeds = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        # I also tried loading the pretrained fastText embeddings manually:
        # self.word_embeds, num_embeddings, embedding_dim = create_emb_layer(embedding_matrix, non_trainable=True)
        self.sentence_encoder = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=layers_no,
            batch_first=True,
            dropout=dropout_in,
            bidirectional=bidi
        )
        self.directions_no = 2 if bidi else 1
        self.batch_size = batch_size

    def forward(self, x1, x2):
        embed_sent1 = self.word_embeds(x1.long())
        embed_sent2 = self.word_embeds(x2.long())
        v1, (h1, c1) = self.sentence_encoder(embed_sent1)
        v2, (h2, c2) = self.sentence_encoder(embed_sent2)
        # use the final hidden states, reshaped to (batch, layers * directions, hidden)
        v1 = h1.permute((1, 0, 2))
        v2 = h2.permute((1, 0, 2))
        # I also tried using the sum of all the hidden states, which gave better results
        cosine_sim = torch.nn.functional.cosine_similarity(v1, v2, dim=-1, eps=1e-8)
        return cosine_sim
def mean_squared_error(y_pred, y_true):
    return torch.mean(torch.pow(y_pred - y_true, 2), dim=-1, keepdim=True)
vocab_size = vocab_size  # vocab_size (and bidi, used below) are set earlier in the notebook
embedding_dim = 300
hidden_size = 100
output_size = 1
epochs_no = 100
layers_no = 1
lr = 1e-08  # also tried 0.01 and 1e-3
batch_size = 64
log_interval = 1
clip_max_norm = 1.25  # also tried without clipping (None)
model = SiameseModel(
    hidden_size=hidden_size,
    output_size=output_size,
    vocab_size=vocab_size,
    batch_size=batch_size,
    layers_no=layers_no,
    embedding_dim=embedding_dim,
    bidi=bidi,
    pretrained_embeddings=pretrained_embeddings
)

# criterion = nn.MSELoss(reduction='sum')
criterion = mean_squared_error
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
def pre_process_batch(x):
    # convert the two input batches to tensors if needed
    if not isinstance(x[0], torch.Tensor):
        return torch.Tensor(x[0]), torch.Tensor(x[1])
    return x[0], x[1]


def post_process_batch(x, ground_truth=False):
    return x.float().squeeze()


preds = []
def initialize_weights(model):
    if type(model) in [nn.Linear]:
        nn.init.xavier_normal_(model.weight.data)
    elif type(model) in [nn.LSTM, nn.RNN, nn.GRU]:
        nn.init.xavier_normal_(model.weight_hh_l0)
        nn.init.xavier_normal_(model.weight_ih_l0)


model.apply(initialize_weights)
curr_global_step = 0
for epoch in range(epochs_no):
    total_loss = 0.
    model.train()
    total_batch_files = 0
    for batch_idx in range(0, len(X1_trn2), batch_size):
        batch_idx //= batch_size
        curr_global_step += 1
        # batch_x1, batch_x2 sizes are: (64, 32)
        batch_x1 = X1_trn2[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        batch_x2 = X2_trn2[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        # batch_y shape is [64]
        batch_y = Y_trn2[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        batch_y = batch_y if isinstance(batch_y, torch.Tensor) else torch.Tensor(batch_y)
        batch_x = [batch_x1, batch_x2]
        total_batch_files += len(batch_y)
        if pre_process_batch is not None:
            # y_hat shape is [64, 1]
            y_hat = model(*pre_process_batch(batch_x))
        else:
            y_hat = model(torch.Tensor(batch_x))
        if post_process_batch is not None:
            # y_hat shape is [64]
            y_hat = post_process_batch(y_hat, ground_truth=False)
            # batch_y shape is [64]
            batch_y = post_process_batch(batch_y, ground_truth=True)
        loss = criterion(y_hat, batch_y)
        print(curr_global_step, loss.item())
        optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        if clip_max_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_max_norm)
        optimizer.step()
        total_loss += loss.item()
        print('train total_loss', total_loss / len(batch_y))
print('train final epoch total_loss', total_loss / total_batch_files)
model.eval()
total_loss = 0.
total_batch_files = 0
for batch_idx in range(0, len(X1_tst2), batch_size):
    batch_idx //= batch_size
    batch_x1 = X1_tst2[batch_idx * batch_size: (batch_idx + 1) * batch_size]
    batch_x2 = X2_tst2[batch_idx * batch_size: (batch_idx + 1) * batch_size]
    batch_y = Y_tst2[batch_idx * batch_size: (batch_idx + 1) * batch_size]
    batch_y = batch_y if isinstance(batch_y, torch.Tensor) else torch.Tensor(batch_y)
    batch_x = [batch_x1, batch_x2]
    total_batch_files += len(batch_y)
    if pre_process_batch is not None:
        y_hat = model(*pre_process_batch(batch_x))
    else:
        y_hat = model(torch.Tensor(batch_x))
    if post_process_batch is not None:
        # y_hat shape is [64]
        y_hat = post_process_batch(y_hat, ground_truth=False)
        batch_y = post_process_batch(batch_y, ground_truth=True)
    loss = criterion(y_hat, batch_y)
    print(curr_global_step, loss.item())
    total_loss += loss.item()
    print('test total_loss', total_loss / len(batch_y))
print('test final epoch total_loss', total_loss / total_batch_files)
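For completeness, the "sum of all the hidden states" variant mentioned in the comments above replaced the pooling in forward() with something roughly like this (a sketch of the idea, not necessarily the exact code I ran):

def forward(self, x1, x2):
    embed_sent1 = self.word_embeds(x1.long())
    embed_sent2 = self.word_embeds(x2.long())
    out1, _ = self.sentence_encoder(embed_sent1)  # (batch, seq_len, hidden * directions)
    out2, _ = self.sentence_encoder(embed_sent2)
    v1 = out1.sum(dim=1)  # sum over the time dimension
    v2 = out2.sum(dim=1)
    return torch.nn.functional.cosine_similarity(v1, v2, dim=-1, eps=1e-8)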
The Keras implementation is based on the great code from https://github.com/TharinduDR/Siamese-Recurrent-Architectures/blob/master/MALSTM.ipynb, with some modifications that gave the best results on this dataset.
gradient_clipping_norm = 1.25
adam = Adam(clipnorm=gradient_clipping_norm)
validation_portion=0.1
n_hidden=100
embedding_dim=300
batch_size=64
n_epoch=100
optimizer=adam
# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)
# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)
# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden, name="lstm")
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)
# Calculates the distance as defined by the MaLSTM model
# original distance - gave results in the 0.40 range
# malstm_distance = Lambda(function=lambda x: exponent_neg_euclidean_distance(x[0], x[1]),
# output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
# Gave the best results, in 0.61 range.
malstm_distance = Dot(1, normalize=True)([left_output, right_output])
# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])
if load_weights is not None:
    malstm.load_weights(load_weights, by_name=True)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

malstm_trained = malstm.fit(
    [X_train['left'], X_train['right']], Y_train,
    batch_size=batch_size, epochs=n_epoch, verbose=1,
    validation_data=([X_validation['left'], X_validation['right']], Y_validation)
)
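(The exponent_neg_euclidean_distance referenced in the commented-out lines isn't shown here; based on its name it is an exp(-distance) similarity, roughly along these lines, though the notebook's exact definition may differ:)

from keras import backend as K

def exponent_neg_euclidean_distance(left, right):
    # exp(-||left - right||_2): identical vectors give 1, distant vectors approach 0
    return K.exp(-K.sqrt(K.sum(K.square(left - right), axis=1, keepdims=True)))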
Thanks!