So i’ve implemented in PyTorch the same code as in Keras, despite using the same initialization (glorot) in PyTorch, same hyper-parameters, optimizer, loss etc…
I get much different results.
Both implementation use fastText pretrained embeddings.
I’ve read through the forum on similar cases (few posts) and thus tried initialization of glorot, 0 dropout, etc.
The dataset used is
SemEval-2014 Task-3 , the
sentence2phrase part of it.
Which holds 500 training rows.
Both models use the same pre-processing (basically the same csv) file. Which has pre-0-padding, fixed length. Both use pretrained 300-dim wiki.en fastText embeddings.
The evaluation in such task is the pearson and spearman correlation coefficient.
The one i get on the Keras implementation is:
pearsonr: 0.6144410047441521 spearman: 0.6066296951306432
The one i get on the PyTorch implementation is:
pearsonr: 0.227 spearman: 0.214
(In the different tweaks and modifications i’ve made i was able to bring them to ~0.38+ but on different settings as the Keras implementation, such as taking the sum of the hidden states).
I’ll try to publish the code that matters first, not to put more code than needed, if something is missing, please let me know and i’ll update. Appreciate it.
Notes: I tried using both the framework’s MSELoss and the following implementation - there was no significant change in loss and correlation.
random_state = 0 torch.manual_seed(random_state) class SiameseModel(nn.Module): def __init__( self, hidden_size:int=None, output_size:int=None, dropout_in:int=0.0, vocab_size:int=None, embedding_dim:int=None, pretrained_embeddings=None, layers_no:int=1, batch_size:int=None, bidi:bool=True ): super(SiameseModel, self).__init__() self.word_embeds = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True) # i also tried with loading manually pretrained fastText embeddings #self.word_embeds , num_embeddings, embedding_dim = create_emb_layer(embedding_matrix, non_trainable=True) self.sentence_encoder = nn.LSTM( embedding_dim, hidden_size, num_layers=layers_no, batch_first=True, dropout=dropout_in, bidirectional=bidi ) self.directions_no = 2 if bidi else 1 self.batch_size = batch_size def forward(self, x1, x2): embed_sent1 = self.word_embeds(x1.long()) embed_sent2 = self.word_embeds(x2.long()) v1, (h1, c1) = self.sentence_encoder(embed_sent1) v2, (h2, c2) = self.sentence_encoder(embed_sent2) v1 = h1.permute((1, 0, 2)) v2 = h2.permute((1, 0, 2)) # i also tried using the sum of all the hidden states which gave better results cosine_sim = torch.nn.functional.cosine_similarity(v1, v2, -1, 1e-8) return cosine_sim def mean_squared_error(y_pred, y_true): return torch.mean(torch.pow(y_pred - y_true, 2), dim=-1, keepdim=True) vocab_size = vocab_size embedding_dim = 300 hidden_size = 100 output_size = 1 epochs_no = 100 layers_no = 1 lr = 1e-08 #0.01# 1e-3 # tried all of these learning rates batch_size = 64 log_interval = 1 clip_max_norm = 1.25 # None # tried with / out clipping model = SiameseModel( hidden_size=hidden_size, output_size=output_size, vocab_size=vocab_size, batch_size=batch_size, layers_no=layers_no, embedding_dim=embedding_dim, bidi=bidi, pretrained_embeddings=pretrained_embeddings ) # criterion = nn.MSELoss(reduction='sum') criterion = mean_squared_error optimizer = torch.optim.Adam(model.parameters(), lr=lr) def pre_process_batch(x): if isinstance(x, torch.Tensor) == False: return torch.Tensor(x), torch.Tensor(x) return x, x def post_process_batch(x, ground_truth=False): return x.float().squeeze() preds =  def initialize_weights(model): if type(model) in [nn.Linear]: nn.init.xavier_normal(model.weight.data) elif type(model) in [nn.LSTM, nn.RNN, nn.GRU]: nn.init.xavier_normal(model.weight_hh_l0) nn.init.xavier_normal(model.weight_ih_l0) model.apply(initialize_weights) curr_global_step = 0 for epoch in range(epochs_no): total_loss = 0. model.train() total_batch_files = 0 for batch_idx in range(0, len(X1_trn2), batch_size): batch_idx //= batch_size curr_global_step += 1 # batch_x1, batch_x2 sizes are: (64, 32) batch_x1 = X1_trn2[batch_idx * batch_size: (batch_idx + 1) * batch_size] batch_x2 = X2_trn2[batch_idx * batch_size: (batch_idx + 1) * batch_size] # batch_y shape is  batch_y = Y_trn2[batch_idx * batch_size: (batch_idx + 1) * batch_size] batch_y = batch_y if isinstance(batch_y, torch.Tensor) else torch.Tensor(batch_y) batch_x = [batch_x1, batch_x2] total_batch_files += len(batch_y) if pre_process_batch is not None: # y_hat shape is [64, 1] y_hat = model(*pre_process_batch(batch_x)) else: y_hat = model(torch.Tensor(batch_x)) if post_process_batch is not None: # y_hat shape is  y_hat = post_process_batch(y_hat, ground_truth=False) # batch_y shape is  batch_y = post_process_batch(batch_y, ground_truth=True) loss = criterion(y_hat, batch_y) print(curr_global_step, loss.item()) optimizer.zero_grad() # zero the gradient buffer loss.backward() if clip_max_norm is not None: torch.nn.utils.clip_grad_norm_(model.parameters(), clip_max_norm) optimizer.step() total_loss += loss.item() print('train total_loss', total_loss / len(batch_y)) print('train final epoch total_loss', total_loss / total_batch_files) model.eval() total_loss = 0. total_batch_files = 0 for batch_idx in range(0, len(X1_tst2), batch_size): batch_idx //= batch_size model.train() batch_x1 = X1_tst2[batch_idx * batch_size: (batch_idx + 1) * batch_size] batch_x2 = X2_tst2[batch_idx * batch_size: (batch_idx + 1) * batch_size] batch_y = Y_tst2[batch_idx * batch_size: (batch_idx + 1) * batch_size] batch_y = batch_y if isinstance(batch_y, torch.Tensor) else torch.Tensor(batch_y) batch_x = [batch_x1, batch_x2] total_batch_files += len(batch_y) if pre_process_batch is not None: y_hat = model(*pre_process_batch(batch_x)) else: y_hat = model(torch.Tensor(batch_x)) if post_process_batch is not None: # y_hat shape is  y_hat = post_process_batch(y_hat, ground_truth=False) batch_y = post_process_batch(batch_y, ground_truth=True) loss = criterion(y_hat, batch_y) print(curr_global_step, loss.item()) total_loss += loss.item() print('test total_loss', total_loss / len(batch_y)) print('test final epoch total_loss', total_loss / total_batch_files)
The Keras implementation is based on the great code of
With some modification which gave the best results on the dataset.
gradient_clipping_norm = 1.25 adam = Adam(clipnorm=gradient_clipping_norm) validation_portion=0.1 n_hidden=100 embedding_dim=300 batch_size=64 n_epoch=100 optimizer=adam # The visible layer left_input = Input(shape=(max_seq_length,), dtype='int32') right_input = Input(shape=(max_seq_length,), dtype='int32') embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False) # Embedded version of the inputs encoded_left = embedding_layer(left_input) encoded_right = embedding_layer(right_input) # Since this is a siamese network, both sides share the same LSTM shared_lstm = LSTM(n_hidden, name="lstm") left_output = shared_lstm(encoded_left) right_output = shared_lstm(encoded_right) # Calculates the distance as defined by the MaLSTM model # original distance - gave results in the 0.40 range # malstm_distance = Lambda(function=lambda x: exponent_neg_euclidean_distance(x, x), # output_shape=lambda x: (x, 1))([left_output, right_output]) # Gave the best results, in 0.61 range. malstm_distance = Dot(1, normalize=True)([left_output, right_output]) # Pack it all up into a model malstm = Model([left_input, right_input], [malstm_distance]) optimizer = optimizer if load_weights is not None: malstm.load_weights(load_weights, by_name=True) malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy']) malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=n_epoch,verbose=1, validation_data=([X_validation['left'], X_validation['right']], Y_validation))