Hi all, I am working with the Quora Question Pairs dataset and have built a Siamese LSTM model for the task, with a GloVe embedding layer. I am training with the Adam optimizer (learning rate 0.002) and nn.CrossEntropyLoss(). The training loss never decreases, no matter how many epochs I train for, and the training accuracy hovers around 63%, which on this dataset is roughly what you get by always predicting the majority class. I have already experimented with different learning rates, and adding dropout is my next step. My model code is below, along with a simplified sketch of how the embedding layer and training loop are set up; I would appreciate any pointers as to what I may be doing wrong. Thanks!
import torch
import torch.nn as nn


class SiameseLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, batch_size, num_lstm_layers):
        super(SiameseLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.num_lstm_layers = num_lstm_layers
        # weights_matrix (the pretrained GloVe matrix) is built elsewhere in the script
        self.word_embeddings, _, _ = create_embedding_layer(weights_matrix, True)
        # The LSTM takes word embeddings as input and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=self.num_lstm_layers,
            batch_first=True
        )
        # Hidden/cell states for each branch of the Siamese network
        self.hidden1 = self.init_hidden()
        self.hidden2 = self.init_hidden()
        # Classifier head: input is the two flattened LSTM output sequences
        # (MAX_SENTENCE_LENGTH * hidden_dim each) plus the final hidden and
        # cell states of both branches (4 * hidden_dim in total)
        self.fc1 = nn.Sequential(
            nn.Linear(4 * self.hidden_dim + 2 * (MAX_SENTENCE_LENGTH * self.hidden_dim), 3 * self.hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(3 * self.hidden_dim, self.hidden_dim),
            nn.Linear(self.hidden_dim, 2),
            nn.Softmax(dim=1),
        )
    def init_hidden(self):
        # Fresh hidden and cell states; the axes semantics are
        # (num_layers, minibatch_size, hidden_dim).
        # Currently initialised with torch.randn rather than torch.zeros.
        return (torch.randn(self.num_lstm_layers, self.batch_size, self.hidden_dim),
                torch.randn(self.num_lstm_layers, self.batch_size, self.hidden_dim))
    def forward(self, sentence1, sentence2):
        # Look up GloVe embeddings for each question
        embeds1 = self.word_embeddings(sentence1.long())
        embeds2 = self.word_embeddings(sentence2.long())
        # Run both questions through the same (shared) LSTM
        lstm1_out, self.hidden1 = self.lstm(embeds1, self.hidden1)
        lstm2_out, self.hidden2 = self.lstm(embeds2, self.hidden2)
        # Concatenate the flattened output sequences and the final hidden/cell
        # states of both branches, then classify with the fully connected head
        concat_all = torch.cat((
            lstm1_out.contiguous().view(self.batch_size, MAX_SENTENCE_LENGTH * self.hidden_dim),
            self.hidden1[0].view(self.batch_size, self.hidden_dim),
            self.hidden1[1].view(self.batch_size, self.hidden_dim),
            lstm2_out.contiguous().view(self.batch_size, MAX_SENTENCE_LENGTH * self.hidden_dim),
            self.hidden2[0].view(self.batch_size, self.hidden_dim),
            self.hidden2[1].view(self.batch_size, self.hidden_dim),
        ), dim=1)
        output = self.fc1(concat_all)
        return output
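
In case it is relevant, create_embedding_layer is essentially the standard helper for loading pretrained GloVe vectors into an nn.Embedding (reproduced roughly from memory here, so treat the exact signature as approximate):

def create_embedding_layer(weights_matrix, non_trainable=False):
    # weights_matrix: (vocab_size, embedding_dim) array of pretrained GloVe vectors
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.weight = nn.Parameter(torch.tensor(weights_matrix, dtype=torch.float32))
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

And the training loop is set up roughly like this (a simplified sketch: the data loading and padding code is omitted, and the hyperparameter values below are placeholders rather than my exact settings):

model = SiameseLSTM(embedding_dim=300, hidden_dim=64, batch_size=64, num_lstm_layers=1)
criterion = nn.CrossEntropyLoss()
# Skip the frozen embedding weights when building the optimizer
optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=0.002
)

NUM_EPOCHS = 10  # placeholder
for epoch in range(NUM_EPOCHS):
    # train_loader (built elsewhere) yields padded question tensors and labels
    for q1_batch, q2_batch, labels in train_loader:
        optimizer.zero_grad()
        output = model(q1_batch, q2_batch)      # (batch_size, 2)
        loss = criterion(output, labels.long())
        loss.backward()
        optimizer.step()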