Training LSTM, loss not decreasing

Hi all,

I’m training an LSTM as a sentence encoder. My loss function is torch.nn.MultiMarginLoss with the default parameters. For more context, here’s a link to the paper:
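
To make sure I’m using the loss correctly: my understanding is that MultiMarginLoss expects a (batch, classes) score matrix plus a class-index target per row. A minimal sketch with made-up numbers, where column 0 holds the positive example (matching what I do below):

import torch
from torch.autograd import Variable

scores = Variable(torch.Tensor([[0.9, 0.2, 0.1],    # row = [positive sim, negative sims...]
                                [0.7, 0.6, 0.3]]))
targets = Variable(torch.LongTensor([0, 0]))        # the positive is always column 0
loss = torch.nn.MultiMarginLoss()(scores, targets)  # multi-class hinge loss per row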

Here’s my LSTM implementation. (As a note, I’m feeding in batches of sentence word-embedding vectors; each sentence has a variable length, so the batches are padded and then packed with pack_padded_sequence.)

import torch
import torch.nn as nn
from torch.autograd import Variable

torch.manual_seed(1)

class LSTM(nn.Module):
    def __init__(self, config):
        super(LSTM, self).__init__()
        self.config = config
        self.lstm = nn.LSTM(input_size=config["input_size"],
                            hidden_size=config["hidden_size"],
                            num_layers=config["num_layers"],
                            dropout=config["dropout"],
                            bidirectional=config["bidirectional"],
                            batch_first=True)  # inputs arrive as (batch, seq_len, features)


    def forward(self, inputs, lengths=None):
        # inputs: (batch, max_len, input_size) batch-first padded embeddings
        batch_size = inputs.size(0)

        # h0/c0 must have shape (num_layers * num_directions, batch, hidden_size)
        num_directions = 2 if self.config["bidirectional"] else 1
        state_shape = (self.config["num_layers"] * num_directions,
                       batch_size, self.config["hidden_size"])
        h0 = c0 = Variable(inputs.data.new(*state_shape).zero_())

        if lengths is not None:
            inputs = torch.nn.utils.rnn.pack_padded_sequence(inputs, lengths, batch_first=True)

        outputs, (ht, ct) = self.lstm(inputs, (h0, c0))
        if self.config["bidirectional"]:
            # concatenate the last layer’s forward and backward final hidden states
            return ht[-2:].transpose(0, 1).contiguous().view(batch_size, -1)
        return ht[-1]
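
For reference, here’s roughly how I call the encoder (a sketch reusing the class above, with hypothetical config values; note the lengths are sorted in descending order, since pack_padded_sequence requires that):

config = {"input_size": 200, "hidden_size": 120, "num_layers": 1,
          "dropout": 0.0, "bidirectional": True}
model = LSTM(config)

# Batch of 3 padded sentences: (batch, max_len, input_size), lengths descending.
batch = Variable(torch.randn(3, 10, 200))
lengths = [10, 7, 4]
states = model(batch, lengths)  # (3, 2 * hidden_size) in the bidirectional case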

The following code is where I call my loss function:

import torch.optim as optim
import torch.nn.functional as f

criterion = torch.nn.MultiMarginLoss()
optimizer = optim.Adam(self.lstm.parameters(), lr=LEARNING_RATE)
...
for batch_num, (title_batch, body_batch, question_info_batch) in \
        enumerate(self.builder.getFeatures(self.train_path)):

    # Pad each batch to its max sentence length and keep the true lengths
    # so the encoder can pack the padded sequences.
    title_batch, title_lengths = self.builder.computePaddedBatch(title_batch)
    body_batch, body_lengths = self.builder.computePaddedBatch(body_batch)

    title_feature_matrix = self.getFeatureVariable(title_batch)
    body_feature_matrix = self.getFeatureVariable(body_batch)

    # Encode candidate titles and bodies with the shared LSTM encoder.
    title_final_states = self.lstm(title_feature_matrix, title_lengths)
    body_final_states = self.lstm(body_feature_matrix, body_lengths)

    relevant_states = []
    retrieved_states = []
    q_final_state = None
    for j in range(len(title_final_states)):
        title_final_state = title_final_states[j:j+1]
        body_final_state = body_final_states[j:j+1]
        q_id, candidate_id, actual = question_info_batch[j]

        # Candidate representation: average of its title and body encodings.
        final_state = (title_final_state + body_final_state) / 2.0

        question_info = self.builder.getQuestionInfo(q_id)
        title = question_info["title"]
        body = question_info["body"]

        title_vector = self.builder.getSentenceVector(title)
        body_vector = self.builder.getSentenceVector(body)

        title_feature_vector = self.getFeatureVariable([title_vector])
        body_feature_vector = self.getFeatureVariable([body_vector])

        q_title_final_state = self.lstm(title_feature_vector, [len(title_vector)])
        q_body_final_state = self.lstm(body_feature_vector, [len(body_vector)])

        # Query representation: average of its title and body encodings.
        q_final_state = (q_title_final_state + q_body_final_state) / 2.0

        if actual:  # ground-truth relevant candidate (positive example)
            relevant_states.append((candidate_id, final_state))
        else:       # retrieved but not relevant (negative example)
            retrieved_states.append((candidate_id, final_state))

    # Build the MultiMarginLoss input: one row per relevant candidate, with
    # the relevant (positive) similarity in column 0 followed by all the
    # negative similarities, so the target class is always 0.
    x = []
    y = []
    neg_sims = []

    assert len(relevant_states) > 0
    assert len(retrieved_states) > 0

    for retrieved_id, retrieved_state in retrieved_states:
        sim = f.cosine_similarity(q_final_state, retrieved_state)
        neg_sims.append(sim)

    for relevant_id, relevant_state in relevant_states:
        sim = f.cosine_similarity(q_final_state, relevant_state)
        x.append(sim)
        x = x + neg_sims
        y.append(0)

    x = torch.cat(x)
    x = x.view(len(relevant_states), len(neg_sims) + 1)
    y = Variable(torch.LongTensor(y))

    optimizer.zero_grad()
    loss = criterion(x, y)
    loss.backward()
    optimizer.step()
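
After loss.backward() I could also check that gradients actually reach the encoder. This is a debugging sketch I’d drop into the loop above, not part of my real code:

    # Sketch: confirm gradients flow into the LSTM and watch the loss per batch.
    print("loss:", loss.data[0])
    for name, param in self.lstm.named_parameters():
        if param.grad is not None:
            print(name, param.grad.data.abs().sum())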

If there’s any additional information I can provide that would help with answering the question, please let me know!
