Wrong Loss Calculation?

Hi,

I’m trying to implement a language model using an LSTM that works on individual sentences (unlike the example in https://github.com/pytorch/examples/tree/master/word_language_model). I’m pretty sure I’m calculating my loss incorrectly, because my validation loss is higher than my training loss.

My model:

class BiLSTM(nn.Module):
    """Embedding -> LSTM -> linear projection onto the vocabulary.

    NOTE: with ``bidirectional=True`` (the default, kept for backward
    compatibility) the output at time step ``t`` contains the backward
    pass, which has already read tokens ``t+1 .. T``. When the target at
    step ``t`` is token ``t+1`` the network can simply copy it, so the
    reported loss is not a genuine language-model loss. Pass
    ``bidirectional=False`` for left-to-right next-token prediction.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits per time step.
        embedding_dim: size of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability applied to the embeddings, to
            the LSTM outputs and (when ``num_layers > 1``) between LSTM
            layers.
        bidirectional: whether the LSTM also runs right-to-left.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers,
                 dropout_prob=0.5, bidirectional=True):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1

        self.dropout = nn.Dropout(p=dropout_prob)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout is a no-op for a single layer and makes
            # nn.LSTM emit a UserWarning, so disable it in that case.
            dropout=dropout_prob if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(self.num_directions * hidden_size, vocab_size)

    def forward(self, input, hidden):
        """Return per-step vocabulary logits and the final LSTM state.

        Args:
            input: token ids laid out as (batch, seq_len).
            hidden: ``(h_0, c_0)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(logits, hidden)`` where ``logits`` is laid out as
            (seq_len, batch, vocab_size) -- sequence first, because the
            LSTM is not constructed with ``batch_first=True``.
        """
        out = self.dropout(self.embedding(input))
        # (batch, seq_len, emb) -> (seq_len, batch, emb) for the LSTM.
        out = out.permute(1, 0, 2)
        out, hidden = self.lstm(out, hidden)
        out = self.dropout(out)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h_0, c_0)`` pair for a batch of ``batch_size``.

        The tensors are created directly on the same device and with the
        same dtype as the model's weights, so the state always matches the
        model regardless of any module-level ``device`` variable.
        """
        weight = self.fc.weight
        shape = (self.num_directions * self.num_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

Batches are created using this:

def get_batches(data):
    """Yield shuffled, padded mini-batches of ``BATCH_SIZE`` sentences.

    Each sentence is a sequence of words present in ``w2i``. The target
    for a sentence is the same sentence shifted left by one position with
    ``'<eos>'`` appended, so input and target always have equal length.

    ``data`` itself is left untouched: a shallow copy is shuffled, so the
    caller's ordering is preserved. Only full batches are produced; up to
    ``BATCH_SIZE - 1`` trailing sentences are dropped.

    Yields:
        ``(inputs, targets, lengths)`` where ``inputs`` and ``targets``
        are (batch, max_len) integer tensors padded with ``PAD`` and
        ``lengths`` lists the unpadded length of each sentence.
    """
    # Shuffle a copy so the caller's list is not reordered as a side effect.
    shuffled = list(data)
    random.shuffle(shuffled)

    eos_id = w2i['<eos>']
    nbatches = len(shuffled) // BATCH_SIZE
    for i in range(nbatches):
        batch = shuffled[BATCH_SIZE * i : BATCH_SIZE * (i + 1)]
        sent_tensors = []
        target_tensors = []
        for sent in batch:
            # Index the sentence once; the target reuses the same ids.
            ids = [w2i[word] for word in sent]
            sent_tensors.append(torch.tensor(ids, dtype=torch.long, device=device))
            target_tensors.append(
                torch.tensor(ids[1:] + [eos_id], dtype=torch.long, device=device)
            )
        lengths = [len(sent) for sent in batch]
        inputs = nn.utils.rnn.pad_sequence(
            sent_tensors, batch_first=True, padding_value=PAD
        )
        targets = nn.utils.rnn.pad_sequence(
            target_tensors, batch_first=True, padding_value=PAD
        )
        yield inputs, targets, lengths

and this is the training loop:

# Train for a fixed number of epochs, recording the mean per-batch loss on
# the training and validation sets after every epoch, then plot both curves.
loss_function = nn.CrossEntropyLoss(ignore_index=PAD)  # padded targets are ignored
optimizer = optim.Adam(model.parameters())
train_losses = []
val_losses = []
epochs = 4
train_batches = list(get_batches(train))
valid_batches = list(get_batches(valid))
for epoch in range(epochs):
    total_train_loss = 0
    total_val_loss = 0

    # Training mode: dropout active.
    model.train()
    for batch in tqdm(train_batches):
        X, y, lengths = batch
        batch_size, seq_len = X.size()
        optimizer.zero_grad()
        # Sentences are independent, so every batch starts from a zero state.
        hidden = model.init_hidden(batch_size)
        yhat, hidden = model(X, hidden)
        # CrossEntropyLoss wants the class dimension second:
        # (seq, batch, vocab) -> (seq, vocab, batch), matched by y.t() (seq, batch).
        yhat = yhat.permute(0, 2, 1)
        loss = loss_function(yhat, y.t())
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    train_losses.append(total_train_loss / len(train_batches))

    # Evaluation mode: dropout disabled. Note that the training loss above is
    # still measured with dropout on, so the two curves are not directly
    # comparable and a validation loss above the training loss is expected.
    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_batches):
            X, y, lengths = batch
            batch_size, seq_len = X.size()
            hidden = model.init_hidden(batch_size)
            yhat, hidden = model(X, hidden)
            yhat = yhat.permute(0, 2, 1)
            loss = loss_function(yhat, y.t())
            total_val_loss += loss.item()
        val_losses.append(total_val_loss / len(valid_batches))
plt.plot(train_losses, label='train')
plt.plot(val_losses, label='validation')
plt.legend();

Also, as you can see, I got a bit lost with pack_padded_sequence and pad_packed_sequence, so in the end I just padded the sentences without doing much else.

Thanks for the help!