Hi,
I’m trying to implement a language model using an LSTM that works on individual sentences (unlike the example in https://github.com/pytorch/examples/tree/master/word_language_model). I’m pretty sure I’m calculating my loss incorrectly, because my validation loss is higher than my training loss.
My model:
class BiLSTM(nn.Module):
    """Bidirectional LSTM mapping token ids to per-position vocabulary logits.

    Pipeline: embedding -> dropout -> bidirectional LSTM -> dropout -> linear
    projection onto the vocabulary.

    NOTE(review): because the LSTM is bidirectional, the features at position
    ``t`` also depend on the tokens that come after ``t``. If the logits are
    scored against next-token targets, the backward direction has already read
    those targets -- confirm that this is intended.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits per position.
        embedding_dim: Size of each token embedding.
        hidden_size: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        dropout_prob: Dropout probability applied to the embeddings, to the
            LSTM output, and between stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout_prob=0.5):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p=dropout_prob)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only *between* stacked layers; with a single
        # layer a non-zero value does nothing except emit a warning.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            dropout=dropout_prob if num_layers > 1 else 0.0,
        )
        # Forward and backward features are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, vocab_size)

    def forward(self, input, hidden):
        """Compute vocabulary logits for a batch of token-id sequences.

        Args:
            input: Integer tensor of token ids, shape ``(batch, seq_len)``.
            hidden: ``(h_0, c_0)`` tuple as returned by :meth:`init_hidden`.

        Returns:
            A pair ``(logits, hidden)``. ``logits`` is sequence-first with
            shape ``(seq_len, batch, vocab_size)``; ``hidden`` is the final
            ``(h_n, c_n)`` state returned by the LSTM.
        """
        out = self.dropout(self.embedding(input))
        # The LSTM was built with the default batch_first=False, so move the
        # time axis to the front: (batch, seq, emb) -> (seq, batch, emb).
        out = out.permute(1, 0, 2)
        out, hidden = self.lstm(out, hidden)
        out = self.dropout(out)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h_0, c_0)`` state for ``batch_size`` sequences.

        The tensors are created on the same device and with the same dtype as
        the model's parameters, so the state always matches the weights no
        matter where the model has been moved.

        Returns:
            Tuple of two distinct zero tensors, each of shape
            ``(2 * num_layers, batch_size, hidden_size)``.
        """
        reference = next(self.parameters())
        # One slice per (layer, direction) pair.
        shape = (2 * self.num_layers, batch_size, self.hidden_size)
        return reference.new_zeros(shape), reference.new_zeros(shape)
Batches are created using this:
def get_batches(data):
    """Yield padded ``(inputs, targets, lengths)`` mini-batches in random order.

    The sentences are shuffled on a copy, so the caller's ``data`` keeps its
    original order. Only full batches of ``BATCH_SIZE`` sentences are produced;
    a trailing partial batch is dropped.

    Args:
        data: Sequence of sentences, each a sequence of words that are all
            keys of ``w2i``.

    Yields:
        Tuples ``(inputs, targets, lengths)`` where

        * ``inputs`` holds the word ids of each sentence, padded with ``PAD``
          to shape ``(BATCH_SIZE, max_len)``;
        * ``targets`` has the same shape and holds each sentence shifted left
          by one word with the id of ``'<eos>'`` appended;
        * ``lengths`` is the list of unpadded sentence lengths.
    """
    # Shuffle a shallow copy: random.shuffle works in place and would
    # otherwise silently reorder the caller's dataset.
    sentences = list(data)
    random.shuffle(sentences)

    nbatches = len(sentences) // BATCH_SIZE
    for i in range(nbatches):
        batch = sentences[BATCH_SIZE * i : BATCH_SIZE * (i + 1)]
        sent_tensors = []
        target_tensors = []
        for sent in batch:
            # Index the sentence once and derive the shifted target from it.
            ids = [w2i[word] for word in sent]
            sent_tensors.append(torch.tensor(ids).to(device))
            target_tensors.append(torch.tensor(ids[1:] + [w2i['<eos>']]).to(device))
        lengths = [len(sent) for sent in batch]
        # batch_first=True yields (batch, max_len) directly.
        inputs = nn.utils.rnn.pad_sequence(sent_tensors, batch_first=True, padding_value=PAD)
        targets = nn.utils.rnn.pad_sequence(target_tensors, batch_first=True, padding_value=PAD)
        yield inputs, targets, lengths
and this is the training loop:
# Padding positions in the targets are excluded from the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=PAD)
optimizer = optim.Adam(model.parameters())
train_losses = []
val_losses = []
epochs = 4
# The batches are materialised once, so every epoch iterates over the same
# batches in the same order.
train_batches = list(get_batches(train))
valid_batches = list(get_batches(valid))

for epoch in range(epochs):
    total_train_loss = 0
    total_val_loss = 0

    # Training pass: dropout must be active.
    model.train()
    for batch in tqdm(train_batches):
        X, y, lengths = batch
        batch_size = X.size(0)
        model.zero_grad()
        # Each batch holds independent sentences, so start from a fresh zero
        # state; nothing is carried over between batches.
        hidden = model.init_hidden(batch_size)
        yhat, hidden = model(X, hidden)
        # The model returns (seq, batch, vocab). CrossEntropyLoss wants the
        # class axis second, so use (seq, vocab, batch) against (seq, batch).
        yhat = yhat.permute(0, 2, 1)
        loss = loss_function(yhat, y.t())
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    # Mean of the per-batch mean losses.
    train_losses.append(total_train_loss / len(train_batches))

    # Validation pass: switch to eval mode so dropout is disabled. Without
    # this the validation loss is measured on a randomly perturbed network.
    model.eval()
    with torch.no_grad():
        for batch in tqdm(valid_batches):
            X, y, lengths = batch
            batch_size = X.size(0)
            hidden = model.init_hidden(batch_size)
            yhat, hidden = model(X, hidden)
            yhat = yhat.permute(0, 2, 1)
            loss = loss_function(yhat, y.t())
            total_val_loss += loss.item()
    val_losses.append(total_val_loss / len(valid_batches))

plt.plot(train_losses)
# The trailing semicolon suppresses the echoed return value in a notebook.
plt.plot(val_losses);
Also, as you can see, I kinda got lost with `pad_packed_sequence` and `pack_padded_sequence`, so in the end I just padded the sentences without doing much else.
Thanks for the help!