I’m trying to implement my own language model. I also had a look at PyTorch’s official language model example. My code seems very similar, but it isn’t learning.
Here’s my model:
class LM(nn.Module):
    def __init__(self, nlayers, dropout, edim, vsz, hdim, go_idx, pad_idx, tie_weights, device):
        super().__init__()
        self.nlayers = nlayers
        self.dropout = dropout
        self.edim = edim
        self.vsz = vsz
        self.hdim = hdim
        self.go_idx = go_idx
        self.pad_idx = pad_idx
        self.embed = nn.Embedding(num_embeddings=vsz, embedding_dim=edim, padding_idx=pad_idx, _weight=None)
        self.rnn = nn.LSTM(input_size=edim, hidden_size=hdim, num_layers=nlayers,
                           bias=True, batch_first=True, dropout=dropout, bidirectional=False)
        self.outlayer = nn.Linear(in_features=hdim, out_features=vsz)
        if tie_weights and edim == self.hdim:
            self.outlayer.weight = self.embed.weight
            self.tie_weights = True
        else:
            self.tie_weights = False
        self.init_weights()
        self.device = device
        self.to(device)

    def init_weights(self, initrange=0.1):
        self.embed.weight.data.uniform_(-initrange, initrange)
        self.outlayer.bias.data.zero_()
        if not self.tie_weights:
            self.outlayer.weight.data.uniform_(-initrange, initrange)

    def init_hiddens_zeros(self, bsz):
        return (torch.zeros(self.nlayers, bsz, self.hdim, device=self.device),
                torch.zeros(self.nlayers, bsz, self.hdim, device=self.device))

    def forward(self, inputs):
        bsz, seqlen = inputs.size()
        embedded = F.dropout(self.embed(inputs), p=self.dropout)
        hiddens = self.init_hiddens_zeros(bsz)
        outputs, hiddens = self.rnn(embedded, hiddens)
        outputs = F.dropout(outputs, p=self.dropout)
        outputs = outputs.contiguous().view(bsz * seqlen, -1)
        outputs = self.outlayer(outputs)
        outputs = outputs.contiguous().view(bsz, seqlen, -1)
        return outputs
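For reference, the forward pass is supposed to take a (batch, seq_len) tensor of token indices and return (batch, seq_len, vocab) logits. Here is the kind of shape check I run on it (just a sketch with a made-up toy configuration, not my real hyperparameters):

# Sketch: sanity-check input/output shapes with a tiny, made-up config.
toy = LM(nlayers=2, dropout=0.5, edim=8, vsz=10, hdim=8,
         go_idx=2, pad_idx=1, tie_weights=True, device=torch.device('cpu'))
dummy = torch.randint(0, 10, (4, 7))  # (batch=4, seq_len=7) of token ids
out = toy(dummy)
print(out.shape)                      # expect torch.Size([4, 7, 10])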
I’m using torchtext for data reading and mini-batching, as follows:
go_token = '<go>'
eos_token = '<eos>'
pad_token = '<pad>'
unk_token = '<unk>'
tokenize = str.split

src = trg = torchtext.data.Field(sequential=True, use_vocab=True, init_token=go_token,
                                 eos_token=eos_token, fix_length=None, dtype=torch.long,
                                 preprocessing=None, postprocessing=None, lower=True,
                                 tokenize=tokenize, include_lengths=True, batch_first=True,
                                 pad_token=pad_token, unk_token=unk_token, pad_first=False,
                                 truncate_first=False)

train_ds = torchtext.datasets.TranslationDataset(
    path='data/ptb.train', exts=('.txt', '.txt'),
    fields=(src, trg))

src.build_vocab(train_ds, max_size=None)
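Before training I also sanity-check the vocab and one mini-batch, roughly like this (a sketch using the fields defined above; the batch size here is arbitrary):

# Sketch: inspect the vocab and one mini-batch before training.
print(len(src.vocab), src.vocab.itos[:10])    # vocab size; the special tokens come first in itos
check_iter = torchtext.data.BucketIterator(dataset=train_ds, batch_size=2,
                                            sort_key=lambda x: len(x.src),
                                            device=torch.device('cpu'))
tokens, lengths = next(iter(check_iter)).src  # include_lengths=True -> (data, lengths)
print(tokens.shape, lengths)                  # (batch, seq_len) because batch_first=True
print(' '.join(src.vocab.itos[i] for i in tokens[0].tolist()))  # should start with <go> and end with <eos>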
Then I define my model, optimizer and learning rate scheduler as follows:
vsz = len(src.vocab)
eos_idx = src.vocab.stoi[eos_token]
go_idx = src.vocab.stoi[go_token]
unk_idx = src.vocab.stoi[unk_token]
pad_idx = src.vocab.stoi[pad_token]
nlayers = 2
dropout = .5
hdim = 200
edim = 200
tie_weights = True
device = torch.device("cuda:0")
lm = LM(nlayers, dropout, edim, vsz, hdim, go_idx, pad_idx, tie_weights, device)
lr=20
optimizer = torch.optim.SGD(lm.parameters(), lr=lr)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                          factor=0.8, patience=1,
                                                          verbose=False, threshold=0.0001,
                                                          threshold_mode='rel', cooldown=0,
                                                          min_lr=0, eps=1e-08)
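Since weight tying in the constructor silently falls back to untied weights when edim != hdim, I double-check that the tying actually happened (a quick sketch):

# Sketch: confirm the output projection really shares the embedding weights.
print(lm.tie_weights)                         # should be True, since edim == hdim == 200
print(lm.outlayer.weight is lm.embed.weight)  # True when the two layers share one Parameter
print(sum(p.numel() for p in lm.parameters() if p.requires_grad), "trainable parameters")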
Finally here’s my training loop:
bsz = 32
train_iter = torchtext.data.BucketIterator(dataset=train_ds, batch_size=bsz,
                                           sort=True, sort_within_batch=True,
                                           sort_key=lambda x: len(x.src), device=device)
clip = .25
nepochs = 100

for epoch in range(nepochs):
    ep_st = time.time()
    epoch_loss = 0
    for batch in train_iter:
        lm.zero_grad()
        src_b, lengths = batch.src
        bsz, seqlen = src_b.size()
        seqlen -= 1
        logits = lm(src_b[:, :-1])
        loss = F.cross_entropy(logits.view(bsz * seqlen, -1),
                               src_b[:, 1:].contiguous().view(-1),
                               ignore_index=pad_idx)
        loss.backward()
        clip_grad_norm_(lm.parameters(), clip)
        optimizer.step()
        # track loss
        epoch_loss += float(loss.item())
        # end of epoch
        if train_iter.epoch > epoch:
            lr_scheduler.step(epoch_loss)
            ep_et = time.time()
            # log
            print("End of epoch {} | time elapsed {:5.3f} | LM loss {:5.3f} | LR {}".
                  format(epoch, (ep_et - ep_st) / 60, epoch_loss, optimizer.param_groups[0]['lr']))
            print("Predictions..")
            outwords = F.softmax(logits, dim=-1)
            values, indices = torch.topk(outwords, 1)
            sentences = [' '.join(src.vocab.itos[idx] for idx in sent[:lengths[i]])
                         for i, sent in enumerate(indices.squeeze().cpu().numpy().astype(int))]
            print(sentences[::5])
            print("Targets..")
            sentences = [' '.join(src.vocab.itos[idx] for idx in sent[:lengths[i]])
                         for i, sent in enumerate(src_b.squeeze().cpu().numpy().astype(int))]
            print(sentences[::5])
            print('-' * 50)
            break
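To make the logged loss easier to interpret, I also convert it to a rough per-token perplexity, similar to what the official example reports (a sketch; epoch_loss sums the per-batch mean losses, so this is only an approximation when batches contain different numbers of non-pad tokens):

import math

# Sketch: rough perplexity from the average per-batch cross-entropy.
n_batches = len(train_iter)                # mini-batches per epoch
avg_loss = epoch_loss / max(n_batches, 1)  # approximate average per-token cross-entropy
print("approx. perplexity: {:.2f}".format(math.exp(avg_loss)))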
Here are my imports:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import torchtext
import numpy as np
import time
Any ideas why it isn’t learning? I appreciate your help.