Why isn't my RNN language model learning?

I’m trying to implement my own language model. I also had a look at PyTorch’s official language model example. My code seems very similar, but it’s not working.
Here’s my model:

class LM(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> projection onto the vocabulary.

    Args:
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings, between LSTM
            layers and to the LSTM outputs (only while the module is in
            training mode).
        edim: embedding dimension.
        vsz: vocabulary size.
        hdim: LSTM hidden size.
        go_idx: index of the start-of-sequence token (stored, not used here).
        pad_idx: index of the padding token; passed to ``nn.Embedding`` as
            ``padding_idx``.
        tie_weights: share the embedding matrix with the output projection.
            Only honoured when ``edim == hdim``; ``self.tie_weights`` records
            whether tying actually happened.
        device: device on which the initial hidden states are created. The
            module itself is not moved; callers are expected to call
            ``.to(device)``.
    """

    def __init__(self, nlayers, dropout, edim, vsz, hdim, go_idx, pad_idx, tie_weights, device):
        # Must run before any submodule is assigned, otherwise nn.Module
        # refuses the assignment.
        super().__init__()

        self.nlayers = nlayers
        self.dropout = dropout
        self.edim = edim
        self.vsz = vsz
        self.hdim = hdim
        self.go_idx = go_idx
        self.pad_idx = pad_idx
        self.device = device

        self.embed = nn.Embedding(num_embeddings=vsz, embedding_dim=edim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(input_size=edim, hidden_size=hdim, num_layers=nlayers,
                           bias=True, batch_first=True, dropout=dropout, bidirectional=False)
        self.outlayer = nn.Linear(in_features=hdim, out_features=vsz)

        # Tying needs the two weight matrices to have the same shape
        # (vsz x edim vs. vsz x hdim).
        self.tie_weights = bool(tie_weights) and edim == hdim
        if self.tie_weights:
            self.outlayer.weight = self.embed.weight

        self.init_weights()

    def init_weights(self, initrange=0.1):
        """Re-initialise embedding (and untied output) weights uniformly in [-initrange, initrange]."""
        nn.init.uniform_(self.embed.weight, -initrange, initrange)
        if self.pad_idx is not None:
            # The uniform init above overwrites the zero row nn.Embedding
            # keeps for the padding token; restore it.
            with torch.no_grad():
                self.embed.weight[self.pad_idx].zero_()
        if not self.tie_weights:
            nn.init.uniform_(self.outlayer.weight, -initrange, initrange)

    def init_hiddens_zeros(self, bsz):
        """Return zero-filled ``(h_0, c_0)``, each of shape (nlayers, bsz, hdim), on ``self.device``."""
        return (torch.zeros(self.nlayers, bsz, self.hdim, device=self.device),
                torch.zeros(self.nlayers, bsz, self.hdim, device=self.device))

    def forward(self, inputs):
        """Compute next-token logits.

        Args:
            inputs: 2-D tensor of token indices, (batch, seqlen).

        Returns:
            Tensor of shape (batch, seqlen, vsz) with unnormalised scores.
        """
        bsz = inputs.size(0)
        # F.dropout defaults to training=True; tie it to the module's mode so
        # that evaluation is deterministic.
        embedded = F.dropout(self.embed(inputs), p=self.dropout, training=self.training)
        hiddens = self.init_hiddens_zeros(bsz)
        outputs, hiddens = self.rnn(embedded, hiddens)
        outputs = F.dropout(outputs, p=self.dropout, training=self.training)
        # nn.Linear acts on the last dimension, so no flatten/unflatten is needed.
        return self.outlayer(outputs)

I’m using torchtext for data reading and mini batching; as follows:

# A single Field object serves as both the source and the target column, so the
# two columns share processing options and one vocabulary.
src = torchtext.data.Field(
    # what the field holds
    sequential=True, use_vocab=True, dtype=torch.long,
    # text processing
    tokenize=tokenize, lower=True,
    preprocessing=None, postprocessing=None,
    # special tokens
    init_token=go_token, eos_token=eos_token,
    pad_token=pad_token, unk_token=unk_token,
    # batching / padding layout
    batch_first=True, include_lengths=True,
    fix_length=None, pad_first=False, truncate_first=False,
)
trg = src

# Both "languages" are read from the same file: data/ptb.train.txt.
train_ds = torchtext.datasets.TranslationDataset(
    path='data/ptb.train',
    exts=('.txt', '.txt'),
    fields=(src, trg),
)

src.build_vocab(train_ds, max_size=None)

Then I define my model, optimizer and learning rate scheduler as follows:

# Vocabulary size and the indices of the special tokens.
vsz = len(src.vocab)
eos_idx = src.vocab.stoi[eos_token]
go_idx = src.vocab.stoi[go_token]
unk_idx = src.vocab.stoi[unk_token]
pad_idx = src.vocab.stoi[pad_token]

# Model hyper-parameters.
nlayers = 2
dropout = .5
hdim = 200
edim = 200
tie_weights = True
device = torch.device("cuda:0")

# Constructing the module does not place its parameters on `device`; move it
# explicitly so the weights live on the same device as the batches and the
# initial hidden states.
lm = LM(nlayers, dropout, edim, vsz, hdim, go_idx, pad_idx, tie_weights, device).to(device)

# NOTE(review): `lr` is not defined anywhere in this snippet; it must be set
# before this line.
optimizer = torch.optim.SGD(lm.parameters(), lr=lr)
# Multiplies the learning rate by 0.8 when the monitored value stops decreasing.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                           factor=0.8, patience=1,
                                                           verbose=False, threshold=0.0001,
                                                           threshold_mode='rel', cooldown=0,
                                                           min_lr=0, eps=1e-08)

Finally here’s my training loop:

bsz = 32
# NOTE(review): sort=True asks the iterator to order examples by sort_key;
# confirm that a fixed, length-sorted batch order is really wanted for SGD.
train_iter = torchtext.data.BucketIterator(dataset=train_ds, batch_size=bsz,
                                            sort=True, sort_within_batch=True,
                                            sort_key=lambda x: len(x.src), device=device)
clip = .25
nepochs = 100

for epoch in range(nepochs):
    ep_st = time.time()
    epoch_loss = 0.0
    lm.train()
    for batch in train_iter:
        src_b, lengths = batch.src
        # Predict token t+1 from tokens <= t: inputs drop the last position,
        # targets drop the first.
        inputs = src_b[:, :-1]
        targets = src_b[:, 1:]

        optimizer.zero_grad()
        logits = lm(inputs)
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)),
                               targets.reshape(-1),
                               ignore_index=pad_idx)
        loss.backward()
        # Clipping only makes sense once gradients exist, i.e. after backward()
        # and before the optimizer consumes them.
        clip_grad_norm_(lm.parameters(), clip)
        optimizer.step()

        # track loss (sum of per-batch mean losses)
        epoch_loss += loss.item()

        # end of epoch
        if train_iter.epoch > epoch:
            ep_et = time.time()
            print("End of epoch {} | time elapsed {:5.3f} | LM loss {:5.3f} | LR {}".
                  format(epoch, (ep_et-ep_st)/60, epoch_loss, optimizer.param_groups[0]['lr']))
            # ReduceLROnPlateau only acts when it is fed the monitored value.
            lr_scheduler.step(epoch_loss)

            # Decode the last batch: greedy predictions next to the reference
            # sentences. argmax over the vocabulary axis keeps the batch axis
            # even when the batch holds a single sentence.
            seq_lens = lengths.tolist()
            predicted_ids = logits.argmax(dim=-1).cpu().tolist()
            reference_ids = src_b.cpu().tolist()
            # There is one prediction fewer than there are input tokens.
            predicted_sentences = [' '.join(src.vocab.itos[idx] for idx in sent[:seq_lens[i] - 1])
                                   for i, sent in enumerate(predicted_ids)]
            reference_sentences = [' '.join(src.vocab.itos[idx] for idx in sent[:seq_lens[i]])
                                   for i, sent in enumerate(reference_ids)]
            print("reference: {}".format(reference_sentences[0]))
            print("predicted: {}".format(predicted_sentences[0]))

Here are my imports:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
import torchtext
import numpy as np
import time

Any ideas why it isn’t learning? I appreciate your help.

If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.
So the init_hiddens_zeros method is unnecessary.

You can try lr = 0.01.

Also, can you show some evidence of what “it’s not working” means (for example, the loss values over a few epochs)?