LSTM has constant loss and is not learning

I am building a model to classify news articles (AG News dataset).

criterion = nn.CrossEntropyLoss()

My vocab size is ~33k. The loss and the accuracy stay the same (around 1.3 loss and 26% accuracy) even after 20 epochs. Am I doing something wrong here?
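
For context, a loss stuck around 1.3 with ~26% accuracy on a 4-class problem is essentially what chance-level (uniform) predictions produce:

import math

# A 4-class classifier that guesses uniformly gives cross-entropy -ln(1/4) and ~25% accuracy
print(math.log(4))  # ~1.386, about the loss reported above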

class NewsClassifier(nn.Module):
  def __init__(self, vocab_weights = None, rnn_type = 'LSTM', vocab_size = len(vocab.vocab), n_classes = 4, embed_size = 300,
               rnn_units = 512, n_layers = 2, bi_dir = True, rnn_drop = 0.0, padding_index = vocab['<unk>']):
    super().__init__()
    self.rnn_units = rnn_units
    self.n_classes = n_classes
    self.rnn_type = rnn_type
    if vocab_weights is not None: # a tensor/array has no unambiguous truth value, so compare against None
      self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(vocab_weights))
    else:
      self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx = padding_index)
    if rnn_type == 'LSTM':
      self.rnn = nn.LSTM(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
    elif rnn_type == 'GRU':
      self.rnn = nn.GRU(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
    else:
      raise NotImplementedError
    self.fc = nn.Linear(2 * rnn_units if bi_dir else rnn_units, self.n_classes)
  
  def forward(self, data, lens):
    x_embed = self.embedding(data) # (padded_lens, batch_size, embed_dim)
    x_packed = pack_padded_sequence(x_embed, lens.cpu(), enforce_sorted = False) #packing sequences and passing to RNN unit
    if self.rnn_type == 'LSTM':
      output_packed, (hidden,cell) = self.rnn(x_packed) #output is packed and cannot be fed to linear layers
    else:
      output_packed, hidden = self.rnn(x_packed) #For GRU there is only hidden state
    #Even though n layers are stacked, `output` only contains the top (last) layer's outputs
    output_padded, _ = pad_packed_sequence(output_packed) #output is padded to be fed to linear layer (padded_lens, batch size, hidden_units)
    #Picking only the last output --> equivalent to return_sequences = False in Keras
    out_reduced = torch.cat((output_padded[-1, :, : self.rnn_units], output_padded[-1, :, self.rnn_units :]), 1) 
    return self.fc(out_reduced)
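
For clarity on the "last output" comments: pad_packed_sequence also returns the per-sequence lengths as its second value, so gathering each sequence's own final timestep from output_padded would look roughly like the sketch below (illustrative only, not part of the model above):

def last_valid_step(output_padded, out_lens):
  # out_lens is returned on the CPU by pad_packed_sequence; move it alongside the outputs
  time_idx = (out_lens - 1).to(output_padded.device)
  batch_idx = torch.arange(output_padded.size(1), device = output_padded.device)
  return output_padded[time_idx, batch_idx]  # (batch_size, num_directions * rnn_units)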

model = NewsClassifier()
print(f'The total number of trainable parameters are : {sum(p.numel() for p in model.parameters() if p.requires_grad)}')
def train(model, iterator = trainDataloader, optimizer = optimizer, loss_fn = criterion):
  e_loss = e_acc = i = 0
  model.train()
  for inputs, leng, labels in iterator:
    inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
    optimizer.zero_grad()
    preds = model(inputs, leng).squeeze(1)
    loss = loss_fn(preds, labels.long())
    acc = accuracy(preds, labels)
    loss.backward()
    optimizer.step()
    e_loss += loss.item()
    e_acc += acc.item()
    i += 1
  return e_loss/i, e_acc/i

def predict(model, iterator = testDataloader, loss_fn = criterion):
  e_loss = e_acc = i = 0
  model.eval()
  with torch.no_grad():
    for inputs, leng, labels in iterator:
      inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
      preds = model(inputs, leng).squeeze(1)
      loss = loss_fn(preds, labels.long())
      acc = accuracy(preds, labels)
      e_loss += loss.item()
      e_acc += acc.item()
      i += 1
  return e_loss/i, e_acc/i
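
The accuracy helper used in train and predict is essentially an argmax comparison like this (a minimal sketch, not the exact code I used):

def accuracy(preds, labels):
  # Fraction of samples whose highest-scoring class matches the true label
  return (preds.argmax(dim = 1) == labels).float().mean()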

N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model)
    valid_loss, valid_acc = predict(model)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
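
epoch_time just splits the elapsed wall-clock seconds into minutes and seconds, roughly:

def epoch_time(start_time, end_time):
  # Split elapsed seconds into whole minutes and leftover seconds
  elapsed = end_time - start_time
  mins = int(elapsed // 60)
  secs = int(elapsed - mins * 60)
  return mins, secs
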
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
encode_news = lambda x : vocab(tokenize(x))

import numpy as np

def rnn_inputs(batch):
  batch = sorted(batch, key = lambda x : len(x[1]), reverse = True)
  review = [torch.tensor(encode_news(value[1])) for value in batch]
  leng = torch.tensor([len(value) for value in review])
  review = pad_sequence(review, padding_value = 0)
  target = torch.tensor([value[0]-1 for value in batch])
  return review, leng, target

train, test = AG_NEWS()
trainDataloader = DataLoader(train, batch_size = 32, shuffle = True, collate_fn = rnn_inputs)
testDataloader = DataLoader(test, batch_size = 128, shuffle = True, collate_fn = rnn_inputs)

Dataloader and collate function included above for reference.
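
For completeness, vocab and tokenize come from torchtext; the setup looks roughly like this (simplified sketch, not the exact preprocessing code):

from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS
from torchtext.vocab import build_vocab_from_iterator

tokenize = get_tokenizer('basic_english')

# Build the vocabulary from the training split; AG_NEWS yields (label, text) pairs
vocab = build_vocab_from_iterator((tokenize(text) for _, text in AG_NEWS(split = 'train')),
                                  specials = ['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])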