PyTorch attention model not training

I have been building a deep learning model based on attention. First I built a plain seq2seq model for the same task, which works well and gives good results. Then I added an attention mechanism, and now the loss is not decreasing. I have tried various solutions available online, but none helped. Could someone help me find where the issue could be?

The model architecture:

import torch 
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

class AttentionDecoder(nn.Module):
    def __init__(self, nIn, hidden_size, output_size, dropout_p=0.1):
        super(AttentionDecoder, self).__init__()
        #layers and layers

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input)        

        batch_size = encoder_outputs.shape[1]
        alpha = hidden + encoder_outputs    
        alpha = alpha.reshape(-1, alpha.shape[-1])
        attn_weights = self.vat( torch.tanh(alpha))
        attn_weights = attn_weights.view(-1, 1, batch_size).permute((2,1,0))
        attn_weights = F.softmax(attn_weights, dim=2)
        attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))  
        output = torch.cat((embedded, attn_applied.squeeze(1) ), 1)  
        output = self.attn_combine(output).unsqueeze(0)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        output = F.softmax(output, dim=1)
        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        result = Variable(torch.zeros(1, batch_size, self.hidden_size))
        return result
    
class Decoder(nn.Module):

    def __init__(self, nIn, nHidden, nOut):
        super(Decoder, self).__init__()
        # LSTM layers defined

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)  # flatten time and batch before the linear projection
        output = self.embedding(t_rec)
        return output

class Encoder(nn.Module):
    def __init__(self, cnnOutSize, nc, nclass, nh):
        super(Encoder, self).__init__()
        # CNN and RNN layers defined
    def forward(self, input):
        conv = self.cnn(input)
        encoder_outputs = self.rnn(conv) 
        return encoder_outputs
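
For reference, here is a minimal, self-contained sketch (not the actual model) that reproduces the attention-weight computation from forward() with dummy tensors, assuming encoder_outputs has shape (T, B, H) and hidden has shape (1, B, H); self.vat is replaced by a stand-in nn.Linear(H, 1), and the T/B/H values are made-up examples:

import torch
import torch.nn.functional as F

T, B, H = 50, 8, 1024                       # example sequence length, batch size, hidden size (assumed)
hidden = torch.zeros(1, B, H)               # decoder hidden state, as produced by initHidden()
encoder_outputs = torch.randn(T, B, H)      # encoder output sequence
vat = torch.nn.Linear(H, 1)                 # stand-in for self.vat

alpha = hidden + encoder_outputs            # broadcasts to (T, B, H)
alpha = alpha.reshape(-1, alpha.shape[-1])  # (T*B, H)
attn_weights = vat(torch.tanh(alpha))       # (T*B, 1)
attn_weights = attn_weights.view(-1, 1, B).permute((2, 1, 0))  # (B, 1, T)
attn_weights = F.softmax(attn_weights, dim=2)                  # softmax over the T encoder steps
attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))  # (B, 1, H)
print(attn_weights.shape, attn_applied.shape)  # torch.Size([8, 1, 50]) torch.Size([8, 1, 1024])

With these assumed input shapes, the softmax is taken over the encoder time steps, so each batch element gets one attention distribution of length T.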

The train script:

from __future__ import print_function
from builtins import range

import json
import character_set
import sys
import hw_dataset
from hw_dataset import HwDataset
import attention_model_from_scratch_modded as model
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.autograd import Variable
from warpctc_pytorch import CTCLoss
import error_rates
import string_utils

config_path = sys.argv[1]

with open(config_path) as f:
    config = json.load(f)
    

idx_to_char, char_to_idx = character_set.load_char_set(config['character_set_path'])

NUM_CHANNELS = 3
NUM_HIDDEN_LAYERS = 1024
NUM_OUTPUTS = len(idx_to_char)+1
BATCH_SIZE = 8
NUM_WORKERS = 1
SOS_LABEL = 1
EOS_LABEL = 2

device = ""

if torch.cuda.is_available():
    device = 'cuda'
    dtype = torch.cuda.FloatTensor
    print("Using GPU")
else:
    device = 'cpu'
    dtype = torch.FloatTensor
    print("No GPU detected")

train_dataset = HwDataset(config['training_set_path'], char_to_idx, img_height=config['network']['input_height'], root_path=config['image_root_directory'], augmentation=True)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=hw_dataset.collate)

test_dataset = HwDataset(config['validation_set_path'], char_to_idx, img_height=config['network']['input_height'], root_path=config['image_root_directory'])
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=hw_dataset.collate)

print('Size of Train dataset: ', len(train_dataset))
print('Size of Test dataset: ', len(test_dataset))


criterion = CTCLoss()
encoder = model.Encoder(config['network']['cnn_out_size'], NUM_CHANNELS, NUM_OUTPUTS, NUM_HIDDEN_LAYERS)
decoder = model.AttentionDecoder(config['network']['cnn_out_size'], NUM_HIDDEN_LAYERS, NUM_OUTPUTS)
encoder.to(device)
decoder.to(device)
criterion.to(device)


encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=config['network']['learning_rate'])
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=config['network']['learning_rate'])


def cer_loss_func(x, out):
    #calculates Character Error rate

def validation(encoder, decoder, criterion):
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    for x in test_dataloader:
        #processing
        encoder_output = encoder(line_imgs)
        decoder_input = text[0].cuda()
        decoder_hidden = decoder.initHidden(BATCH_SIZE).cuda()
        decoded_labels = []
        for di in range(1, text.shape[0]):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_output)
            preds_size = Variable(torch.IntTensor([decoder_output.size(0)] * decoder_output.size(1)))
            decoder_input = text[di].cuda()
        sum_loss, steps = cer_loss_func(x, decoded_labels)
    return sum_loss, steps  
        
def train(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer):
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = True
        d.requires_grad = True
    encoder.train()
    decoder.train()
    for x in train_dataloader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        #processing
        encoder_output = encoder(input)
        decoder_input = torch.tensor([SOS_LABEL] * BATCH_SIZE).cuda()  # start-of-sequence tokens
        decoder_hidden = decoder.initHidden(BATCH_SIZE).cuda()
        loss = 0.0
        decoded_labels = []
        for di in range(1, text.shape[0]):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_output)
            preds_size = Variable(torch.IntTensor([decoder_output.size(0)] * decoder_output.size(1)))
            loss += criterion(decoder_output, text[di], preds_size, label_lengths)
            decoder_input = text[di].cuda()
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        sum_loss, steps = cer_loss_func(x, decoded_labels)
    return sum_loss, steps
        
    
def main():
    lowest_loss = float('inf')
    for epoch in range(1000):
        print('\nStarting for epoch:', (epoch+1))
        
        sum_loss, steps = train(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer)
        print("Training CER", sum_loss / steps)
        sum_loss, steps = validation(encoder, decoder, criterion)
        print("Testing CER", sum_loss / steps)
        
        if lowest_loss > sum_loss/steps:
            lowest_loss = sum_loss/steps
            print("Saving Best")
            dirname = os.path.dirname(config['model_save_path'])
            if len(dirname) > 0 and not os.path.exists(dirname):
                os.makedirs(dirname)

            torch.save(encoder.state_dict(), 'encoder.pt')
            torch.save(decoder.state_dict(), 'decoder.pt')

if __name__ == "__main__":
    main()

I am using CTCLoss() as the training criterion and the character error rate (CER) as the evaluation metric, since the model generates string output.
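
For context, the implementation of cer_loss_func is elided above; the character error rate is just the edit distance between the predicted string and the ground-truth string, normalized by the ground-truth length. Below is a minimal illustrative sketch of such a helper (the names and signature are hypothetical and are not the actual cer_loss_func):

def edit_distance(a, b):
    # classic dynamic-programming Levenshtein distance between two strings
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                # deletion
                           cur[j - 1] + 1,             # insertion
                           prev[j - 1] + (ca != cb)))  # substitution
        prev = cur
    return prev[-1]

def char_error_rate(pred, target):
    # CER = edit distance normalized by the ground-truth length
    return edit_distance(pred, target) / max(len(target), 1)

print(char_error_rate("helo world", "hello world"))  # 1 edit / 11 chars ≈ 0.09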

Can someone help me figure out which errors or issues might be causing the loss not to decrease?