Need suggestions for my pointer generator network implementation

Hi! I am trying to implement pointer generator networks for text summarization: https://arxiv.org/abs/1704.04368
I built the network on top of the attention model provided in the PyTorch tutorials. The dataset is just the first 30k pairs of this: https://www.kaggle.com/sunnysai12345/news-summary
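
For context, my data preparation looks roughly like this (a minimal sketch; the CSV filename, encoding, column names and special-token indices are placeholders, not exactly what is in my notebook):

import pandas as pd

# Sketch only: filename, encoding and column names are placeholders for the Kaggle files.
df = pd.read_csv('news_summary_more.csv', encoding='latin-1')
pairs = list(zip(df['text'].astype(str), df['headlines'].astype(str)))[:30000]  # first 30k (article, summary) pairs

word2Index_enc, ind2Word_enc = {}, {}          # article-side vocabulary (~47k words)
word2Index_dec = {'<START>': 0, '<END>': 1}    # summary-side vocabulary (~21k words); actual indices may differ
for article, summary in pairs:
  for word in article.split():
    if word not in word2Index_enc:
      index = len(word2Index_enc)
      word2Index_enc[word] = index
      ind2Word_enc[index] = word
  for word in summary.split():
    if word not in word2Index_dec:
      word2Index_dec[word] = len(word2Index_dec)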

Here is my Encoder model:


import random

import numpy as np
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Encoder(nn.Module):
  def __init__(self,input_vocab_size,hidden_size):
    super(Encoder,self).__init__()
    self.hidden_size = hidden_size
    self.input_vocab_size = input_vocab_size
    self.embedding_layer = nn.Embedding(self.input_vocab_size,embedding_dim = self.hidden_size)
    self.gru_layer = nn.GRU(input_size = self.hidden_size,hidden_size = self.hidden_size)

  def forward(self,input,prev_hidden_state):
    # reshape to (seq_len=1, batch=1, hidden_size) since the GRU expects a 3-D input
    embedded_outputs = self.embedding_layer(input).view(1,1,-1)
    output,prev_hidden_state = self.gru_layer(embedded_outputs,prev_hidden_state)  # output: (1, 1, hidden_size)
    return output,prev_hidden_state

  def init_hidden(self):
    return torch.zeros(1,1,self.hidden_size,device=device)
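
Each source token is pushed through the encoder one step at a time; the loop inside train() further down does exactly this:

encoder_hidden = encoder.init_hidden()
encoder_outputs = torch.zeros(encoder_max_length, encoder.hidden_size, device=device)
for i in range(len(encoder_tens)):
  encoder_output, encoder_hidden = encoder(encoder_tens[i], encoder_hidden)
  encoder_outputs[i] = encoder_output[0, 0]   # keep the (hidden_size,) vector for attention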

And here is my Attention Decoder:

class AttentionDecoder(nn.Module):
  def __init__(self,output_vocab_size,hidden_size,max_length_encoder,dropout_value,num_layers=1):
      super(AttentionDecoder,self).__init__()
      self.hidden_size = hidden_size
      self.num_layers = num_layers
      self.output_vocab_size = output_vocab_size
      self.dropout_p = dropout_value
      self.max_length_encoder = max_length_encoder
      self.embedding_layer = nn.Embedding(self.output_vocab_size,self.hidden_size)
      self.attention_layer = nn.Linear(self.hidden_size*2,self.max_length_encoder)
      self.attention_combine = nn.Linear(self.hidden_size*2,self.hidden_size)
      self.Wsxe = nn.Linear(self.hidden_size*3,1)
      self.gru_layer = nn.GRU(self.hidden_size,self.hidden_size)
      self.output_layer = nn.Linear(self.hidden_size,self.output_vocab_size)
      self.dropout_layer = nn.Dropout(self.dropout_p)    

  def forward(self,input_,prev_hidden_state,encoder_output):
      embedding_outputs = self.embedding_layer(input_).view(1,1,-1)
      embeddings_dropout = self.dropout_layer(embedding_outputs)
      attention_layer_output = self.attention_layer(torch.cat((embeddings_dropout[0],prev_hidden_state[0]),1))
      attention_weights = nn.functional.softmax(attention_layer_output,dim=1)
      attention_applied = torch.bmm(attention_weights.unsqueeze(0),encoder_output.unsqueeze(0))
      # p_gen input: [embedded input; previous hidden state; attention context]
      concatenate_vector = torch.cat((embeddings_dropout[0],prev_hidden_state[0],attention_applied[0]),1)
      concatenate_vector_outputs = self.Wsxe(concatenate_vector)
      pgen = torch.sigmoid(concatenate_vector_outputs)
      attention_combine_logits = self.attention_combine(torch.cat((embeddings_dropout[0],attention_applied[0]),1)).unsqueeze(0)  # unsqueeze back to (1, 1, hidden_size) since the GRU expects a 3-D input
      attention_combine_relu = nn.functional.relu(attention_combine_logits)
      output,hidden = self.gru_layer(attention_combine_relu,prev_hidden_state)
      output_logits = self.output_layer(output)
      output_softmax = nn.functional.log_softmax(output_logits[0],dim=1)
      return output_softmax,hidden,attention_weights,pgen

  def init_hidden(self):
    return torch.zeros(1,1,self.hidden_size,device=device)
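
For reference, the Wsxe layer above is my attempt at Equation 8 of the paper, which computes the generation probability from the attention context h*_t, the decoder state s_t and the decoder input x_t:

p_gen = sigmoid(w_h*^T h*_t + w_s^T s_t + w_x^T x_t + b_ptr)

In my forward() I concatenate the embedded input, the previous hidden state and the attention context (each of size hidden_size) and pass them through the single Linear(hidden_size*3, 1) followed by a sigmoid, so the weights and bias of Wsxe play the role of w_x, w_s, w_h* and b_ptr. Note that I compute p_gen before the GRU step, so I use the previous hidden state rather than the current decoder state.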

The pointing/copying part is handled inside the training function further down.
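
Concretely, what I am trying to reproduce is Equation 9 of the paper, where the final distribution over the extended vocabulary is

P(w) = p_gen * P_vocab(w) + (1 - p_gen) * sum over i with w_i = w of a_i^t

so decoder-vocabulary words that also appear in the source get both the generation and the copy term, source-only (OOV) words get only the copy term, and decoder-vocabulary words absent from the source get only the generation term. The loops over extended_probab in the train() function below are meant to build this combination.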

teacher_forcing_ratio = 0.5
def train(encoder,decoder,encoder_tens,decoder_tens,criterion,encoder_max_length,encoder_optimizer,decoder_optimizer):
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()
  
  encoder_hidden_state = encoder.init_hidden()
  input_length = len(encoder_tens)
  output_length = len(decoder_tens)
  
  encoder_outputs = torch.zeros(encoder_max_length,encoder.hidden_size,device=device)

  loss = 0
  for encoder_index in range(input_length):
    encoder_output,encoder_hidden_state = encoder(encoder_tens[encoder_index],encoder_hidden_state)
    encoder_outputs[encoder_index] = encoder_output[0,0]

  decoder_input = torch.tensor([[word2Index_dec['<START>']]],device=device)
  decoder_hidden_state = encoder_hidden_state
  use_teacher_forcing = True if random.random() > teacher_forcing_ratio else False

  encoder_indices  = encoder_tens.tolist()
  encoder_words = [ind2Word_enc[index[0]] for index in encoder_indices]

  # source words that are also in the decoder vocabulary (can be generated or copied)
  common_words = [word for word in encoder_words if word in word2Index_dec]

  # extended vocabulary: decoder vocabulary plus the OOV words of this source article
  extended_vocab_w2i = dict(word2Index_dec)
  for word in encoder_words:
    if word not in extended_vocab_w2i:
      extended_vocab_w2i[word] = len(extended_vocab_w2i)
  extended_vocab_i2w = {index: word for word, index in extended_vocab_w2i.items()}

  extended_probab = [0]*len(extended_vocab_w2i)
  common_words_indices = {extended_vocab_w2i[word] for word in common_words}  # set: used only for membership tests

  if use_teacher_forcing:
    for decoder_index in range(output_length):
      decoder_output,decoder_hidden_state,attention_weights,pgen = decoder(decoder_input,decoder_hidden_state,encoder_outputs)
      pgen = pgen.item()
      decoder_output_list = decoder_output.tolist()

      # attention weight for each source word, keyed by its extended-vocab index
      # (later occurrences of a repeated word overwrite earlier ones)
      attention_mapping = {}
      attention_weights_list = attention_weights.tolist()[0]
      for i in range(len(encoder_indices)):
        encoder_word = encoder_words[i]
        extended_vocabulary_index = extended_vocab_w2i[encoder_word]
        attention_mapping[extended_vocabulary_index] = attention_weights_list[i]
    
      # Eq. 9: combine the generation and copy probabilities over the extended vocabulary
      for i in range(len(extended_probab)):
        if i < len(word2Index_dec):
          if i in common_words_indices:
            extended_probab[i] = pgen*np.exp(decoder_output_list[0][i]) + (1-pgen)*attention_mapping[i]
          else:
            extended_probab[i] = pgen*np.exp(decoder_output_list[0][i])
        else:
          extended_probab[i] = (1-pgen)*attention_mapping[i]

      extended_outputs = torch.tensor(extended_probab,device=device)
      loss+=criterion(decoder_output,decoder_tens[decoder_index])
      decoder_input = decoder_tens[decoder_index]
  else:
      for decoder_index in range(output_length):
        decoder_output,decoder_hidden_state,attention_weights,pgen = decoder(decoder_input,decoder_hidden_state,encoder_outputs)
        pgen = pgen.item()
        decoder_output_list = decoder_output.tolist()

        # attention weight for each source word, keyed by its extended-vocab index
        attention_mapping = {}
        attention_weights_list = attention_weights.tolist()[0]
        for i in range(len(encoder_indices)):
          encoder_word = encoder_words[i]
          extended_vocabulary_index = extended_vocab_w2i[encoder_word]
          attention_mapping[extended_vocabulary_index] = attention_weights_list[i]
        # same Eq. 9 combination as in the teacher-forcing branch
        for i in range(len(extended_probab)):
          if i < len(word2Index_dec):
            if i in common_words_indices:
              extended_probab[i] = pgen*np.exp(decoder_output_list[0][i]) + (1-pgen)*attention_mapping[i]
            else:
              extended_probab[i] = pgen*np.exp(decoder_output_list[0][i])
          else:
            extended_probab[i] = (1-pgen)*attention_mapping[i]

        extended_probab[0] = 0
        extended_outputs = torch.tensor(extended_probab,device=device)
        loss+=criterion(decoder_output,decoder_tens[decoder_index])
        topv, topi = extended_outputs.topk(1)
        decoder_input = topi.squeeze().detach()
        # a copied OOV word has no decoder embedding, so fall back to index 0 before feeding it back in
        if decoder_input.item() >= len(word2Index_dec):
          decoder_input = torch.tensor(0, dtype=torch.long, device=device)
        if decoder_input.item() == word2Index_dec['<END>']:
          break

  loss.backward()
  encoder_optimizer.step()
  decoder_optimizer.step()      
  return loss.item()/output_length
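
For completeness, this is roughly how I drive train(), using the pairs and vocabulary dicts from the preprocessing sketch above (the hidden size, dropout, max length, criterion and the tensor_from_words helper here are placeholders/assumptions; Adam and lr=0.03 are what I actually use):

hidden_size = 256                 # placeholder
encoder_max_length = 400          # placeholder for the max article length I clip to

encoder = Encoder(input_vocab_size=len(word2Index_enc), hidden_size=hidden_size).to(device)
decoder = AttentionDecoder(output_vocab_size=len(word2Index_dec), hidden_size=hidden_size,
                           max_length_encoder=encoder_max_length, dropout_value=0.1).to(device)

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.03)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.03)
criterion = nn.NLLLoss()          # matches the log_softmax output of the decoder

def tensor_from_words(words, word2Index):
  # hypothetical helper: (seq_len, 1) LongTensor of word indices, unknown words dropped
  indices = [[word2Index[w]] for w in words if w in word2Index]
  return torch.tensor(indices, dtype=torch.long, device=device)

for article, summary in pairs:
  encoder_tens = tensor_from_words(article.split()[:encoder_max_length], word2Index_enc)
  decoder_tens = tensor_from_words(summary.split() + ['<END>'], word2Index_dec)
  loss = train(encoder, decoder, encoder_tens, decoder_tens, criterion,
               encoder_max_length, encoder_optimizer, decoder_optimizer)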

My vocabulary size is 21k for the decoder and 47k for the encoder.
When I train this model, the learning rate is 0.03 and the optimizer is Adam. It takes a very long time to train: the loss starts at about 7.5, and it took me 7 hours on a Google Colab GPU to bring it down to 5.5. Can someone please help me out?