Attention Model not overfitting on a small dataset

Hello all
I am trying to build a model with an attention layer. My model without attention perfectly overfits a small dataset, but the one with attention doesn't. Could someone help me fix it, or tell me whether I am doing it correctly?
I am using the same encoder architecture for both models; the only difference is the decoder.

import torch 
from torch import nn
import torch.nn.functional as F

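# Decoder with additive attention over the encoder outputs; predicts one token per forward call.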
class AttentionDecoder(nn.Module):
    def __init__(self, nh=256, nclass=13, dropout_p=0.1):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = nh
        self.output_size = nclass
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

        self.vat = nn.Linear(self.hidden_size, 1)

    def forward(self, input, hidden, encoder_outputs):
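        # input: [b] token indices, hidden: [1, b, nh], encoder_outputs: [seq_len, b, nh]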
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)

        # Additive attention: score each encoder time step against the current hidden state
        batch_size = encoder_outputs.shape[1]
        alpha = hidden + encoder_outputs                                 # [seq_len, b, nh], hidden broadcasts over seq_len
        alpha = alpha.reshape(-1, alpha.shape[-1])                       # [seq_len * b, nh]
        attn_weights = self.vat(torch.tanh(alpha))                       # [seq_len * b, 1]
        attn_weights = attn_weights.view(-1, 1, batch_size).permute((2, 1, 0))  # [b, 1, seq_len]
        attn_weights = F.softmax(attn_weights, dim=2)

        # Weighted sum of the encoder outputs: [b, 1, seq_len] @ [b, seq_len, nh] -> [b, 1, nh]
        attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))

#         output = torch.cat((embedded, attn_applied ), -1)
        output = torch.cat((embedded, attn_applied.squeeze(1)), -1)   # [b, nh * 2]
        output = self.attn_combine(output).unsqueeze(0)               # [1, b, nh]

        output = F.relu(output)
        output = output.squeeze(2)
        output, hidden = self.gru(output, hidden)
        output = output.unsqueeze(0)

        output = F.log_softmax(self.out(output[0]), dim=1)   
        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        # Variable is deprecated; a plain tensor works as the initial hidden state
        return torch.zeros(1, batch_size, self.hidden_size)


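# CNN feature extractor (CRNN-style): pools the height down to 1 and returns the features as a sequence of shape [w, b, c].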
class Encoder(nn.Module):
    def __init__(self, cnnOutSize, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(Encoder, self).__init__()
        
        ks = [3, 3, 3, 3, 3, 3, 2]
        ps = [1, 1, 1, 1, 1, 1, 0]
        ss = [1, 1, 1, 1, 1, 1, 1]
        nm = [64, 128, 256, 256, 512, 512, 512]
        
        cnn = nn.Sequential()
        
        def convRelu(i, batchNormalization=False):
            nIn = nc if i == 0 else nm[i - 1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i),
                           nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batchNormalization:
                cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
            if leakyRelu:
                cnn.add_module('relu{0}'.format(i),
                               nn.LeakyReLU(0.2, inplace=True))
            else:
                cnn.add_module('relu{0}'.format(i), nn.ReLU(True))
        convRelu(0)
        cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2))  # 64x16x64
        convRelu(1)
        cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2))  # 128x8x32
        convRelu(2, True)
        convRelu(3)
        cnn.add_module('pooling{0}'.format(2),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 256x4x16
        convRelu(4, True)
        convRelu(5)
        cnn.add_module('pooling{0}'.format(3),
                       nn.MaxPool2d((2, 2), (2, 1), (0, 1)))  # 512x2x16
        convRelu(6, True)  # 512x1x16

        self.cnn = cnn
        self.softmax = nn.LogSoftmax(dim=-1)  # defined here but not used in forward

    def forward(self, input):
        # print(input.shape)
        conv = self.cnn(input)
        # print('After Encoder Shape: ', conv.shape)
        b, c, h, w = conv.size()
        conv = conv.reshape(b, -1, w)
        # print(conv.shape)
        conv = conv.permute(2, 0, 1)  # [w, b, c]
        # print(conv.shape)
        return conv


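# Baseline (non-attention) decoder: bidirectional LSTM followed by a per-time-step linear projection to the classes.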
class Decoder(nn.Module):

    def __init__(self, nIn, nHidden, nOut):
        super(Decoder, self).__init__()
        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)
        output = self.embedding(t_rec)  # [T * b, nOut]
        output = output.view(T, b, -1)
        return output
    
    
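# Full model: CNN encoder followed by the attention decoder (the plain RNN decoder is kept commented out for comparison).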
class Model(nn.Module):
    def __init__(self, cnnOutSize, nc, nclass, nh, n_rnn=2, leakyRelu=False):
        super(Model, self).__init__()
        self.nh = nh
        self.encoder = Encoder(cnnOutSize, nc, nclass, nh)
#         self.decoder = Decoder(cnnOutSize, nh, nclass)
        self.attentionDecoder = AttentionDecoder(nh, nclass, dropout_p=0.1)  
        
    def forward(self, x):
        conv_output = self.encoder(x)  # [w, b, c]
#         print(conv_output.shape)
        # First decoder input: token index 0 for every sample in the batch
        first_word = torch.zeros(conv_output.shape[1], dtype=torch.long, device=x.device)
        decoder_hidden = self.attentionDecoder.initHidden(x.shape[0]).to(x.device)
        decoder_output, decoder_hidden, decoder_attention = self.attentionDecoder(
            first_word, decoder_hidden, conv_output)
        return decoder_output
#         rnn_output = self.decoder(conv_output)
#         return rnn_output
        

def create_model(config):
    model = Model(config['cnn_out_size'], config['num_of_channels'], config['num_of_outputs'], 1024)
    return model
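
In case it helps, the overfit check itself is roughly the following (a minimal sketch, not my exact script; the model, batch, targets, loss, and learning rate are placeholders, and it assumes the model returns [batch, nclass] log-probabilities):

import torch
from torch import nn

def overfit_check(model, images, targets, steps=500, lr=1e-3):
    # Repeatedly fit one small, fixed batch; the loss should go to ~0 if the model can memorize it.
    criterion = nn.NLLLoss()                       # the decoder already returns log_softmax outputs
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    for step in range(steps):
        optimizer.zero_grad()
        log_probs = model(images)                  # assumed shape: [batch, nclass]
        loss = criterion(log_probs, targets)
        loss.backward()
        optimizer.step()
        if step % 50 == 0:
            print(step, loss.item())

Without attention this drives the loss close to zero; with the attention decoder it plateaus.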

@ptrblck Could you help me out?

I cannot spot anything obviously wrong, but I would start by checking all shapes, as your current code uses quite a few permutations and reshapes.
Try to add comments with the dimensions to each line and make sure that the layers are getting the tensors in the expected shapes.
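For example, a quick standalone walk-through of the attention step with random tensors would look something like this (the sizes below are just placeholders standing in for your real ones):

import torch

seq_len, batch, nh = 16, 4, 256                     # placeholder sizes

encoder_outputs = torch.randn(seq_len, batch, nh)   # [w, b, c] coming from the encoder
hidden = torch.zeros(1, batch, nh)                  # initHidden
vat = torch.nn.Linear(nh, 1)

alpha = hidden + encoder_outputs                    # [seq_len, b, nh]
scores = vat(torch.tanh(alpha.reshape(-1, nh)))     # [seq_len * b, 1]
attn_weights = scores.view(-1, 1, batch).permute((2, 1, 0))   # [b, 1, seq_len]
attn_weights = torch.softmax(attn_weights, dim=2)
attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))  # [b, 1, nh]

print(alpha.shape, scores.shape, attn_weights.shape, attn_applied.shape)

If any of the printed shapes differ from the comments, that is the first place to look.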

I've checked that; the layers are getting tensors in the expected shapes.