Hello all
I was trying to build a model with an attention layer. My model without attention overfits a small dataset perfectly, but the one with attention does not. Could someone help me figure out how to fix it, or tell me whether I am implementing the attention correctly?
I am using the same encoder architecture for both models; the only difference is the decoder.
import torch
from torch import nn
import torch.nn.functional as F
class AttentionDecoder(nn.Module):
def __init__(self, nh=256, nclass=13, dropout_p=0.1):
super(AttentionDecoder, self).__init__()
self.hidden_size = nh
self.output_size = nclass
self.dropout_p = dropout_p
self.embedding = nn.Embedding(self.output_size, self.hidden_size)
self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(self.hidden_size, self.hidden_size)
self.out = nn.Linear(self.hidden_size, self.output_size)
        self.vat = nn.Linear(self.hidden_size, 1)  # scores each encoder position for attention
def forward(self, input, hidden, encoder_outputs):
embedded = self.embedding(input)
embedded = self.dropout(embedded)
        # additive attention: the previous hidden state ([1, b, hidden]) is added to
        # every encoder output ([w, b, hidden]) by broadcasting, and each position is scored
        batch_size = encoder_outputs.shape[1]
        alpha = hidden + encoder_outputs
        alpha = alpha.reshape(-1, alpha.shape[-1])
        attn_weights = self.vat(torch.tanh(alpha))                                # [w * b, 1]
        attn_weights = attn_weights.view(-1, 1, batch_size).permute((2, 1, 0))    # [b, 1, w]
        attn_weights = F.softmax(attn_weights, dim=2)
        # weighted sum of the encoder outputs: [b, 1, hidden]
        attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))
        # concatenate the embedded input token with the attention context
        output = torch.cat((embedded, attn_applied.squeeze(1)), -1)               # [b, 2 * hidden]
        output = self.attn_combine(output).unsqueeze(0)                           # [1, b, hidden]
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)                                 # output: [1, b, hidden]
        # take the single time step and project to class scores: [b, nclass]
        output = F.log_softmax(self.out(output[0]), dim=1)
return output, hidden, attn_weights
    def initHidden(self, batch_size):
        # Variable is deprecated; a plain tensor works as the initial hidden state
        return torch.zeros(1, batch_size, self.hidden_size)
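# A minimal shape sanity check for the decoder above (my assumption: encoder
# features of shape [w, b, hidden], which is what the Encoder below produces).
# Running the decoder on random tensors like this is a cheap way to confirm the
# attention bookkeeping before training on real data.
def check_attention_decoder_shapes():
    dec = AttentionDecoder(nh=256, nclass=13)
    enc_out = torch.randn(16, 4, 256)            # [w=16, b=4, hidden=256]
    tokens = torch.zeros(4, dtype=torch.long)    # one start-of-sequence token per batch item
    out, hid, attn = dec(tokens, dec.initHidden(4), enc_out)
    print(out.shape)                             # torch.Size([4, 13])   log-probs over classes
    print(hid.shape)                             # torch.Size([1, 4, 256])
    print(attn.shape)                            # torch.Size([4, 1, 16]) one weight per encoder position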
class Encoder(nn.Module):
def __init__(self, cnnOutSize, nc, nclass, nh, n_rnn=2, leakyRelu=False):
super(Encoder, self).__init__()
ks = [3, 3, 3, 3, 3, 3, 2]
ps = [1, 1, 1, 1, 1, 1, 0]
ss = [1, 1, 1, 1, 1, 1, 1]
nm = [64, 128, 256, 256, 512, 512, 512]
cnn = nn.Sequential()
def convRelu(i, batchNormalization=False):
nIn = nc if i == 0 else nm[i - 1]
nOut = nm[i]
cnn.add_module('conv{0}'.format(i),
nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
if batchNormalization:
cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut))
if leakyRelu:
cnn.add_module('relu{0}'.format(i),
nn.LeakyReLU(0.2, inplace=True))
else:
cnn.add_module('relu{0}'.format(i), nn.ReLU(True))
convRelu(0)
cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2)) # 64x16x64
convRelu(1)
cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2)) # 128x8x32
convRelu(2, True)
convRelu(3)
cnn.add_module('pooling{0}'.format(2),
nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 256x4x16
convRelu(4, True)
convRelu(5)
cnn.add_module('pooling{0}'.format(3),
nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 512x2x16
convRelu(6, True) # 512x1x16
self.cnn = cnn
    def forward(self, input):
        conv = self.cnn(input)                   # [b, c, h, w]
        b, c, h, w = conv.size()
        conv = conv.reshape(b, -1, w)            # collapse channels and height: [b, c * h, w]
        conv = conv.permute(2, 0, 1)             # [w, b, c * h], width is the sequence dimension
        return conv
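# Quick check of the encoder output (assumption: grayscale input, nc=1). Note that the
# flattened feature size is c * h, so it is 512 only when the height collapses to
# h == 1 (e.g. a 32-pixel-high image); whatever it ends up being has to equal the
# decoder's hidden size, otherwise `hidden + encoder_outputs` cannot broadcast.
def check_encoder_shape():
    enc = Encoder(cnnOutSize=512, nc=1, nclass=13, nh=256)
    img = torch.randn(2, 1, 32, 128)             # [b, nc, H, W]
    feats = enc(img)
    print(feats.shape)                           # torch.Size([33, 2, 512]) -> [w, b, c * h]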
class Decoder(nn.Module):
def __init__(self, nIn, nHidden, nOut):
super(Decoder, self).__init__()
self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
self.embedding = nn.Linear(nHidden * 2, nOut)
def forward(self, input):
recurrent, _ = self.rnn(input)
T, b, h = recurrent.size()
t_rec = recurrent.view(T * b, h)
output = self.embedding(t_rec) # [T * b, nOut]
output = output.view(T, b, -1)
return output
class Model(nn.Module):
def __init__(self, cnnOutSize, nc, nclass, nh, n_rnn=2, leakyRelu=False):
super(Model, self).__init__()
self.nh = nh
self.encoder = Encoder(cnnOutSize, nc, nclass, nh)
# self.decoder = Decoder(cnnOutSize, nh, nclass)
self.attentionDecoder = AttentionDecoder(nh, nclass, dropout_p=0.1)
    def forward(self, x):
        conv_output = self.encoder(x)            # [w, b, c * h]
        # start-of-sequence token (index 0) for every item in the batch
        first_word = torch.zeros(conv_output.shape[1], dtype=torch.long, device=x.device)
        decoder_hidden = self.attentionDecoder.initHidden(x.shape[0]).to(x.device)
        decoder_output, decoder_hidden, decoder_attention = self.attentionDecoder(first_word, decoder_hidden, conv_output)
        return decoder_output
# rnn_output = self.decoder(conv_output)
# return rnn_output
def create_model(config):
model = Model(config['cnn_out_size'], config['num_of_channels'], config['num_of_outputs'], 1024)
return model
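In case it helps, here is a minimal sketch of the single-batch overfitting check I have in mind, continuing from the code above. The config values, batch shapes, and the 48-pixel input height are assumptions from my side: the height matters because the encoder's flattened feature size (c * h) has to equal nh=1024 for the `hidden + encoder_outputs` addition in the attention decoder to broadcast, and since the forward pass currently decodes only a single step, I use one target label per image.

# rough single-batch overfitting check (placeholder config and dummy data)
config = {'cnn_out_size': 1024, 'num_of_channels': 1, 'num_of_outputs': 13}
model = create_model(config).cuda()

images = torch.randn(8, 1, 48, 128).cuda()       # one small batch to try to overfit
labels = torch.randint(0, 13, (8,)).cuda()       # one target class per image

criterion = nn.NLLLoss()                         # the decoder returns log-probabilities
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for step in range(200):
    optimizer.zero_grad()
    log_probs = model(images)                    # [b, nclass]
    loss = criterion(log_probs, labels)
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        print(step, loss.item())                 # should approach zero if the model can overfit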