I have been building a deep learning model based on attention. First I built a seq2seq model for the task, which works well and gives good results. Then I added an attention mechanism, and now the loss is not decreasing. I have tried various solutions available online, but none helped. Could someone help me find where the issue could be?
The model architecture:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

class AttentionDecoder(nn.Module):
    def __init__(self, nIn, hidden_size, output_size, dropout_p=0.1):
        super(AttentionDecoder, self).__init__()
        # layers and layers

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input)
        batch_size = encoder_outputs.shape[1]
        # additive attention: score each encoder timestep against the current hidden state
        alpha = hidden + encoder_outputs
        alpha = alpha.reshape(-1, alpha.shape[-1])
        attn_weights = self.vat(torch.tanh(alpha))
        attn_weights = attn_weights.view(-1, 1, batch_size).permute((2, 1, 0))
        attn_weights = F.softmax(attn_weights, dim=2)
        # weighted sum of encoder outputs: (batch, 1, T) x (batch, T, hidden) -> (batch, 1, hidden)
        attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))
        output = torch.cat((embedded, attn_applied.squeeze(1)), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        output = F.softmax(output, dim=1)
        return output, hidden, attn_weights

    def initHidden(self, batch_size):
        result = Variable(torch.zeros(1, batch_size, self.hidden_size))
        return result
class Decoder(nn.Module):
    def __init__(self, nIn, nHidden, nOut):
        super(Decoder, self).__init__()
        # LSTM layers defined

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        # flatten (T, b, h) to (T*b, h) before the linear layer
        # (assumed; t_rec was undefined in the snippet as posted)
        T, b, h = recurrent.size()
        t_rec = recurrent.view(T * b, h)
        output = self.embedding(t_rec)
        return output
class Encoder(nn.Module):
    def __init__(self, cnnOutSize, nc, nclass, nh):
        super(Encoder, self).__init__()
        # CNN and RNN layers defined

    def forward(self, input):
        conv = self.cnn(input)
        encoder_outputs = self.rnn(conv)
        return encoder_outputs
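To sanity-check the attention shapes, here is a minimal standalone version of the scoring math from AttentionDecoder.forward. It assumes hidden is (1, batch, hidden_size), encoder_outputs is (T, batch, hidden_size), and self.vat is a Linear(hidden_size, 1) layer; those shapes are my assumptions about the elided layers, not something the snippet confirms:

import torch
import torch.nn.functional as F
from torch import nn

T, batch, hidden_size = 5, 8, 1024            # assumed sizes
hidden = torch.zeros(1, batch, hidden_size)
encoder_outputs = torch.randn(T, batch, hidden_size)
vat = nn.Linear(hidden_size, 1)               # assumed scoring layer

alpha = hidden + encoder_outputs              # broadcasts to (T, batch, hidden_size)
alpha = alpha.reshape(-1, alpha.shape[-1])    # (T*batch, hidden_size)
scores = vat(torch.tanh(alpha))               # (T*batch, 1)
scores = scores.view(-1, 1, batch).permute((2, 1, 0))   # (batch, 1, T)
attn_weights = F.softmax(scores, dim=2)       # normalized over the T encoder steps
attn_applied = torch.matmul(attn_weights, encoder_outputs.permute((1, 0, 2)))
print(attn_applied.shape)                     # torch.Size([8, 1, 1024])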
The train script:
from __future__ import print_function
from builtins import range
import json
import character_set
import sys
import hw_dataset
from hw_dataset import HwDataset
import attention_model_from_scratch_modded as model
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.autograd import Variable
from warpctc_pytorch import CTCLoss
import error_rates
import string_utils
config_path = sys.argv[1]
with open(config_path) as f:
    config = json.load(f)
idx_to_char, char_to_idx = character_set.load_char_set(config['character_set_path'])
NUM_CHANNELS = 3
NUM_HIDDEN_LAYERS = 1024
NUM_OUTPUTS = len(idx_to_char)+1  # +1 for the CTC blank label
BATCH_SIZE = 8
NUM_WORKERS = 1
SOS_LABEL = 1
EOS_LABEL = 2
device = ""
if torch.cuda.is_available():
device = 'cuda'
dtype = torch.cuda.FloatTensor
print("Using GPU")
else:
device = 'cpu'
dtype = torch.FloatTensor
print("No GPU detected")
train_dataset = HwDataset(config['training_set_path'], char_to_idx, img_height=config['network']['input_height'], root_path=config['image_root_directory'], augmentation=True)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=hw_dataset.collate)
test_dataset = HwDataset(config['validation_set_path'], char_to_idx, img_height=config['network']['input_height'], root_path=config['image_root_directory'])
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=hw_dataset.collate)
print('Size of Train dataset: ', len(train_dataset))
print('Size of Test dataset: ', len(test_dataset))
criterion = CTCLoss()
encoder = model.Encoder(config['network']['cnn_out_size'], NUM_CHANNELS, NUM_OUTPUTS, NUM_HIDDEN_LAYERS)
decoder = model.AttentionDecoder(config['network']['cnn_out_size'], NUM_HIDDEN_LAYERS, NUM_OUTPUTS)
encoder.to(device)
decoder.to(device)
criterion.to(device)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=config['network']['learning_rate'])
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=config['network']['learning_rate'])
def cer_loss_func(x, out):
    # calculates Character Error Rate

def validation(encoder, decoder, criterion):
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = False
        d.requires_grad = False
    encoder.eval()
    decoder.eval()
    for x in test_dataloader:
        # processing: unpack line_imgs, text, etc. from x
        encoder_output = encoder(line_imgs)
        decoder_input = text[0].cuda()
        decoder_hidden = decoder.initHidden(BATCH_SIZE).cuda()
        decoded_labels = []
        for di in range(1, text.shape[0]):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_output)
            preds_size = Variable(torch.IntTensor([decoder_output.size(0)] * decoder_output.size(1)))
            decoder_input = text[di].cuda()
    sum_loss, steps = cer_loss_func(x, decoded_labels)
    return sum_loss, steps
def train(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer):
    for e, d in zip(encoder.parameters(), decoder.parameters()):
        e.requires_grad = True
        d.requires_grad = True
    encoder.train()
    decoder.train()
    for x in train_dataloader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        # processing: unpack input, text, label_lengths from x
        encoder_output = encoder(input)
        decoder_input = torch.tensor([1]*BATCH_SIZE).cuda()  # SOS_LABEL for every batch item
        decoder_hidden = decoder.initHidden(BATCH_SIZE).cuda()
        loss = 0.0
        decoded_labels = []
        for di in range(1, text.shape[0]):
            # teacher forcing: feed the ground-truth character at each step
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_output)
            preds_size = Variable(torch.IntTensor([decoder_output.size(0)] * decoder_output.size(1)))
            loss += criterion(decoder_output, text[di], preds_size, label_lengths)
            decoder_input = text[di].cuda()
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
    sum_loss, steps = cer_loss_func(x, decoded_labels)
    return sum_loss, steps
def main():
    lowest_loss = float('inf')
    for epoch in range(1000):
        print('\nStarting for epoch:', (epoch+1))
        sum_loss, steps = train(encoder, decoder, criterion, encoder_optimizer, decoder_optimizer)
        print("Training CER", sum_loss / steps)
        sum_loss, steps = validation(encoder, decoder, criterion)
        print("Testing CER", sum_loss / steps)
        if lowest_loss > sum_loss/steps:
            lowest_loss = sum_loss/steps
            print("Saving Best")
            dirname = os.path.dirname(config['model_save_path'])
            if len(dirname) > 0 and not os.path.exists(dirname):
                os.makedirs(dirname)
            torch.save(encoder.state_dict(), 'encoder.pt')
            torch.save(decoder.state_dict(), 'decoder.pt')

if __name__ == "__main__":
    main()
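The body of cer_loss_func is elided above; roughly, it accumulates the character error rate over the decoded strings. A minimal sketch of the idea, assuming the imported error_rates module exposes a cer(gt, pred) helper returning a per-line error rate (that helper name is an assumption based on the import, not confirmed code):

def cer_loss_func_sketch(ground_truths, predictions):
    # accumulate per-line CER; the caller averages sum_loss / steps
    sum_loss = 0.0
    steps = 0
    for gt, pred in zip(ground_truths, predictions):
        sum_loss += error_rates.cer(gt, pred)  # assumed helper from error_rates
        steps += 1
    return sum_loss, steps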
I am using the CTCLoss() function and also the CER (character error rate), since I am generating string output.
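For reference, my understanding of warp-ctc's CTCLoss calling convention is the following (a minimal sketch with made-up sizes; note that this binding expects raw, unnormalized activations of shape (T, batch, num_classes), since it applies softmax internally):

import torch
from warpctc_pytorch import CTCLoss

ctc = CTCLoss()  # blank label is index 0; softmax is applied inside the loss

T, batch, num_classes = 50, 8, 80                   # made-up sizes
acts = torch.randn(T, batch, num_classes)           # raw logits, no softmax/log_softmax
labels = torch.IntTensor([3, 5, 5, 7, 3, 5, 7, 9])  # all targets concatenated into one 1D tensor
act_lens = torch.IntTensor([T] * batch)             # output length per batch item
label_lens = torch.IntTensor([1] * batch)           # target length per batch item
loss = ctc(acts, labels, act_lens, label_lens)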
Can someone help me figure out the errors or issues due to which the loss isn't decreasing?