I have an attention-based encoder-decoder architecture for doing speech recognition.
To make sure that everything is fine with the model, I tried training it on a couple of samples and testing it on the same ones, to see whether it would memorize them. What I noticed is that it only memorizes one sentence.
For example, if I use 4 samples and a batch size of 4, the model predicts only one sentence and repeats it for all the other samples. This doesn’t make sense, because the loss gets really low (below 0), so I would assume that it has learned all the samples.
At first I thought that maybe the loss function was receiving only one example during training rather than the whole batch, but when I checked the tensor sizes they seemed OK.
Then I thought that something might be wrong with the decoding phase, so I replaced beam search decoding with a greedy decoding strategy, but the issue remained.
After a lot of debugging, I noticed that this happens only with the Adadelta optimizer. If I use Adam, for example, the model manages to memorize all the examples and reaches a word error rate of 0.
So, I assume that either something is wrong with Adadelta, or I am not using it properly.
My model is defined as:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder whose forward and backward outputs are summed.

    Args:
        input_tensor: Size of each input feature vector; passed to ``nn.LSTM``
            as its input size.
        hidden_size: Hidden size of the LSTM, per direction. Also the feature
            size of the encoder output after the two directions are summed.
        num_layers: Number of stacked LSTM layers.
        batch_size: Stored on the instance; not read by this class.
        device: Stored on the instance; not read by this class.
    """

    def __init__(self, input_tensor, hidden_size, num_layers, batch_size, device):
        super(Encoder, self).__init__()
        self.input_tensor = input_tensor
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.dropout = nn.Dropout(0.1)
        self.device = device
        self.lstm = nn.LSTM(
            self.input_tensor,
            self.hidden_size,
            # Use the configured depth so the ``num_layers`` argument is
            # honoured instead of being silently replaced by a constant.
            num_layers=self.num_layers,
            bidirectional=True,
        )

    def forward(self, input_tensor, input_feature_lengths):
        """Encode a padded batch of feature sequences.

        Args:
            input_tensor: Padded input batch. Because ``pack_padded_sequence``
                is called with its defaults, this is interpreted as
                time-major, i.e. (time, batch, features).
            input_feature_lengths: Length of each sequence in the batch.
                NOTE(review): with the default ``enforce_sorted=True`` of
                recent PyTorch versions these must be in descending order --
                confirm against the data loader.

        Returns:
            A tuple ``(output, hidden)``: ``output`` is the padded LSTM output
            with the two directions summed and dropout applied; ``hidden`` is
            the ``(h_n, c_n)`` tuple returned by the LSTM, unchanged.
        """
        packed_input = pack_padded_sequence(input_tensor, input_feature_lengths)
        packed_output, hidden = self.lstm(packed_input)
        output = pad_packed_sequence(packed_output)[0]
        # The last dimension holds [forward; backward] features; summing the
        # two halves brings the feature size back to ``hidden_size``.
        forward_half = output[:, :, :self.hidden_size]
        backward_half = output[:, :, self.hidden_size:]
        output = self.dropout(forward_half + backward_half)
        return output, hidden
# Attention decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder with dot-product attention over encoder states.

    Args:
        embedding_dim: Size of the token embeddings (LSTM input size).
        hidden_size: Hidden size of the unidirectional decoder LSTM. The dot
            product in ``dot_attention_score`` requires the encoder output to
            have the same feature size.
        output_size: Vocabulary size; number of embedding rows and of output
            classes.
        num_layers: Number of stacked decoder LSTM layers.
        encoder_num_layers: Stored on the instance; not read by this class.
        batch_size: Stored on the instance; not read by this class.
        attention_type: Stored on the instance; not read by this class
            (only dot-product attention is implemented here).
        device: Stored on the instance; not read by this class.
    """

    def __init__(self, embedding_dim, hidden_size, output_size, num_layers, encoder_num_layers, batch_size, attention_type, device):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.encoder_num_layers = encoder_num_layers
        self.batch_size = batch_size
        self.attention_type = attention_type
        self.dropout = nn.Dropout(0.1)
        self.device = device
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.lstm = nn.LSTM(
            self.embedding_dim,
            self.hidden_size,
            num_layers=self.num_layers,
            bidirectional=False,
        )
        # Input is the concatenation [context; lstm_output], hence 2 * hidden.
        self.out = nn.Linear(self.hidden_size * 2, self.output_size)

    def forward(self, input_tensor, decoder_hidden, encoder_output):
        """Run one decoding step and return log-probabilities over the vocabulary.

        Args:
            input_tensor: Token ids for the current step.
                NOTE(review): assumed to be (batch, 1) so that the permute
                below yields a time-major (1, batch, embedding) tensor --
                confirm against the training loop.
            decoder_hidden: ``(h, c)`` state passed to the decoder LSTM.
            encoder_output: Encoder states, indexed as (time, batch, hidden)
                by the attention computation below.

        Returns:
            A tuple ``(log_probs, lstm_hidden)``: ``log_probs`` is the
            log-softmax over classes for the first (only) decoded time step;
            ``lstm_hidden`` is the updated LSTM state.
        """
        embedding = self.embedding(input_tensor)
        embedding = embedding.permute(1, 0, 2)

        # --- multiplicative attention ---
        lstm_output, lstm_hidden = self.lstm(embedding, decoder_hidden)
        scores = self.dot_attention_score(encoder_output, lstm_output)
        scores = scores.permute(1, 0, 2)
        # Normalise over dim 0, which is the encoder time axis after the permute.
        attn_weights = F.softmax(scores, dim=0)
        context = torch.bmm(attn_weights.permute(1, 2, 0), encoder_output.permute(1, 0, 2))
        context = context.permute(1, 0, 2)
        features = torch.cat((context, lstm_output), -1)
        # --- end multiplicative attention ---

        # Regularise the features feeding the classifier rather than the
        # logits: dropout on logits randomly zeroes and rescales class scores
        # right before the softmax, which distorts the predicted distribution
        # during training.
        features = self.dropout(features[0])
        output = self.out(features)
        output = F.log_softmax(output, 1)
        return output, lstm_hidden

    def dot_attention_score(self, encoder_output, lstm_output):
        """Return batched dot products between encoder states and the decoder output.

        Both inputs are permuted to batch-first before ``torch.bmm``; the
        result has the batch dimension first.
        """
        scores = torch.bmm(encoder_output.permute(1, 0, 2), lstm_output.permute(1, 2, 0))
        return scores
The way that I am initializing the optimizers is:
# One Adadelta optimizer per sub-network, each left at the library's default
# hyperparameters (no arguments other than the parameters are passed).
encoder_optimizer, decoder_optimizer = (
    optim.Adadelta(module.parameters()) for module in (encoder, decoder)
)
Has anyone experienced something similar with Adadelta, or could the issue be somewhere else in the code?