Model remembers only one example with the Adadelta optimizer

I have an attention-based encoder-decoder architecture for doing speech recognition.
In order to make sure that everything is fine with the model, I tried to train it on a couple of samples and test it on the same ones, to see whether it would remember them. What I noticed is that it only remembers one sentence.
For example, if I use 4 samples and a batch size of 4, the model predicts only one sentence and repeats it for all the other samples. This doesn't make sense, because the loss gets really low (below 0), so I would assume that it has learned all the samples.

First, I thought that maybe during training the loss function gets only one example and not the whole batch, but when I checked the size it seemed OK.
Then I thought that something might be wrong with the decoding phase, so I removed the beam search decoding and added a greedy decoding strategy, but the issue remained.

After a lot of debugging, I noticed that this happens only with the Adadelta optimizer. If I use Adam, for example, the model manages to remember all the examples and reaches a word error rate of 0.
So, I assume that either something is wrong with Adadelta, or I am not using it properly.

My model is defined as:

class Encoder(nn.Module):
    """Bidirectional LSTM encoder over padded feature sequences.

    The forward and backward halves of the LSTM output are summed, so the
    encoder emits ``hidden_size`` features per frame rather than
    ``2 * hidden_size``.

    Args:
        input_tensor: Size of one input feature vector (used as the LSTM
            input size).
        hidden_size: Number of hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        batch_size: Stored on the instance; not read by this class.
        device: Stored on the instance; not read by this class.
    """

    def __init__(self, input_tensor, hidden_size, num_layers, batch_size, device):
        super(Encoder, self).__init__()

        self.input_tensor = input_tensor
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.dropout = nn.Dropout(0.1)
        self.device = device

        # NOTE(review): the original constructor call was cut off after its
        # first argument. The remaining arguments are reconstructed from
        # usage: forward() splits the output at ``hidden_size`` and sums both
        # halves, which requires a bidirectional LSTM with this hidden size.
        # ``num_layers`` is assumed to be passed through. Any further options
        # (e.g. inter-layer dropout) could not be recovered -- confirm.
        self.lstm = nn.LSTM(self.input_tensor,
                            self.hidden_size,
                            num_layers=self.num_layers,
                            bidirectional=True)

    def forward(self, input_tensor, input_feature_lengths):
        """Encode a padded batch of feature sequences.

        Args:
            input_tensor: Padded input batch. ``pack_padded_sequence`` is
                called with its defaults, so the batch is expected to be
                time-major and ordered by decreasing length.
            input_feature_lengths: True length of every sequence in the batch.

        Returns:
            A tuple ``(output, hidden)``: the re-padded encoder output with
            both directions summed (dropout applied), and the final LSTM state
            as returned by ``nn.LSTM``.
        """
        # Pack so the LSTM does not run over padding frames.
        input_tensor = pack_padded_sequence(input_tensor, input_feature_lengths)
        output, hidden = self.lstm(input_tensor)
        # pad_packed_sequence returns (padded_output, lengths); keep the tensor.
        output = pad_packed_sequence(output)[0]
        # Merge the two directions by summing the forward and backward halves.
        output = output[:, :, :self.hidden_size] + output[:, :, self.hidden_size:]
        output = self.dropout(output)
        return output, hidden

# Attention decoder
class Decoder(nn.Module):
    """LSTM decoder step with dot-product attention over the encoder output.

    Each call embeds the incoming tokens, runs them through the LSTM, attends
    over ``encoder_output`` with a dot-product score, and projects the
    concatenated ``[context, lstm_output]`` features to log-probabilities over
    ``output_size`` classes.

    Args:
        embedding_dim: Size of the token embeddings (LSTM input size).
        hidden_size: LSTM hidden size; must equal the feature size of
            ``encoder_output`` for the dot-product score to be defined.
        output_size: Vocabulary size (embedding rows and output classes).
        num_layers: Number of stacked LSTM layers.
        encoder_num_layers: Stored on the instance; not read by this class.
        batch_size: Stored on the instance; not read by this class.
        attention_type: Stored on the instance; not read by this class.
        device: Stored on the instance; not read by this class.
    """

    def __init__(self, embedding_dim, hidden_size, output_size, num_layers, encoder_num_layers, batch_size, attention_type, device):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.encoder_num_layers = encoder_num_layers
        self.batch_size = batch_size
        self.attention_type = attention_type
        self.dropout = nn.Dropout(0.1)
        self.device = device

        self.embedding = nn.Embedding(output_size, embedding_dim)

        # NOTE(review): the original constructor call was cut off after its
        # first argument. The remaining arguments are reconstructed from
        # usage: the LSTM output is concatenated with a ``hidden_size``-wide
        # context vector and fed to a ``hidden_size*2`` linear layer, so the
        # LSTM must be unidirectional with this hidden size. ``num_layers`` is
        # assumed to be passed through -- confirm.
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_size,
                            num_layers=self.num_layers)
        self.out = nn.Linear(self.hidden_size*2, self.output_size)

    def forward(self, input_tensor, decoder_hidden, encoder_output):
        """Run one decoding step.

        Args:
            input_tensor: Token indices; assumed to be (batch, steps) since the
                embedding is permuted to put the step axis first -- confirm.
            decoder_hidden: Initial ``(h, c)`` state passed to the LSTM.
            encoder_output: Encoder features; assumed to be
                (time, batch, hidden_size) -- confirm against the encoder.

        Returns:
            A tuple ``(output, lstm_hidden)``: log-probabilities for the first
            decoding step, and the updated LSTM state.
        """
        embedding = self.embedding(input_tensor)
        # Move the step axis first, as expected by a non-batch-first LSTM.
        embedding = embedding.permute(1, 0, 2)
        # --- multiplicative attention ---
        lstm_output, lstm_hidden = self.lstm(embedding, decoder_hidden)
        scores = self.dot_attention_score(encoder_output, lstm_output)
        scores = scores.permute(1, 0, 2)
        # Normalise over axis 0, which after the permute is the encoder axis.
        # NOTE(review): no sequence lengths are available here, so padded
        # encoder frames also receive attention weight -- consider masking.
        attn_weights = F.softmax(scores, dim=0)
        context = torch.bmm(attn_weights.permute(1, 2, 0), encoder_output.permute(1, 0, 2))
        context = context.permute(1, 0, 2)
        # The source line was garbled here; the concatenation below is the
        # form that matches the hidden_size*2 input of ``self.out``.
        output = torch.cat((context, lstm_output), -1)
        # Only the first decoding step is projected.
        output = self.out(output[0])
        # --- end multiplicative attention ---

        # NOTE(review): dropout is applied to the logits themselves, which
        # randomly zeroes class scores during training -- confirm this is
        # intended rather than dropout on the features before ``self.out``.
        output = self.dropout(output)
        output = F.log_softmax(output, 1)

        return output, lstm_hidden

    def dot_attention_score(self, encoder_output, lstm_output):
        """Return batched dot products between encoder frames and decoder states.

        Both inputs are permuted to put axis 1 first before ``torch.bmm``; the
        result has the encoder axis second and the decoder-step axis last.
        """
        scores = torch.bmm(encoder_output.permute(1, 0, 2), lstm_output.permute(1, 2, 0))
        return scores

The way that I am initializing the optimizers is:

# One Adadelta optimizer per sub-network; no arguments are passed, so both
# run with the library's default hyperparameters.
encoder_optimizer, decoder_optimizer = (
    optim.Adadelta(network.parameters()) for network in (encoder, decoder)
)

Has anyone experienced something similar with Adadelta, or could the issue be somewhere else in the code?