My model is not learning

My code runs fine, but the model does not learn regardless of the hyperparameters I use, and I am not sure where it is going wrong. I am facing the same issue with a different model that also uses CTC loss. Am I not understanding CTCLoss() correctly?
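
For reference, this is a minimal standalone example of how I understand the nn.CTCLoss() interface (the sizes T, N, C, S below are just placeholders, not my real values): it expects log-probabilities of shape (T, N, C), integer targets, and the per-sample input and target lengths.

import torch
import torch.nn as nn

# placeholder sizes: max input length, batch size, number of classes (incl. blank), max target length
T, N, C, S = 50, 4, 35, 10

ctc = nn.CTCLoss(blank=0, reduction='mean')

# log-probabilities over the class dimension, shape (T, N, C)
log_probs = torch.randn(T, N, C).log_softmax(2)

# targets as class indices 1..C-1 (0 is reserved for the blank), shape (N, S)
targets = torch.randint(1, C, (N, S), dtype=torch.long)

input_lengths = torch.full((N,), T, dtype=torch.long)             # length of each input sequence
target_lengths = torch.randint(1, S + 1, (N,), dtype=torch.long)  # length of each target sequence

loss = ctc(log_probs, targets, input_lengths, target_lengths)
print(loss.item())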

Here is my model:

class Bi_RNN(nn.Module):

    def __init__(self, input_dim1, input_dim2, hidden_dim, batch_size, output_dim=35, num_layers=2, rnn_type='LSTM'):
        super(Bi_RNN, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers

        # Define the initial linear hidden layers
        self.init_linear1 = nn.Linear(self.input_dim1, self.input_dim1)
        self.init_linear2 = nn.Linear(self.input_dim2, self.input_dim2)

        # Define the LSTM layers (rnn_type selects the recurrent class, e.g. nn.LSTM or nn.GRU)
        rnn_cls = getattr(nn, rnn_type)
        self.lstm1 = rnn_cls(self.input_dim1, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        self.lstm2 = rnn_cls(self.input_dim2, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        self.lstm3 = rnn_cls(self.hidden_dim * 2 * 2, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        
        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim * 2 , output_dim)
        
        self.log_softmax = nn.LogSoftmax(dim=1)

    def init_hidden(self):
        # This is what we'll initialise our hidden state as
        return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))

    def forward(self, input1, input2):
        #Forward pass through initial hidden layer
        linear_input1 = self.init_linear1(input1)
        linear_input2 = self.init_linear2(input2)    

        # Forward pass through the two LSTM branches
        # shape of lstm_out: [batch_size, seq_len, hidden_dim * 2] (bidirectional)
        # shape of self.hidden: (h_n, c_n), where both have
        # shape (num_layers * 2, batch_size, hidden_dim).
        lstm_out1, self.hidden1 = self.lstm1(linear_input1)
        lstm_out2, self.hidden2 = self.lstm2(linear_input2)

        # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
        lstm_out3 = torch.cat((lstm_out1, lstm_out2), 2)
        lstm_out, self.hidden3 = self.lstm3(lstm_out3)
        
        y_pred = self.log_softmax(self.linear(lstm_out))
        return y_pred
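
To make the shapes concrete, this is roughly how I instantiate the model and check its output shape (the dimensions below are placeholders, not my actual values):

# placeholder dimensions, just to sanity-check the forward pass
input_dim1, input_dim2, hidden_dim = 40, 20, 128
batch_size, seq_len, num_classes = 4, 50, 35

model = Bi_RNN(input_dim1, input_dim2, hidden_dim, batch_size, output_dim=num_classes)

x1 = torch.randn(batch_size, seq_len, input_dim1)
x2 = torch.randn(batch_size, seq_len, input_dim2)

y_pred = model(x1, x2)
print(y_pred.shape)  # torch.Size([4, 50, 35]) -> (batch_size, seq_len, num_classes)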

I am using the Adam optimizer and CTC loss:

model = Bi_RNN(input_dim1, input_dim2, hidden_dim, num_layers, num_classes).to(device)

criterion = nn.CTCLoss(blank=0, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

This is my training loop:

for e in tqdm(range(1, EPOCHS+1)):
    
    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    for X1_train_batch, X2_train_batch, y_train_batch, X_train_lens, y_train_lens in train_loader:
        X1_train_batch, X2_train_batch = X1_train_batch.to(device), X2_train_batch.to(device)
        y_train_batch, X_train_lens, y_train_lens = y_train_batch.to(device), X_train_lens.to(device), y_train_lens.to(device)
        
        y_train_pred = model(X1_train_batch, X2_train_batch)
        y_train_batch = torch.squeeze(y_train_batch)
    
        T = MAX_SEQ_LEN
        N = BATCH_SIZE
        C = num_classes

        pred_len = torch.full((N,), T, dtype=torch.long) 
        y_train_pred_trans = y_train_pred.permute(1, 0, 2)
        
        train_loss = criterion(y_train_pred_trans, y_train_batch, pred_len, y_train_lens)
        train_acc = multi_acc(y_train_pred, y_train_batch, y_train_lens)
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()
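
In case it helps with debugging, this is the kind of check I can add right after train_loss.backward() to see whether gradients are actually reaching the parameters (just a sketch):

# sketch: print gradient norms right after train_loss.backward()
# to confirm that every parameter actually receives a gradient
for name, param in model.named_parameters():
    if param.grad is None:
        print(f'{name}: no gradient')
    else:
        print(f'{name}: grad norm = {param.grad.norm().item():.6f}')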