The model with CTC loss learns nothing

I am trying to implement a model that learns to localize actions in a video.
The model is composed of two parts, a feature-extraction part with a CNN and a sequence-recognition part with an RNN, and it uses CTC loss to predict the class at each time step, as in the code below.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class ResCNNEncoder(nn.Module):
    def __init__(self, CNN_embed_dim=128, fc_hidden=(128, 128), drop_p=0.3):
        super(ResCNNEncoder, self).__init__()
        self.CNN_embed_dim = CNN_embed_dim
        self.drop_p = drop_p
        self.fc_hidden1, self.fc_hidden2 = fc_hidden

        resnet = models.resnet18(pretrained=True)
        # replace the first conv so the network accepts single-channel frames
        resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        modules = list(resnet.children())[:-1]  # delete the last fc layer
        self.resnet = nn.Sequential(*modules)

        self.fc1 = nn.Linear(resnet.fc.in_features, self.fc_hidden1)
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)

    def forward(self, x_3d):
        cnn_embed_seq = []

        for t in range(x_3d.size(1)):
            # ResNet CNN on frame t
            x = self.resnet(x_3d[:, t, :, :, :])
            x = F.relu(x)
            x = x.view(x.size(0), -1)  # flatten output of conv

            # FC layers
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = F.relu(self.fc3(x))
            x = F.dropout(x, p=self.drop_p, training=self.training)

            cnn_embed_seq.append(x)

        # (time, batch, feat) -> (batch, time, feat)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose(0, 1)

        return cnn_embed_seq

class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN=256, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM_first = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=1,       
            batch_first=True,    
        )
        
        self.LSTM_mid = nn.LSTM(
            input_size=self.h_RNN,
            hidden_size=self.h_RNN,        
            num_layers=1,       
            batch_first=True,    
        )
        
        self.LSTM_last = nn.LSTM(
            input_size=self.h_RNN,
            hidden_size=self.num_classes,        
            num_layers=1,       
            batch_first=True,    
        )

    def forward(self, x):
        x = F.relu(self.LSTM_first(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_mid(x)[0])
        x = F.relu(self.LSTM_last(x)[0])

        return x

My problem is that after I train for a few iterations, the predictions become all blank or only one class over the whole sequence. It seems the model cannot adjust its weights and learns nothing.

The model's input is a stack of video frames with dimension (1, number of frames in the video, 1, 128, 128).
The output fed to the CTC loss has dimension (number of frames in the video, 1, 40).
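To double-check these shapes, a quick sanity check along the following lines (with an arbitrary 75-frame dummy clip; `enc` and `dec` are just local names for this example) reproduces the same dimensions:

# quick shape sanity check with a dummy 75-frame clip (75 is arbitrary)
enc = ResCNNEncoder(CNN_embed_dim=128, fc_hidden=(128, 128), drop_p=0.3)
dec = DecoderRNN(CNN_embed_dim=128, h_RNN=256, drop_p=0.3, num_classes=40)

dummy = torch.randn(1, 75, 1, 128, 128)          # (batch, frames, channels, H, W)
with torch.no_grad():
    feats = enc(dummy)                           # (1, 75, 128)  per-frame CNN embeddings
    logits = dec(feats)                          # (1, 75, 40)   per-frame class scores
print(logits.view(logits.shape[1], 1, -1).shape) # torch.Size([75, 1, 40]) as fed to CTCLoss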

cnn_encoder = ResCNNEncoder(CNN_embed_dim=128, fc_hidden=(128,128), drop_p=0.3).to(device)
rnn_decoder = DecoderRNN(CNN_embed_dim=128,h_RNN=256, drop_p=0.3, num_classes=40).to(device)

params = list(cnn_encoder.parameters()) + list(rnn_decoder.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)
ctc_loss = nn.CTCLoss(blank=0, reduction='none')

for video_index in range(len(X_train)):
    # load_video returns a stack of frames with shape (1, number of frames, 1, 128, 128)
    X = load_video(data_path, X_train[video_index], (128, 128))
    target = y_train[video_index][0:X.shape[1]]  # target sequence for the loss, excluding blank (0)

    X = X / 255

    X = torch.tensor(X)
    target = torch.tensor(target).view(1, -1)

    X = X.to(device=device, dtype=torch.float)
    target = target.to(device=device, dtype=torch.long)

    optimizer.zero_grad()

    output = rnn_decoder(cnn_encoder(X))
    output = output.view(output.shape[1], 1, -1)  # (T, N, C) layout expected by CTCLoss (N = 1 here)

    log_probs = output.log_softmax(2)

    input_lengths = torch.full(size=(1,), fill_value=output.shape[0], dtype=torch.long)
    target_lengths = torch.full(size=(1,), fill_value=target.shape[1], dtype=torch.long)

    step_loss = ctc_loss(log_probs, target, input_lengths, target_lengths)
    step_loss.backward()
    optimizer.step()
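For reference, this is roughly how I inspect the predictions mentioned above (a minimal greedy decode sketch, assuming blank index 0 as configured in CTCLoss): take the per-frame argmax, collapse repeated labels, and drop blanks.

# rough greedy decode to inspect predictions (assumes blank index 0, as in CTCLoss above)
with torch.no_grad():
    output = rnn_decoder(cnn_encoder(X))         # (1, T, 40) for the last loaded video X
    best_path = output.argmax(dim=2).squeeze(0)  # most likely class per frame, shape (T,)

decoded = []
prev = None
for label in best_path.tolist():
    if label != prev and label != 0:             # collapse repeats, then drop blanks
        decoded.append(label)
    prev = label
print(decoded)                                   # ends up empty or containing only one class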

I tried changing the optimizer and decreasing/increasing the learning rate, but the problem stays the same.

Best regards,
Pannattee