RNN many-to-many does not converge

Dear PyTorch experts,

I am trying to build an LSTM model for multivariate regression, but the model never converges:

import torch
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: the LSTM expects input of shape (batch, seq_len, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, h0, c0):
        # Forward propagate the LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: (batch_size, seq_length, hidden_size)
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        #out = F.relu(out)
        return out

in_features   = 3
hidden_size   = 128
nb_rnn_layers = 1
nb_classes    = 3

net = RNN(in_features, hidden_size, nb_rnn_layers, nb_classes).to(device)
print(net)
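
As a quick sanity check of the shapes (with batch_first=True the LSTM expects input of shape (batch, seq_len, features)), here is a pass with random data, sizes purely for illustration:

x  = torch.randn(4, 48, in_features).to(device)             # (batch, seq_len, features)
h0 = torch.zeros(nb_rnn_layers, 4, hidden_size).to(device)  # (num_layers, batch, hidden)
c0 = torch.zeros(nb_rnn_layers, 4, hidden_size).to(device)
print(net(x, h0, c0).shape)  # torch.Size([4, 3])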

Here is an example of one (input, target) pair. The input has 48 time steps of 3 features each, zero-padded at the end, and the target is 3 variables:

(tensor([[-45.2884,  11.8089, 294.8820],
         [-45.4233,  11.8189, 295.8110],
         [-45.9532,  11.8584, 299.4550],
         [-46.0674,  11.8667, 300.2410],
         [-46.6647,  11.9099, 304.3450],
         [-46.7999,  11.9195, 305.2740],
         [-47.3948,  11.9620, 309.3650],
         [-47.5298,  11.9717, 310.2930],
         [-49.9462,  12.1501, 326.9390],
         [-50.0599,  12.1581, 327.7240],
         [-50.5900,  12.1942, 331.3760],
         [-50.7246,  12.2029, 332.3040],
         [-51.3173,  12.2397, 336.4020],
         [-51.4513,  12.2479, 337.3300],
         [-52.0412,  12.2824, 341.4230],
         [-52.1542,  12.2888, 342.2090],
         [-58.4874,  12.5248, 393.9820],
         [-58.5616,  12.5250, 394.8920],
         [-59.0036,  12.5226, 400.9410],
         [-59.0617,  12.5216, 401.8450],
         [-59.6704,  12.4927, 415.4860],
         [-59.6920,  12.4901, 416.3720],
         [-59.7806,  12.4674, 422.9640],
         [-59.7824,  12.4639, 423.8400],
         [-59.5155,  12.4011, 437.4980],
         [-59.4805,  12.3966, 438.3500],
         [-59.1347,  12.3609, 444.9350],
         [-59.0766,  12.3565, 445.8370],
         [-57.8999,  12.2859, 459.5360],
         [-57.8125,  12.2814, 460.3470],
         [-57.0362,  12.2443, 466.9300],
         [-56.9168,  12.2389, 467.8540],
         [-11.5856,  10.5807, 606.8630],
         [-11.1563,  10.5598, 607.7730],
         [ -9.4675,  10.4768, 611.3410],
         [ -9.0352,  10.4551, 612.2510],
         [ -7.0042,  10.3541, 616.5160],
         [ -6.5702,  10.3325, 617.4250],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000]]),
 tensor([[-0.1018,  0.1386,  0.9901]]))
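
One thing to note: the trailing rows are all zeros, i.e. the sequences are zero-padded to 48 steps, so out[:, -1, :] in the forward decodes the hidden state computed after the padding rather than at the true last step. If the true lengths were available, packing would be one option; a rough sketch, where lengths is a hypothetical tensor of the true sequence lengths:

from torch.nn.utils.rnn import pack_padded_sequence

# Sketch only: lengths (the true lengths before zero padding) is hypothetical
packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
_, (hn, cn) = net.lstm(packed, (h0, c0))
out = net.fc(hn[-1])   # hidden state at each sequence's true last step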

And here is the training loop (dataloader, batch_size, and device are also defined earlier in the script):
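
criterion and optimizer are created earlier in the script as well; a minimal sketch of that setup, assuming MSELoss for the regression and Adam (the exact choices may differ):

# Sketch only: the loss and optimizer here are assumptions, not necessarily
# what the original script uses
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)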

# Train the model
total_step = len(dataloader)
kEpochs = 100
for epoch in range(kEpochs):
    for i_batch, (inputs, labels) in enumerate(dataloader):
        # Forward pass
        inputs = inputs.reshape(48, batch_size, 3).to(device)   # (48, batch_size, 3)
        labels = labels.reshape(1, batch_size).to(device)       # (1, batch_size)
        h00 = torch.zeros(nb_rnn_layers, inputs.size(0), hidden_size).to(device)
        c00 = torch.zeros(nb_rnn_layers, inputs.size(0), hidden_size).to(device)
        outputs = net(inputs, h00, c00)

        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
            epoch + 1, kEpochs, i_batch + 1, total_step, loss.item()))

I have also used the same architecture for classification on a different task, by putting a sigmoid on top of the linear layer, and it worked perfectly.
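
Roughly, the forward of that classification variant looked like this (a sketch from memory, details may differ):

def forward(self, x, h0, c0):
    out, _ = self.lstm(x, (h0, c0))
    # Sigmoid on top of the linear layer for the classification task
    return torch.sigmoid(self.fc(out[:, -1, :]))
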
What I don't understand is that the same model converges in Keras. Can anyone help me understand where the mistake is?

Thank you very much