Different training results when training a simple LSTM in Keras and PyTorch

Hello guys, I'm trying to port my LSTM model from Keras to PyTorch, but the results in PyTorch are really bad at the moment. The network is really simple, as shown below.

# Keras reference model: one 10-unit LSTM layer, four 10-unit tanh Dense
# layers, and a single linear output unit, trained with MSE loss and Adam.
model = Sequential()
# input_shape is (timesteps, features); it replaces the deprecated
# input_length/input_dim pair with the same meaning.
# NOTE(review): assumes `shape` is (samples, timesteps, features) - confirm.
model.add(LSTM(10, input_shape=(shape[1], shape[2])))
# Without return_sequences the LSTM passes on only its last hidden state,
# i.e. 10 features per sample.
model.add(Dense(10, activation="tanh"))
model.add(Dense(10, activation="tanh"))
model.add(Dense(10, activation="tanh"))
model.add(Dense(10, activation="tanh"))
# Final model output: one value per sample.
model.add(Dense(1, activation="linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()

I then migrated it to PyTorch as follows:

class LSTM(nn.Module):
    """LSTM encoder followed by a small tanh MLP head.

    The recurrent layer summarises the input sequence into one feature
    vector per sample, which is then passed through four tanh-activated
    linear layers of width 10 and a final linear projection to
    ``output_dim``. All parameters are stored in float64, so inputs must be
    double-precision tensors.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state (per direction).
        num_layers: Number of stacked LSTM layers.
        output_dim: Size of the final output vector.
        bilstm: If True, use a bidirectional LSTM.
    """

    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int,
                 output_dim: int, bilstm: bool = False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.isBi = bilstm
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            bidirectional=bilstm,
        ).double()
        # Optional experiment kept from the original: orthogonal weight init.
        # for name, param in self.lstm.named_parameters():
        #     if name.startswith("weight"):
        #         nn.init.orthogonal_(param)
        #     else:
        #         pass

        # A bidirectional LSTM yields forward and backward features side by
        # side, so the first linear layer must accept twice the hidden size.
        lstm_out_dim = hidden_dim * 2 if bilstm else hidden_dim

        self.fc1 = nn.Sequential(nn.Linear(lstm_out_dim, 10).double(), nn.Tanh())
        self.final_layer1 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer2 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer3 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer4 = nn.Sequential(nn.Linear(10, output_dim).double())

    def forward(self, x):
        """Run the network on a batch of sequences.

        Args:
            x: Double-precision tensor of shape (batch, seq_len, input_dim);
                the LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        out, (hn, _) = self.lstm(x)
        if self.isBi:
            # At the last time step the backward direction has processed only
            # one element, so take each direction's *final* hidden state from
            # the top layer instead: hn[-2] (forward) and hn[-1] (backward),
            # laid out as (batch, 2 * hidden_dim).
            out = hn[-2:].transpose(0, 1).flatten(1)
        else:
            # Last time step of the top layer, (batch, hidden_dim).
            out = out[:, -1, :]
        out = self.fc1(out)
        out = self.final_layer1(out)
        out = self.final_layer2(out)
        out = self.final_layer3(out)
        out = self.final_layer4(out)
        return out

The results are really bad. I was wondering whether the initialization methods or activation functions used in Keras differ from the ones I used in PyTorch (Keras seems to use hard_sigmoid where PyTorch uses sigmoid?).

I would really appreciate it if somebody could help me with this problem!