Can‘t convergence as good as Keras or Tensorflow

I‘m a beginner of Pytorch ,and I try to build a lstm acoustic model, I used merlin’s frontend to prepare data,but my result is not as good as keras or tensorflow,here is my code,Did I make any mistakes in building the model and training?

class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_size):
        super(LSTM, self).__init__()
        self.fc1=nn.Linear(embedding_dim,hidden_dim)
        self.fc2=nn.Linear(hidden_dim,hidden_dim)
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,batch_first=True)
        self.hidden2out = nn.Linear(hidden_dim, output_size)
        self.dropout_layer = nn.Dropout(p=0.1)


    def init_hidden(self, batch_size):
        return (autograd.Variable(torch.randn(2, batch_size, self.hidden_dim)).cuda(),
                autograd.Variable(torch.randn(2, batch_size, self.hidden_dim)).cuda())


    def forward(self,input, lengths):
        self.hidden = self.init_hidden(batch_size)
        input1=torch.tanh(self.fc1(input))
        input2=torch.tanh(self.fc2(input1))
        packed_input = pack_padded_sequence(input2, lengths,batch_first=True)
        outputs, (ht, ct) = self.lstm(packed_input, self.hidden)
        opt,_=pad_packed_sequence(outputs,batch_first=True)
        outputs=self.hidden2out(opt)
        return outputs

model =LSTM(ins,ins,outs).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.002)
for epoch in range(25):  # again, normally you would NOT do 300 epochs, it is toy data
    L = 1
    overall_loss = 0

    for iteration in range(int(len(train_x.keys()) / batch_size) + 1):
        x_batch, y_batch, utt_length_batch = get_batch(train_x, train_y,keys_list,iteration,batch_size)
        if utt_length_batch == []:
            continue
        else:
            L += 1
        max_length_batch = max(utt_length_batch)
        x_batch = data_utils.transform_data_to_3d_matrix(x_batch, max_length=max_length_batch, shuffle_data=False)
        y_batch = data_utils.transform_data_to_3d_matrix(y_batch, max_length=max_length_batch, shuffle_data=False)
    #    for i in range(len(x_batch)):
    #        for s in range(len(x_batch[i])):
    #            temp_x_batch[s][i][:]=x_batch[i][s][:]
     #           temp_y_batch[s][i][:]=y_batch[i][s][:]
        inputs = torch.from_numpy(x_batch).float().to(device)
        tags = torch.from_numpy(y_batch).float().to(device)

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.

        # Step 3. Run our forward pass.
        #output,hidden = model(inputs,utt_length_batch)

        pred = model(torch.autograd.Variable(inputs), utt_length_batch)
        loss=criterion(pred,tags)
        loss.backward()
        optimizer.step()
        overall_loss += loss
    print(overall_loss/L)

Maybe there are some minor differences between your PyTorch and Keras/TF code.
Could you post the Keras code and also some dummy input and target tensors, i.e. data = torch.randn(...) so that we could compare the results and debug the code?