Keras model reimplemented in PyTorch doesn't learn

I was a Keras user and have now begun migrating my work to PyTorch. However, I've noticed that some models which learn normally in Keras don't learn at all (the loss doesn't decrease) when I implement and train them in PyTorch. I suspect there may be a bug in my code, since I'm using exactly the same model structure, hyperparameters, optimizer, and even initializer...

For example, here is the Keras model for a basic LSTM network:

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dropout, Dense, Input

def Word2VecModel(embedding_matrix, num_words, embedding_dim, seq_length,
                  num_hidden_lstm,
                  output_dim,
                  dropout_rate):
    print("Creating text model...")
    model = Sequential()

    # Frozen pretrained embedding layer
    model.add(Embedding(num_words, embedding_dim,
        weights=[embedding_matrix], input_length=seq_length, trainable=False))

    model.add(LSTM(units=num_hidden_lstm, return_sequences=True,
                   input_shape=(seq_length, embedding_dim)))
    model.add(Dropout(dropout_rate))

    model.add(LSTM(units=num_hidden_lstm, return_sequences=False))
    model.add(Dropout(dropout_rate))

    model.add(Dense(output_dim, activation='relu'))

    return model

lstm_model = Word2VecModel(embedding_matrix, num_words, embedding_dim,
                           seq_length, num_hidden_lstm, output_dim,
                           dropout_rate)

print("Creating Blind LSTM model")
y_input = Input(shape=(seq_length,), name='y_input')
lstm_result = lstm_model(y_input)

d1 = Dense(128, activation='tanh')(lstm_result)
d2 = Dense(num_classes, activation='softmax')(d1)

new_model = Model(inputs=[y_input], outputs=d2)

new_model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
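For completeness, the Keras model is then trained with `model.fit`; a minimal sketch, where `x_train` and `y_train` are assumed placeholders (`y_train` one-hot encoded for `categorical_crossentropy`), and the epoch and batch-size values match those reported below. Note that `fit` shuffles the training data every epoch by default (`shuffle=True`), which turns out to be relevant later.

# Minimal sketch of the training call; x_train and y_train are assumed
# placeholders. Keras shuffles the training data each epoch by default.
new_model.fit(x_train, y_train, epochs=20, batch_size=64)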

The PyTorch model:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        # Frozen pretrained embedding layer (400001 tokens, 300-dim vectors)
        self.embed = nn.Embedding(400001, 300)
        embedding_matrix = ebd.load()
        print(embedding_matrix.shape)
        self.embed.weight.data = torch.Tensor(embedding_matrix)
        self.embed.weight.requires_grad = False

        self.lstm1 = nn.LSTM(300, 128, 1, batch_first=True)
        self.lstm2 = nn.LSTM(128, 128, 1, batch_first=True)

        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 28)

    def forward(self, question):
        embed = self.embed(question)

        # Pass training=self.training so dropout is disabled in eval mode;
        # unlike nn.Dropout, F.dropout is otherwise always active.
        lstm_out, _ = self.lstm1(embed)
        lstm_out = F.dropout(lstm_out, 0.5, training=self.training)

        lstm_out, _ = self.lstm2(lstm_out)
        lstm_out = F.dropout(lstm_out, 0.5, training=self.training)

        # Keep only the last time step, matching return_sequences=False in Keras
        x = lstm_out[:, -1]
        x = F.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = self.fc3(x)  # raw logits; nn.CrossEntropyLoss applies softmax internally
        return x
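As an aside, a more idiomatic way to load frozen pretrained weights into the embedding layer is `nn.Embedding.from_pretrained`, which copies the weights and freezes them by default. A minimal sketch, assuming `embedding_matrix` is a NumPy array (the random matrix here is just a stand-in):

import numpy as np
import torch
import torch.nn as nn

# Stand-in for the real pretrained matrix (400001 tokens, 300-dim vectors).
embedding_matrix = np.random.randn(400001, 300).astype(np.float32)

# from_pretrained copies the weights and freezes them (freeze=True is the
# default, equivalent to setting requires_grad = False by hand).
embed = nn.Embedding.from_pretrained(torch.from_numpy(embedding_matrix))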


from torch.utils.data import DataLoader
from tqdm import tqdm

def train(epoch):
    training_set = My_Data2(split='val')
    train_set = DataLoader(
        training_set, batch_size=batch_size, num_workers=1
    )

    pbar = tqdm(train_set)
    moving_acc = 0  # running average of per-batch accuracy

    net.train(True)
    for iter_id, (question, q_len, answer) in enumerate(pbar):

        q_len = q_len.tolist()
        question = question.type(torch.LongTensor)

        question, answer = (
            question.to(device),
            answer.to(device),
        )

        net.zero_grad()
        output = net(question)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()

        # Per-batch accuracy
        correct = (output.detach().argmax(1) == answer).float().sum() / batch_size

        if moving_acc == 0:
            moving_acc = correct
        else:
            moving_acc = (moving_acc * iter_id + correct) / (iter_id + 1)

        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Current_Acc: {:.5f}; Total_Acc: {:.5f}'.format(
                epoch + 1, loss.item(), correct, moving_acc
            )
        )
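The `valid` function called below isn't shown; what follows is a hypothetical sketch of a matching evaluation loop, not my actual code. The important parts are `net.eval()` and `torch.no_grad()`, so the dropout layers are disabled during evaluation (the `split='test'` name for `My_Data2` is a guess):

def valid(epoch):
    # Hypothetical evaluation loop; My_Data2(split='test') is an assumption.
    valid_set = DataLoader(My_Data2(split='test'), batch_size=batch_size)
    net.eval()  # disables dropout (via training=self.training in forward)
    n_correct, n_total = 0, 0
    with torch.no_grad():
        for question, q_len, answer in valid_set:
            question = question.type(torch.LongTensor).to(device)
            answer = answer.to(device)
            output = net(question)
            n_correct += (output.argmax(1) == answer).sum().item()
            n_total += answer.size(0)
    acc = n_correct / n_total
    print('Epoch %d validation accuracy: %.4f' % (epoch + 1, acc))
    return acc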

import torch.optim as optim

batch_size = 64
n_epoch = 20
dim = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-4)

for epoch in range(n_epoch):
    print('========== %d epoch ==========' % epoch)
    train(epoch)
    acc = valid(epoch)
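One difference from Keras worth noting here: `nn.CrossEntropyLoss` expects raw logits and integer class indices, whereas Keras's `categorical_crossentropy` expects softmax outputs and one-hot targets. That is why `forward` returns `self.fc3(x)` without a softmax. A minimal illustration:

import torch
import torch.nn as nn

# nn.CrossEntropyLoss combines log-softmax and negative log-likelihood:
# it takes raw logits of shape (batch, num_classes) and integer labels of
# shape (batch,), not one-hot vectors.
logits = torch.randn(4, 28)          # raw scores for 28 classes
labels = torch.randint(0, 28, (4,))  # integer class indices
loss = nn.CrossEntropyLoss()(logits, labels)
print(loss.item())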

The Keras model converges in 20 epochs (85% train accuracy, 65% test accuracy), while the PyTorch model's loss fluctuates and its accuracy hovers around 40%. (Since the training data is imbalanced, even random guessing would give about 40%.)

Is there any problem in my implementation? Any obvious bugs that prevent the model from learning?
Thanks!

Update: problem identified. The data needs to be shuffled in the training DataLoader.
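For reference, the one-line fix is to pass `shuffle=True` when constructing the training `DataLoader` (Keras's `fit` shuffles each epoch by default, so this never came up before):

# Shuffle the training data each epoch, matching Keras's default behavior.
train_set = DataLoader(
    training_set, batch_size=batch_size, num_workers=1, shuffle=True
)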