The same model produces worse results on pytorch than on tensorflow

Recently I reimplemented in PyTorch a model that I had previously written in TensorFlow. However, even with the same hyper-parameters, the PyTorch implementation is not as good as the TensorFlow one (90% on PyTorch, 92% on TensorFlow).
The same phenomenon occurred when I reimplemented TensorFlow code written by one of my friends, with the same hyper-parameters and model architecture (95% on PyTorch, 98% on TensorFlow).
I hope someone can give me some advice. Thanks.


It seems that someone else is facing a similar problem.

1 Like

If you provide code examples, it will be easier to give advice.

Below is a graph-based parser implemented in PyTorch; refer to "Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations" for details.

# Graph-based dependency parser: embeds words and POS tags, encodes the
# sentence with a GRU, and scores every (position, position) pair with a small
# feed-forward network.
# NOTE(review): this snippet was pasted into a forum post and several lines are
# visibly truncated (unclosed calls, missing prefixes); comments below flag them.
class Parser(torch.nn.Module):
    # NOTE(review): the parameter list is cut off here. The body uses
    # num_layers, hidden_size, word_vocab_size, word_emb_size, pos_vocab_size,
    # pos_emb_size, pre_trained and word_dict, so those are presumably
    # constructor arguments -- TODO confirm against the full source.
    def __init__(self,
        super(Parser, self).__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # Separate lookup tables for word ids and POS-tag ids.
        self.word_embedding = torch.nn.Embedding(word_vocab_size, word_emb_size)
        self.pos_embedding = torch.nn.Embedding(pos_vocab_size, pos_emb_size)

        if pre_trained is not None:
            # Start from small uniform noise; rows for words found in the
            # pre-trained file are overwritten below.
            pre_word_embedding = np.random.uniform(-0.01, 0.01,size=[word_vocab_size, word_emb_size])
            # NOTE(review): the file handle is never closed in the visible code.
            wordvec_file = open(pre_trained)
            wiki_words = {}
            for line in wordvec_file.readlines():
                split = line.split(' ')
                word = split[0]
                # Drops the last space-separated field -- presumably a trailing
                # newline/empty token; verify against the vector file format.
                emb = split[1:-1]
                wiki_words[word] = emb
            for word in word_dict:
                if word in wiki_words:
                    pre_word_embedding[word_dict[word]] = \
                        np.array(list(map(float, wiki_words[word])))
            # NOTE(review): the visible code never copies pre_word_embedding
            # into self.word_embedding; that step (and the start of the call
            # on the next line) may have been lost in the paste.
  'Pre-trained embedding loaded.')

        # NOTE(review): GRU constructor is truncated; the remaining arguments
        # (hidden size, layer count, bidirectionality) are not visible here.
        self.gru = torch.nn.GRU(input_size=word_emb_size + pos_emb_size,
                                # dropout=0.5,

        # Pair scorer: input is the concatenation of two 2*hidden_size vectors.
        # NOTE(review): no non-linearity is visible between the two Linear
        # layers, whereas the TensorFlow version in this thread applies tanh
        # after the first projection -- confirm whether a line is missing.
        self.score = torch.nn.Sequential(
            torch.nn.Linear(2 * 2 * hidden_size, 2 * hidden_size),
            torch.nn.Linear(2 * hidden_size, 1)
            # torch.nn.Linear(2 * hidden_size, hidden_size),
            # torch.nn.ReLU(),
            # torch.nn.Linear(hidden_size, 1),

    # Scores all position pairs for each sentence in the batch.
    # word_indices / postag_indices: nested id lists convertible to LongTensor;
    # seq_lens: per-sentence lengths, passed to pack_padded_sequence.
    def forward(self, word_indices, postag_indices, seq_lens):
        # shape is [batch * max_length]
        word_indices = var(torch.LongTensor(word_indices))
        postag_indices = var(torch.LongTensor(postag_indices))
        batch_size = word_indices.size(0)
        word_embs = self.word_embedding(word_indices)
        postag_embs = self.pos_embedding(postag_indices)
        # NOTE(review): the call name is missing on the next line; presumably
        # a concatenation of the two embeddings along the feature axis.
        joint_embs =[word_embs, postag_embs], dim=2)
        # Presumably swaps batch and time axes, since packing below uses
        # batch_first=False -- TODO confirm.
        joint_embs = joint_embs.t()
        joint_embs = pack_padded_sequence(joint_embs, seq_lens, batch_first=False)

        # NOTE(review): initial hidden state construction is truncated.
        h0 = var(torch.zeros(self.num_layers * 2,
        gru_word, _ = self.gru(joint_embs, h0)

        # Unpack to a padded tensor and swap the first two axes back.
        gru_word = pad_packed_sequence(gru_word)[0]
        gru_word = gru_word.t()

        scores = []
        # Feature width of one encoder output (hidden_size * 2).
        D = self.hidden_size * 2
        for b in range(batch_size):
            # Only the first L (unpadded) positions of sentence b are scored.
            L = seq_lens[b]
            # [[1,2,3],[4,5,6]] -> [[[1,2,3],[4,5,6]], [[1,2,3],[4,5,6]]]
            expanded = gru_word[b][0:L].expand(L, L, D)
            # [[1,2,3],[1,2,3],[4,5,6],[4,5,6]]
            foo = torch.transpose(expanded, 0, 1).contiguous().view(-1, D)
            # [[1,2,3],[4,5,6],[1,2,3],[4,5,6]]
            bar = expanded.contiguous().view(-1, D)
            # NOTE(review): call name missing again; presumably concatenates
            # the two views so each row holds one pair of position vectors.
            foobar =[foo, bar], dim=1)
            b_score = self.score(foobar)
            # One score per pair, arranged as an L x L matrix.
            b_score = b_score.view(L, L)
            # NOTE(review): softmax is called without an explicit dim, so the
            # normalised axis depends on the library default -- verify it
            # matches the axis used by the loss.
            b_score = F.softmax(b_score)

        # NOTE(review): nothing is appended to `scores` in the visible code;
        # the append may have been dropped from the paste.
        return scores

# Build the model and wrap it with the project's `gpu` helper.
# NOTE(review): the constructor call is truncated in this paste.
parser = gpu(Parser(word_vocab_size=len(word_dict),

# Adam with library-default settings; no weight decay is passed here.
# NOTE(review): the TensorFlow version in this thread adds an explicit L2 term
# scaled by `beta` to its loss -- confirm whether that is mirrored elsewhere.
opt = torch.optim.Adam(params=parser.parameters())

# NOTE(review): the training loop body is missing from the paste.
for epoch in range(MAX_EPOCH):

Below is the same model implemented in TensorFlow.

# TensorFlow graph-construction script for the same parser: embeddings ->
# bidirectional GRU -> pairwise scorer -> softmax cross-entropy with L2 penalty.
# NOTE(review): several calls below are truncated in this paste (unclosed
# parentheses, a missing `else:`); comments flag each one.

# hyper parameters
batch_size = 8
word_dimension = 300
pos_dimension = 40
gru_hidden_dimension = 200
score_hidden_dimension = 2 * gru_hidden_dimension

learning_rate = 0.001
# Coefficient of the L2 penalty added to the loss at the end of the script.
beta = 1e-4

# initialize variables
train_set = Reader(train_path)
valid_set = Reader(valid_path)
test_set = Reader(test_path)

with tf.device("/gpu:0"):
    if is_use_embedding:
        generator = EmbeddingGenerator(train_path, valid_path, test_path)
        word_dict = generator.get_word_dict()
        _, pos_dict, arc_dict = train_set.get_dicts()
        word_embedding_generated = generator.get_word_embeddings(embedding_path, word_dimension)
        # NOTE(review): truncated call; an `else:` line also appears to be
        # missing before the next statement, which reassigns word_dict.
        word_embedding = tf.Variable(name='word_embedding',
        word_dict, pos_dict, arc_dict = train_set.get_dicts()
        # Random-initialised word embeddings, uniform in [-0.1, 0.1].
        word_embedding = tf.Variable(name='word_embedding',
                                     initial_value=tf.random_uniform([len(word_dict), word_dimension], -0.1, 0.1),

    # POS-tag embeddings, uniform in [-0.1, 0.1]. NOTE(review): call truncated.
    pos_embedding = tf.Variable(name='pos_embedding',
                                initial_value=tf.random_uniform([len(pos_dict), pos_dimension], -0.1, 0.1),

    # placeholders
    batch_max_length = tf.placeholder(name='batch_max_length', dtype=tf.int32, shape=[])
    length = tf.placeholder(name='length', dtype=tf.int32, shape=[batch_size])

    word_indices = tf.placeholder(name='words_indices', shape=[batch_size, None], dtype=tf.int32)
    pos_indices = tf.placeholder(name='pos_indices', shape=[batch_size, None], dtype=tf.int32)

    # Per-sentence grid of index pairs used to gather pairs of encoder outputs.
    index_matrix = tf.placeholder(name='index_matrix', shape=[batch_size, None, None, 2], dtype=tf.int32)
    # Gold targets with the same grid layout as the score matrix; multiplied
    # element-wise with log-probabilities in the loss below.
    gold_score_matrix = tf.placeholder(name='gold_score_matrix', shape=[batch_size, None, None], dtype=tf.float32)

    # build the network
    words = tf.nn.embedding_lookup(name='words', params=word_embedding, ids=word_indices)
    pos = tf.nn.embedding_lookup(name='pos', params=pos_embedding, ids=pos_indices)

    # Concatenate word and POS embeddings along the feature axis.
    x = tf.concat(name='words_c_pos', values=[words, pos], axis=2)

    gru_fw = rnn.GRUCell(num_units=gru_hidden_dimension)
    gru_bw = rnn.GRUCell(num_units=gru_hidden_dimension)

    init_fw = gru_fw.zero_state(batch_size=batch_size, dtype=tf.float32)
    init_bw = gru_bw.zero_state(batch_size=batch_size, dtype=tf.float32)

    with tf.variable_scope('Bi-GRU'):
        # NOTE(review): call truncated; remaining arguments are not visible.
        outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw,

    # a bit different from Zhang et al. 2016
    # Join forward and backward outputs along the feature axis.
    c = tf.concat(outputs, 2)

    # Loss accumulator. NOTE(review): created via tf.get_variable, so it is a
    # graph variable rather than a constant; check whether it ends up in
    # tf.trainable_variables() and therefore in the L2 term below.
    loss = tf.get_variable(name='loss', dtype=tf.float32, shape=[])

    # First scorer projection: pair representation (4h) -> hidden (2h).
    u = tf.get_variable(name='u',
                        shape=[4 * gru_hidden_dimension, 2 * gru_hidden_dimension],
                        initializer=tf.random_uniform_initializer(-0.1, 0.1))

    # Second scorer projection: hidden (2h) -> scalar score.
    v = tf.get_variable(name='v',
                        shape=[2 * gru_hidden_dimension, 1],
                        initializer=tf.random_uniform_initializer(-0.1, 0.1))

    g_s = []

    # The graph is unrolled once per sentence in the fixed-size batch.
    for i in range(batch_size):

        actual_length = length[i]
        sentence_rep = c[i]
        actual_gold_score = gold_score_matrix[i]

        # Gather the pairs of encoder outputs named by index_matrix[i] and
        # flatten each pair into one 4h-wide vector. NOTE(review): call truncated.
        rep_matrix = tf.reshape(tensor=tf.nn.embedding_lookup(params=sentence_rep, ids=index_matrix[i]),
                                shape=[batch_max_length, batch_max_length, 4 * gru_hidden_dimension],

        # Hidden layer with tanh non-linearity, applied to every pair.
        h_score = tf.nn.tanh(tf.einsum('ijk,kl->ijl', rep_matrix, u))

        # Scalar score per pair; the trailing singleton axis is squeezed away.
        o_score = tf.squeeze(tf.einsum('ijk,kl->ijl', h_score, v), 2)

        # Trim padding, then drop the first row of both scores and gold
        # targets -- presumably the artificial root token; TODO confirm.
        o_cut = o_score[:actual_length, :actual_length]
        o_rm = o_cut[1:, :]
        gs_cut = actual_gold_score[:actual_length, :actual_length]
        gs_rm = gs_cut[1:, :]

        # Row-wise softmax (last axis), clipped so the log below never sees 0.
        g = tf.clip_by_value(tf.nn.softmax(o_rm), 1e-10, 1e10)

        # Cross-entropy against the gold matrix, summed over the sentence.
        loss += -tf.reduce_sum(gs_rm * tf.log(g))

    # L2 penalty over all trainable variables, scaled by beta.
    loss += tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()]) * beta

    opt = tf.train.AdamOptimizer(learning_rate).minimize(loss)

Although some hyper-parameters differ in the code shown above, they are set to the same values when I actually run the code (e.g., via Python’s argparse).

At first glance, it looks like the PyTorch code is using 64-bit integer tensors while the TensorFlow code is using 32-bit float tensors.

I gave up on trying to improve that. I double-checked almost everything, and I still get the same suboptimal results compared to TensorFlow. I really think that PyTorch should have more tests on the convergence of different networks, because it seems that something very subtle is happening, and it is almost impossible to find the issue, even on very simple networks.

1 Like

I’ve had a different problem: training with PyTorch on the GPU gave suboptimal results compared to training the same thing with PyTorch on the CPU. The cause was the cuDNN library used on the GPU, so setting torch.backends.cudnn.enabled = False solved the problem for me. Maybe it solves your problem too?

1 Like

Hello, did you find a resolution for the worse results on PyTorch? I have run into the same problem. Thanks for your reply.