Big gap between Keras result and PyTorch result

I am reproducing the Keras code from https://colab.research.google.com/github/yaringal/multi-task-learning-example/blob/master/multi-task-learning-example.ipynb

The model takes an input X and produces two outputs, Y1 and Y2. It uses an L2 loss and Adam for optimization. The weights log_var are learnable, and after training the recovered standard deviations should be near 10 and 1. However, my code only gives about 8 and 3. What is happening in my code? It looks like log_var does not update well.
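For context, my understanding of the loss in that notebook (the uncertainty-weighted multi-task loss) is, written with my own shorthand names:

    loss = exp(-log_var1) * mse(y1_pred, y1_true) + log_var1
         + exp(-log_var2) * mse(y2_pred, y2_true) + log_var2

and the 10 and 1 I mention are the recovered standard deviations, i.e. exp(log_var) ** 0.5 after training, which should match the noise levels used to generate the data.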

This is my code.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import itertools

def gen_data(N):
    X = np.random.randn(N, Q)
    w1 = 2.
    b1 = 8.
    sigma1 = 1e1  # ground truth
    Y1 = X.dot(w1) + b1 + sigma1 * np.random.randn(N, D1)
    w2 = 3
    b2 = 3.
    sigma2 = 1e0  # ground truth
    Y2 = X.dot(w2) + b2 + sigma2 * np.random.randn(N, D2)
    return X, Y1, Y2
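# Note: Y1 is generated with noise of std sigma1 = 10 and Y2 with std sigma2 = 1,
# so these are the standard deviations the learned log_vars should recover.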

class Net(nn.Module):
    def __init__(self, nf_in, nf_in_out1, nf_out1, nf_out2):
        super(Net, self).__init__()
        self.nf_out1 = nf_out1
        self.nf_out2 = nf_out2
        self.FC0 = nn.Linear(nf_in, nf_in_out1)
        self.FC1 = nn.Linear(nf_in_out1, nf_out1)
        self.FC2 = nn.Linear(nf_in_out1, nf_out2)

    def forward(self, input):
        input = input.view(input.size(0), -1)
        input = F.relu(self.FC0(input))
        y_pred1 = self.FC1(input)
        y_pred2 = self.FC2(input)
        return y_pred1, y_pred2
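# (One shared ReLU hidden layer of nf_in_out1 units, then one linear head per task;
#  as far as I can tell this mirrors the architecture in the notebook.)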

# Custom loss layer
class CustomMultiLossLayer(nn.Module):
    def __init__(self, nb_outputs=2):      
        super(CustomMultiLossLayer, self).__init__()
        self.nb_outputs = nb_outputs
        #self.log_vars = nn.Parameter(torch.zeros(nb_outputs))
        self.log_vars1 = torch.nn.Parameter(torch.FloatTensor([0]))
        self.log_vars2 = torch.nn.Parameter(torch.FloatTensor([0]))
        self.mse = nn.MSELoss()
    def forward(self, ys_true1, ys_pred1, ys_true2, ys_pred2):
        loss = (torch.exp(-self.log_vars1) * self.mse(ys_pred1, ys_true1) + self.log_vars1
                + torch.exp(-self.log_vars2) * self.mse(ys_pred2, ys_true2) + self.log_vars2)
        print(torch.exp((self.log_vars1.data) ** 0.5), torch.exp((self.log_vars2.data) ** 0.5),
              loss.item(), self.log_vars2.item())
        return loss


N = 100
nb_epoch = 2000
batch_size = 20
nb_features = 1024
Q = 1
D1 = 1  # first output
D2 = 1  # second output
X, Y1, Y2 = gen_data(N)

net = Net(Q, nb_features, D1, D2)
multi_loss = CustomMultiLossLayer()
optimizer = optim.Adam(itertools.chain(net.parameters(), multi_loss.parameters()))
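# Adam is left at its default learning rate (1e-3), which I believe matches the
# default used by optimizer='adam' in the Keras notebook.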


for epoch in range(nb_epoch):
    XY = list(zip(X,Y1,Y2))
    np.random.shuffle(XY)
    X,Y1,Y2 = zip(*XY)
    X = np.array(X)
    Y1 = np.array(Y1)
    Y2 = np.array(Y2)
    for i in range(5):
        #index =  np.random.choice(N, size=batch_size) 
        input = X[i*batch_size:batch_size*(i+1)]
        y1 = Y1[i*batch_size:batch_size*(i+1)]
        y2 = Y2[i*batch_size:batch_size*(i+1)]
        input = torch.from_numpy(input).float()
        y1 = torch.from_numpy(y1).float()
        y2 = torch.from_numpy(y2).float()
        y_pred1, y_pred2 = net(input)
        optimizer.zero_grad()
        loss = multi_loss(y1, y_pred1, y2, y_pred2)
        loss.backward()
        optimizer.step()

This is the Google Colab link: https://colab.research.google.com/drive/1_zsmQguerz0iy0J9Uu2Cs7oEHhj0QoXH
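For reference, after training I would read the learned values back like this (assuming the same exp(log_var) ** 0.5 convention for the standard deviation as the notebook; multi_loss is the loss layer from my script above):

    std1 = torch.exp(multi_loss.log_vars1).item() ** 0.5
    std2 = torch.exp(multi_loss.log_vars2).item() ** 0.5
    print(std1, std2)  # expected to end up close to 10 and 1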