Keras model converges better than the PyTorch one, maybe BatchNorm is abnormal

ptrblck, thank you for your advice!
I updated my code according to your advice, but the result is almost the same: the best MSE is 0.12xx. You can try it.
Initializing the BatchNorm weights with nn.init.normal_ comes from https://discuss.pytorch.org/t/keras-gives-better-performance-in-both-training-speed-and-result-generalization-than-pytorch-in-simple-mlp-help/102904. It really works.
If I don't set that particular random seed and initialize the linear weights with nn.init.normal_, the best MSE is only 0.13xx~0.14xx.
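Here is the full updated script: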

import numpy as np
import pandas as pd

import torch
from torch import optim, nn
from torch.utils.data import DataLoader, TensorDataset, random_split

device = 'cuda' if torch.cuda.is_available() else 'cpu'


with_bn = True

# hyperparameters
lr = 1.e-3        # learning rate
n_epochs = 500
bs = 32           # batch size
valid_pct = 0.15  # fraction of the data held out for validation
lam = 2.e-3       # L1 penalty coefficient


def load_data_t(filename: str, x_cols: list, y_col: str = None, shuffle=True):
    d = pd.read_csv(filename, sep='\t')
    x = get_data_t(d, x_cols, shuffle)
    if y_col is not None:
        # Pass the same shuffle flag so x and y stay aligned
        return x, get_data_t(d, y_col, shuffle)
    return x

def get_data_t(df, cols, shuffle=True):
    x = np.array(df[cols].values)
    if shuffle:
        # Reseeding with the same value on every call makes the x and y
        # shuffles use the same permutation, so the pairs stay aligned.
        np.random.seed(202)
        np.random.shuffle(x)
    return torch.from_numpy(x).float()


def l1_penalty(model, l1_lambda=lam) -> torch.Tensor:
    """Returns the L1 penalty over all params (incl. biases and BatchNorm affine params)."""
    l1_norm = torch.stack([p.abs().sum() for p in model.parameters()]).sum()
    return l1_lambda * l1_norm

def l2_penalty(model, l2_lambda=1.5e-2) -> torch.Tensor:
    ss = [p.pow(2.0).sum() for p in model.parameters()]
    l2_norm = torch.stack(ss).sum()
    return l2_lambda*l2_norm
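# (Note: l2_penalty is defined but unused in this run; only l1_penalty is passed to fit below.)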


def make_train_step(model, optimizer, loss_fn, penalty_fn=None):
    # Builds the function that performs one step of the train loop
    def train_step(x, y):
        optimizer.zero_grad()
        # Makes predictions
        yhat = model(x)
        # Computes loss (nn.MSELoss expects (input, target))
        loss = loss_fn(yhat, y)
        penalty = penalty_fn(model) if penalty_fn else 0
        # Computes gradients on the regularized loss
        (loss + penalty).backward()
        # Updates parameters
        optimizer.step()
        # Returns the unregularized loss and the penalty term as plain floats
        return loss.item(), float(penalty)

    # Returns the function that will be called inside the train loop
    return train_step

def fit(dl: DataLoader, model, loss_fn, penalty_fn=None, optimizer=None):
    losses = []
    penalty = 0.0
    # Sets model to TRAIN mode
    model.train()
    train_step = make_train_step(model, optimizer, loss_fn, penalty_fn)
    for x_batch, y_batch in dl:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss, penalty = train_step(x_batch, y_batch)
        losses.append(loss)
    if len(losses):
        # Mean loss over the epoch; the penalty reported is from the last batch
        return sum(losses) / len(losses), penalty
    return 0, penalty

def validate(val_loader, model, loss_fn):
    val_losses = []
    # Sets model to EVAL mode (BatchNorm uses running stats, Dropout is off)
    model.eval()
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)
            yhat = model(x_val)
            val_losses.append(loss_fn(yhat, y_val).item())
    if len(val_losses):
        return sum(val_losses) / len(val_losses)
    return 0

def predict(val_loader, model):
    ret = None
    model.eval()
    with torch.no_grad():
        for [x_val] in val_loader:
            x_val = x_val.to(device)
            yhat = model(x_val)
            ret = torch.cat((ret, yhat), 0) if ret is not None else yhat
    return ret

class StreamRegression(nn.Module):
    def __init__(self,n_feature,n_hidden1,n_hidden2,n_hidden3):

        super(StreamRegression, self).__init__()
        self.layer = nn.Sequential()
        self.layer.add_module('l1', nn.Linear(n_feature, n_hidden1) )
        self.layer.add_module('relu 1', nn.ReLU() )
        if with_bn: 
            self.layer.add_module('bn 1', nn.BatchNorm1d(n_hidden1) )
        self.layer.add_module( 'd1', nn.Dropout() )
        self.layer.add_module( 'l2', nn.Linear(n_hidden1, n_hidden2) )
        self.layer.add_module( 'relu 2', nn.ReLU() )
        if with_bn: 
            self.layer.add_module( 'bn 2', nn.BatchNorm1d(n_hidden2) )
        self.layer.add_module( 'd2', nn.Dropout() )
        self.layer.add_module( 'l3', nn.Linear(n_hidden2, n_hidden3) )
        self.layer.add_module( 'relu 3', nn.ReLU() )
        if with_bn: 
            self.layer.add_module( 'bn 3', nn.BatchNorm1d(n_hidden3) )
        self.layer.add_module( 'd3', nn.Dropout() )
        self.layer.add_module( 'l4', nn.Linear(n_hidden3, 1) )
              
        if with_bn:
            s = 6  # magic number
            torch.manual_seed(s)

            # Note: this re-initializes every "weight" in the stack, i.e. the
            # Linear weight matrices and the BatchNorm scales (gamma) alike.
            for name, param in self.layer.named_parameters():
                if "weight" in name:
                    nn.init.normal_(param, std=0.05)
                else:  # biases and BatchNorm shifts (beta)
                    nn.init.zeros_(param)


    def forward(self,x):
        x = self.layer(x)
        return x


x_cols = ['V0', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13',
        'V15', 'V16', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25',
       'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V35', 'V36', 'V37']

X_train, Y_train = load_data_t("steamprediction/dataset/zhengqi_train.txt",x_cols,"target")

train_ds = TensorDataset(X_train, Y_train.reshape(-1, 1))
train_l = len(train_ds)
valid_l = round(train_l * valid_pct)
fit_l = train_l - valid_l

# fit_ds, val_ds = random_split(train_ds, [fit_l, valid_l])
fit_ds = torch.utils.data.Subset(train_ds, range(0, fit_l))
val_ds = torch.utils.data.Subset(train_ds, range(fit_l, train_l))

fit_loader = DataLoader(dataset=fit_ds, batch_size=bs, shuffle=False)
val_loader = DataLoader(dataset=val_ds, batch_size=bs, shuffle=False)

model = StreamRegression(29, 128, 64, 16).to(device)  # move the model to the same device as the batches


print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())


# Defines a MSE loss function
loss_fn = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=lr)
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

# For each epoch...

print('{:<7}{:>10}{:>10}{:>10}'.format('No','train_loss','penalty','val_loss'))

best_loss = float('INF')

for epoch in range(n_epochs):
    # Performs one train step and returns the corresponding loss
    train_loss, penalty = fit(fit_loader,model, loss_fn, l1_penalty,optimizer)

    val_loss = validate(val_loader,model,loss_fn)

    print(f'{epoch:<7}{train_loss:>10.3f}{penalty:>10.3f}{val_loss:>10.3f}')

    if best_loss > val_loss :
        best_loss = val_loss
        torch.save(model,'torch_model.pkl')
        print(f"validate-loss declined to {best_loss}, save model!") 

print(f"The final validate-loss is  {best_loss}") 


X_test = load_data_t("steamprediction/dataset/zhengqi_test.txt",x_cols,shuffle=False)
test_ds = TensorDataset(X_test)
test_loader = DataLoader(dataset=test_ds, batch_size= bs, shuffle=False)

model = torch.load('torch_model.pkl')
result = predict(test_loader, model)
# .cpu() so the tensor can be converted to numpy even when trained on GPU
result_df = pd.DataFrame(result.cpu().numpy())
result_df.to_csv("pytorch_result.txt", sep='\t', index=False, header=None)