Keras model converges better than PyTorch one, maybe BatchNorm behaves abnormally

I migrated a script from Keras to PyTorch, but I found the PyTorch version can't converge as well as the original. The PyTorch script gets a best MSE of 0.1250, while the Keras script gets a best MSE of 0.1066.
By the way, when I remove BatchNorm from both scripts (with_bn = False), they get almost the same best MSE of 0.14xx. So I suspect something is wrong with how I use the BatchNorm1d module in PyTorch.
Can you help me?

PyTorch script

import numpy as np
import gc
import pandas as pd
from pandas import Series,DataFrame
import os,glob

import torch
from torch.utils.data.dataset import TensorDataset
from torch  import optim,nn

from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

import functools

device = 'cuda' if torch.cuda.is_available() else 'cpu'


with_bn = True

# hyper parameter
lr = 1.e-3
n_epochs = 500
bs = 32
valid_pct = 0.15
lam = 2.e-3


def load_data_t(filename:str, x_cols:list, y_col:str = None,shuffle=True):
    d = pd.read_csv(filename,sep='\t')
    x = get_data_t(d, x_cols,shuffle) 
    if y_col is not None: 
        return x, get_data_t(d,y_col)
    else :
        return x

def get_data_t(df,cols,shuffle=True):
    x = df[cols] 
    x = np.array(x.values)
    if shuffle :
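        # reset the seed on every call so features and targets are shuffled in the same order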
        np.random.seed(202)
        np.random.shuffle(x) 
    x = torch.from_numpy(x)
    return x.float()


def l1_penalty(model, l1_lambda=lam) -> torch.Tensor:
    """Returns the L1 penalty of the params."""
    # note: this sums over every parameter (biases and BatchNorm affine params
    # included), while the Keras script's kernel_regularizer only penalizes
    # the Dense kernels
    ss = [p.abs().sum() for p in model.parameters()]
    l1_norm = torch.stack(ss).sum()
    return l1_lambda*l1_norm

def l2_penalty(model, l2_lambda=1.5e-2) -> torch.Tensor:
    ss = [p.pow(2.0).sum() for p in model.parameters()]
    l2_norm = torch.stack(ss).sum()
    return l2_lambda*l2_norm


def make_train_step(model, optimizer,loss_fn, penalty_fn = None):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        optimizer.zero_grad()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        penalty = penalty_fn(model) if penalty_fn else 0
        # Computes gradients
        total_loss = loss+penalty
        total_loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        # Returns the loss
        return loss.item(),penalty
    
    # Returns the function that will be called inside the train loop
    return train_step

def fit(dl:DataLoader,model, loss_fn, penalty_fn= None, optimizer=None):
    losses = []
    penalty = 0  # keep penalty defined even if the loader is empty
    # Sets model to TRAIN mode
    model.train()
    train_step = make_train_step(model,  optimizer, loss_fn, penalty_fn)
    optimizer.zero_grad()
    for x_batch, y_batch in dl:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss,penalty = train_step(x_batch, y_batch)
        losses.append(loss)
    if len(losses) :
        return sum(losses)/len(losses),penalty
    else:
        return 0, penalty

def validate(val_loader,model,loss_fn):
    val_losses = []
    model.eval()
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)

            yhat = model(x_val)
            val_loss = loss_fn(y_val, yhat)
            val_losses.append(val_loss.item())
    # average the per-batch validation losses
    return sum(val_losses)/len(val_losses) if val_losses else 0

def predict(val_loader,model):
    ret = None
    model.eval()
    with torch.no_grad():
        for [x_val] in val_loader:
            x_val = x_val.to(device)

            yhat = model(x_val)
            ret = torch.cat((ret,yhat),0) if ret is not None else yhat
    return ret

class StreamRegression(nn.Module):
    def __init__(self,n_feature,n_hidden1,n_hidden2,n_hidden3):

        super(StreamRegression, self).__init__()
        self.layer = nn.Sequential()
        self.layer.add_module('l1', nn.Linear(n_feature, n_hidden1) )
        self.layer.add_module('relu 1', nn.ReLU() )
        if with_bn: 
            self.layer.add_module('bn 1', nn.BatchNorm1d(n_hidden1) )
        self.layer.add_module( 'd1', nn.Dropout() )  # p=0.5 by default, matching Keras' Dropout(0.5)
        self.layer.add_module( 'l2', nn.Linear(n_hidden1, n_hidden2) )
        self.layer.add_module( 'relu 2', nn.ReLU() )
        if with_bn: 
            self.layer.add_module( 'bn 2', nn.BatchNorm1d(n_hidden2) )
        self.layer.add_module( 'd2', nn.Dropout() )
        self.layer.add_module( 'l3', nn.Linear(n_hidden2, n_hidden3) )
        self.layer.add_module( 'relu 3', nn.ReLU() )
        if with_bn: 
            self.layer.add_module( 'bn 3', nn.BatchNorm1d(n_hidden3) )
        self.layer.add_module( 'd3', nn.Dropout() )
        self.layer.add_module( 'l4', nn.Linear(n_hidden3, 1) )
              
        if with_bn: 
            s = 6 # magic number
            np.random.seed(s)
            torch.manual_seed(s)

            for name, param in self.layer.named_parameters():
                if "weight" in name:  # weight matrix
                    nn.init.normal_(param, std=0.05) 
                else:  # bias
                    nn.init.zeros_(param)


    def forward(self,x):
        x = self.layer(x)
        return x


x_cols = ['V0', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13',
        'V15', 'V16', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25',
       'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V35', 'V36', 'V37']

X_train, Y_train = load_data_t("steamprediction/dataset/zhengqi_train.txt",x_cols,"target")

train_ds = TensorDataset(X_train,Y_train.reshape(-1,1))
train_l = len(train_ds)
fit_l = train_l - round(train_l * valid_pct) # train on the first 85%, validate on the last 15%, matching Keras' validation_split=0.15
valid_l = train_l - fit_l

# fit_ds, val_ds = random_split(train_ds, [fit_l, valid_l])
fit_ds = torch.utils.data.Subset(train_ds,range(0,fit_l))
val_ds = torch.utils.data.Subset(train_ds,range(fit_l,train_l))

fit_loader = DataLoader(dataset=fit_ds, batch_size=bs, shuffle=False)
val_loader = DataLoader(dataset=val_ds, batch_size=bs, shuffle=False)

model = StreamRegression(29,128,64,16).to(device)


print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())


# Defines a MSE loss function
loss_fn = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=lr)
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

# For each epoch...

print('{:<7}{:>10}{:>10}{:>10}'.format('No','train_loss','penalty','val_loss'))

best_loss = float('INF')

for epoch in range(n_epochs):
    # Performs one train step and returns the corresponding loss
    train_loss, penalty = fit(fit_loader,model, loss_fn, l1_penalty,optimizer)

    val_loss = validate(val_loader,model,loss_fn)

    print(f'{epoch:<7}{train_loss:>10.3f}{penalty:>10.3f}{val_loss:>10.3f}')

    if best_loss > val_loss :
        best_loss = val_loss
        torch.save(model,'torch_model.pkl')
        print(f"validate-loss declined to {best_loss}, save model!") 

print(f"The final validate-loss is  {best_loss}") 


X_test = load_data_t("steamprediction/dataset/zhengqi_test.txt",x_cols,shuffle=False)
test_ds = TensorDataset(X_test)
test_loader = DataLoader(dataset=test_ds, batch_size= bs, shuffle=False)

model = torch.load('torch_model.pkl')
result = predict(test_loader,model)
result_df = pd.DataFrame(result.cpu().numpy())  # move predictions back to the CPU before converting
result_df.to_csv("pytorch_result.txt", sep='\t',index=False, header=None)

Keras script

import numpy as np
import gc
import pandas as pd
from pandas import Series,DataFrame
import os,glob
from keras import optimizers
import keras
from keras.models import Sequential
from keras.layers import Dense,BatchNormalization,Dropout
from keras.optimizers import SGD,Adam
from keras import regularizers
from keras.layers.advanced_activations import LeakyReLU

from keras.callbacks import ModelCheckpoint,Callback

with_bn = True


def to_array(data):
    x = [np.array(data.loc[i]) for i in range(len(data))]
    return np.array(x)

# gamm = 3.e-2
# reg = regularizers.l2(gamm)

lam = 2.e-3
reg = regularizers.l1(lam)

def get_lr_metric(optimizer):
    def lr(y_true, y_pred):
        return optimizer.lr
    return lr
optimizer = Adam() # effectively lr = 0.001 (Adam's default)
lr_metric = get_lr_metric(optimizer)


model = Sequential()
model.add(Dense(input_shape=(29,),units=128, activation='relu', kernel_regularizer=reg, name = 'Dense1'))
if with_bn: 
    model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(units=64, activation='relu', kernel_regularizer=reg, name = 'Dense4'))
if with_bn: 
    model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(units=16, activation='relu', kernel_regularizer=reg, name = 'Dense5'))
if with_bn: 
    model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(units=1, activation='linear', kernel_regularizer=reg, name = 'Dense6'))
# sgd = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06)
model.compile(loss='MSE', optimizer=optimizer, metrics=['MSE',lr_metric])
model.summary()

pd_data = pd.read_csv("steamprediction/dataset/zhengqi_train.txt", sep='\t')
Y_train_set = pd_data['target']
# del pd_data['target']
pd_data = pd_data[['V0', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13',
        'V15', 'V16', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25',
       'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V35', 'V36', 'V37']]
X_train = to_array(pd_data)
Y_train = to_array(Y_train_set)
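# reset the seed between the two shuffles so X_train and Y_train stay aligned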
np.random.seed(202)
np.random.shuffle(X_train) 
np.random.seed(202)
np.random.shuffle(Y_train) 



callbacks_list = [
    ModelCheckpoint(
        filepath='my_model.h5',
        monitor="val_loss",
        save_best_only=True,
        verbose=1,
    )
]

history = model.fit(X_train, Y_train, epochs=500, batch_size=32, verbose=2, validation_split=0.15, shuffle=False, callbacks=callbacks_list)


import pandas as pd
import keras
from keras.models import Sequential,Model

from keras.models import load_model

pd_test = pd.read_csv("steamprediction/dataset/zhengqi_test.txt", sep='\t')
pd_test = pd_test[['V0', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13',
        'V15', 'V16', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25',
       'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V35', 'V36', 'V37']]

print(pd_test)
# stock_model = load_model("my_model.h5",custom_objects=[lr_metric])
stock_model = load_model("my_model.h5",custom_objects={lr_metric.__name__:lr_metric})
stock = stock_model.predict_on_batch(pd_test)
stock_re = DataFrame(stock)
stock_re.to_csv("keras_result.txt", sep='\t',index=False, header=None)
print(stock)

You can get the training data and test data from https://drive.google.com/drive/folders/1O88OKsVIqYMAuDgePnIYBXf0r2TF3MTh?usp=sharing

In your PyTorch model you are adding the nn.BatchNorm1d layers as b1, b2, b3.
Afterwards it seems you would like to keep the original parameter initialization for these layers and try to filter them out via if "bn" in name. However, since the name is wrong, the batchnorm weights will also be initialized using the nn.init.normal_ operation, which could explain the worse convergence.

I would recommend using bn1, bn2, etc. as the new names instead of checking if "b" in name, since the latter would also match the bias parameters in other layers.
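A minimal sketch of that filtered initialization, assuming the BatchNorm layers are registered under names containing "bn" (adapt the check to your actual module names):

# hypothetical sketch: skip the BatchNorm parameters during the custom init,
# so they keep PyTorch's defaults (weight=1, bias=0)
for name, param in self.layer.named_parameters():
    if "bn" in name:
        continue
    if "weight" in name:  # Linear weight matrices
        nn.init.normal_(param, std=0.05)
    else:                 # Linear biases
        nn.init.zeros_(param)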

ptrblck, thank you for your advice!
I updated my code according to your advice, but the result is almost the same: the best MSE is 0.12xx. You can try it.
Initializing the batchnorm weights with nn.init.normal_ comes from https://discuss.pytorch.org/t/keras-gives-better-performance-in-both-training-speed-and-result-generalization-than-pytorch-in-simple-mlp-help/102904 and it really works.
If I don't set the special random seed and initialize the linear weights with nn.init.normal_, the best MSE is 0.13xx~0.14xx.

import numpy as np
import gc
import pandas as pd
from pandas import Series,DataFrame
import os,glob

import torch
from torch.utils.data.dataset import TensorDataset
from torch  import optim,nn

from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

import functools

device = 'cuda' if torch.cuda.is_available() else 'cpu'


with_bn = True

# hyper parameter
lr = 1.e-3
n_epochs = 500
bs = 32
valid_pct = 0.15
lam = 2.e-3


def load_data_t(filename:str, x_cols:list, y_col:str = None,shuffle=True):
    d = pd.read_csv(filename,sep='\t')
    x = get_data_t(d, x_cols,shuffle) 
    if y_col is not None: 
        return x, get_data_t(d,y_col)
    else :
        return x

def get_data_t(df,cols,shuffle=True):
    x = df[cols] 
    x = np.array(x.values)
    if shuffle :
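        # reset the seed on every call so features and targets are shuffled in the same order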
        np.random.seed(202)
        np.random.shuffle(x) 
    x = torch.from_numpy(x)
    return x.float()


def l1_penalty(model, l1_lambda=lam) -> torch.Tensor:
    """Returns the L1 penalty of the params."""
    # note: this sums over every parameter (biases and BatchNorm affine params
    # included), while the Keras script's kernel_regularizer only penalizes
    # the Dense kernels
    ss = [p.abs().sum() for p in model.parameters()]
    l1_norm = torch.stack(ss).sum()
    return l1_lambda*l1_norm

def l2_penalty(model, l2_lambda=1.5e-2) -> torch.Tensor:
    ss = [p.pow(2.0).sum() for p in model.parameters()]
    l2_norm = torch.stack(ss).sum()
    return l2_lambda*l2_norm


def make_train_step(model, optimizer,loss_fn, penalty_fn = None):
    # Builds function that performs a step in the train loop
    def train_step(x, y):
        optimizer.zero_grad()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        penalty = penalty_fn(model) if penalty_fn else 0
        # Computes gradients
        total_loss = loss+penalty
        total_loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        # Returns the loss
        return loss.item(),penalty
    
    # Returns the function that will be called inside the train loop
    return train_step

def fit(dl:DataLoader,model, loss_fn, penalty_fn= None, optimizer=None):
    losses = []
    penalty = 0  # keep penalty defined even if the loader is empty
    # Sets model to TRAIN mode
    model.train()
    train_step = make_train_step(model,  optimizer, loss_fn, penalty_fn)
    optimizer.zero_grad()
    for x_batch, y_batch in dl:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss,penalty = train_step(x_batch, y_batch)
        losses.append(loss)
    if len(losses) :
        return sum(losses)/len(losses),penalty
    else:
        return 0, penalty

def validate(val_loader,model,loss_fn):
    val_losses = []
    model.eval()
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)

            yhat = model(x_val)
            val_loss = loss_fn(y_val, yhat)
            val_losses.append(val_loss.item())
    # average the per-batch validation losses
    return sum(val_losses)/len(val_losses) if val_losses else 0

def predict(val_loader,model):
    ret = None
    model.eval()
    with torch.no_grad():
        for [x_val] in val_loader:
            x_val = x_val.to(device)

            yhat = model(x_val)
            ret = torch.cat((ret,yhat),0) if ret is not None else yhat
    return ret

class StreamRegression(nn.Module):
    def __init__(self,n_feature,n_hidden1,n_hidden2,n_hidden3):

        super(StreamRegression, self).__init__()
        self.layer = nn.Sequential()
        self.layer.add_module('l1', nn.Linear(n_feature, n_hidden1) )
        self.layer.add_module('relu 1', nn.ReLU() )
        if with_bn: 
            self.layer.add_module('bn 1', nn.BatchNorm1d(n_hidden1) )
        self.layer.add_module( 'd1', nn.Dropout() )  # p=0.5 by default, matching Keras' Dropout(0.5)
        self.layer.add_module( 'l2', nn.Linear(n_hidden1, n_hidden2) )
        self.layer.add_module( 'relu 2', nn.ReLU() )
        if with_bn: 
            self.layer.add_module( 'bn 2', nn.BatchNorm1d(n_hidden2) )
        self.layer.add_module( 'd2', nn.Dropout() )
        self.layer.add_module( 'l3', nn.Linear(n_hidden2, n_hidden3) )
        self.layer.add_module( 'relu 3', nn.ReLU() )
        if with_bn: 
            self.layer.add_module( 'bn 3', nn.BatchNorm1d(n_hidden3) )
        self.layer.add_module( 'd3', nn.Dropout() )
        self.layer.add_module( 'l4', nn.Linear(n_hidden3, 1) )
              
        if with_bn: 
            s = 6 # magic number
            torch.manual_seed(s)

            for name, param in self.layer.named_parameters():
                if "weight" in name:  # weight matrix
                    nn.init.normal_(param, std=0.05) 
                else:  # bias
                    nn.init.zeros_(param)


    def forward(self,x):
        x = self.layer(x)
        return x


x_cols = ['V0', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V10', 'V12', 'V13',
        'V15', 'V16', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25',
       'V26', 'V29', 'V30', 'V31', 'V32', 'V33', 'V35', 'V36', 'V37']

X_train, Y_train = load_data_t("steamprediction/dataset/zhengqi_train.txt",x_cols,"target")

train_ds = TensorDataset(X_train,Y_train.reshape(-1,1))
train_l = len(train_ds)
fit_l = train_l - round(train_l * valid_pct) # train on the first 85%, validate on the last 15%, matching Keras' validation_split=0.15
valid_l = train_l - fit_l

# fit_ds, val_ds = random_split(train_ds, [fit_l, valid_l])
fit_ds = torch.utils.data.Subset(train_ds,range(0,fit_l))
val_ds = torch.utils.data.Subset(train_ds,range(fit_l,train_l))

fit_loader = DataLoader(dataset=fit_ds, batch_size=bs, shuffle=False)
val_loader = DataLoader(dataset=val_ds, batch_size=bs, shuffle=False)

model = StreamRegression(29,128,64,16).to(device)


print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())


# Defines a MSE loss function
loss_fn = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=lr)
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

# For each epoch...

print('{:<7}{:>10}{:>10}{:>10}'.format('No','train_loss','penalty','val_loss'))

best_loss = float('INF')

for epoch in range(n_epochs):
    # Performs one train step and returns the corresponding loss
    train_loss, penalty = fit(fit_loader,model, loss_fn, l1_penalty,optimizer)

    val_loss = validate(val_loader,model,loss_fn)

    print(f'{epoch:<7}{train_loss:>10.3f}{penalty:>10.3f}{val_loss:>10.3f}')

    if best_loss > val_loss :
        best_loss = val_loss
        torch.save(model,'torch_model.pkl')
        print(f"validate-loss declined to {best_loss}, save model!") 

print(f"The final validate-loss is  {best_loss}") 


X_test = load_data_t("steamprediction/dataset/zhengqi_test.txt",x_cols,shuffle=False)
test_ds = TensorDataset(X_test)
test_loader = DataLoader(dataset=test_ds, batch_size= bs, shuffle=False)

model = torch.load('torch_model.pkl')
result = predict(test_loader,model)
result_df = pd.DataFrame(result.cpu().numpy())  # move predictions back to the CPU before converting
result_df.to_csv("pytorch_result.txt", sep='\t',index=False, header=None)

@macgaf Have you tried checking the default values? For example, Keras' BatchNormalization momentum defaults to 0.99 (ref), while PyTorch's defaults to 0.1 (ref) - and the two frameworks define momentum in opposite directions, so you should set PyTorch's momentum to 0.01 in order to achieve close results. I would recommend checking the other default parameters as well.
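A minimal sketch of that change against the model above (PyTorch updates the running stats as (1 - momentum) * running + momentum * batch, so Keras' momentum=0.99 corresponds to momentum=0.01 in PyTorch):

# pass momentum when building the layers ...
self.layer.add_module('bn 1', nn.BatchNorm1d(n_hidden1, momentum=0.01))

# ... or patch an already-built model in place
for m in model.modules():
    if isinstance(m, nn.BatchNorm1d):
        m.momentum = 0.01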