LBFGS: sklearn vs. PyTorch difference problem

I am trying to reproduce sklearn's MLP network with the LBFGS optimizer in PyTorch to solve a regression problem, but there seems to be a problem with my code, as the MAE is different from what I get with sklearn's MLP.
Below are my sklearn MLP configuration and my PyTorch implementation.

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(7, 3), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=40, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)
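
The sklearn side is trained and scored roughly like this (a sketch; only the variable names are taken from the code below, the exact fit/score calls may differ):

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

# Assumed training/scoring calls -- the exact ones are not shown in this post
sk_model = MLPRegressor(hidden_layer_sizes=(7, 3), solver='lbfgs',
                        alpha=0.0001, max_iter=40)
sk_model.fit(train_features, train_labels)
print('sklearn MAE:', mean_absolute_error(test_labels, sk_model.predict(test_features)))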
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as utils_data
from torch.optim import LBFGS

np.random.seed(29)
torch.manual_seed(29)
torch.cuda.manual_seed(29)

def error_analysis_torch(estimates, actual, title=''):
    # Raw and relative errors
    arr = estimates - actual
    err_per = arr / actual

    mae = torch.abs(arr).mean()
    std = torch.std(arr)
    err_per_std = torch.std(err_per)
    mape = 100 * (torch.abs(arr) / actual)
    accuracy = 100 - torch.mean(mape)
    print('Results:')
    print(accuracy, mae)



features_Pytorch = np.array(train_features)
labels_Pytorch = np.array(train_labels)
inputs = torch.from_numpy(features_Pytorch)#.cuda()
targets = torch.from_numpy(labels_Pytorch)#.cuda()


features_Pytorch_test = np.array(test_features)
labels_Pytorch_test = np.array(test_labels)
inputs_test = torch.from_numpy(features_Pytorch_test)#.cuda()
targets_test = torch.from_numpy(labels_Pytorch_test)#.cuda()


class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        out = self.fc3(F.relu(self.fc2(F.relu(self.fc1(x)))))
        return out

input_size = inputs.size()[1]
hidden_size1 = 7
hidden_size2 = 3
output_size = 1
num_epoch = 20

model = MLP(input_size = input_size, hidden_size1 = hidden_size1, hidden_size2 = hidden_size2,
            output_size = output_size)

optimizer = LBFGS(model.parameters(), lr=0.1)
criterion = nn.MSELoss()
training_samples = utils_data.TensorDataset(inputs, targets)
data_loader_trn = utils_data.DataLoader(training_samples, batch_size=32, drop_last=False, shuffle=False)
# train
for epoch in range(num_epoch):
    print('STEP: ', epoch)
    for batch_idx, (data, target) in enumerate(data_loader_trn):
        tr_x, tr_y = data.float(), target.float()

        # LBFGS re-evaluates the model through this closure during its line search
        def closure():
            optimizer.zero_grad()
            out = model(tr_x)
            loss = criterion(out, tr_y.unsqueeze(1))
            print('loss:', loss.item())
            loss.backward()
            return loss

        # step must run once per batch, otherwise only the last batch gets used
        optimizer.step(closure)

    with torch.no_grad():
        pred = model(inputs_test.float())
        loss = criterion(pred, targets_test.float().unsqueeze(1))
        print('test loss:', loss.item())
        y = pred.detach().numpy()

with torch.no_grad():
    pred_y = model(inputs_test.float()).squeeze(1)  # flatten to match the 1-D targets
error_analysis_torch(pred_y, targets_test.float())

The input has shape (1103, 61), where the 61 columns are 2 numeric columns and 59 one-hot encoded columns.

The MAE from sklearn = 928818
The MAE from PyTorch = 1883490
I am running this on Google Colab, and I used the same random seed for both experiments.

I'm not sure what all arguments in MLPRegressor stand for, but note that sklearn often uses regularization etc. by default, so you would need to add these techniques to your PyTorch model as well.

Thank you @ptrblck. How can I apply regularization when using the LBFGS optimizer? If I understand correctly, the weight decay term is the regularization term in PyTorch, but LBFGS doesn't have a weight_decay argument.

Regularization was just one example of features that might be missing when trying to match sklearn models.

I was rather referring to these arguments:

alpha=0.0001, batch_size='auto', beta_1=0.9,
beta_2=0.999, epsilon=1e-08,
max_fun=15000, max_iter=40, momentum=0.9,
n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
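
As far as I know, most of those arguments are ignored by the lbfgs solver (beta_1/beta_2/epsilon belong to adam, momentum/nesterovs_momentum/power_t to sgd, and batch_size is ignored because lbfgs trains on the full batch), but max_iter, max_fun, tol and alpha do matter. Here is a rough sketch of how they could be mapped onto torch.optim.LBFGS, with the alpha L2 penalty added manually inside the closure since LBFGS has no weight_decay argument. It reuses model, criterion, inputs and targets from the code above and is an approximation, not a drop-in equivalent of sklearn's scipy-based solver:

# Sketch: full-batch LBFGS roughly mirroring sklearn's lbfgs settings
optimizer = torch.optim.LBFGS(model.parameters(),
                              max_iter=40,            # sklearn max_iter
                              max_eval=15000,         # sklearn max_fun
                              tolerance_grad=1e-4,    # sklearn tol (gtol in scipy)
                              line_search_fn='strong_wolfe')

alpha = 0.0001                                        # sklearn's L2 penalty strength
x_full = inputs.float()                               # lbfgs in sklearn uses the full batch
y_full = targets.float().unsqueeze(1)

def closure():
    optimizer.zero_grad()
    loss = criterion(model(x_full), y_full)
    # sklearn adds 0.5 * alpha * ||W||^2 / n_samples (weight matrices only, no biases)
    l2 = sum((p ** 2).sum() for name, p in model.named_parameters() if 'weight' in name)
    loss = loss + 0.5 * alpha * l2 / x_full.shape[0]
    loss.backward()
    return loss

optimizer.step(closure)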