nn.L1Loss vs. Sklearn's l1 loss - different optimization results?

Hi, I was implementing L1 regularization (lasso) in PyTorch for feature selection and found that I get different results compared to sklearn or cvxpy. Perhaps I am using nn.L1Loss incorrectly, or maybe there is a better way to optimize (I tried both Adam and SGD with a few different learning rates)?

import numpy as np
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

import cvxpy as cp

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

import torch
import torch.nn as nn
import torch.optim as optim

# generate data
X,y, coef_true = make_regression(n_samples=200, n_features=10000, n_informative=10, 
                                 coef = True, random_state = 123)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

print(np.where(coef_true != 0)[0])
# [ 893 4422 4428 5284 5632 5975 6388 7586 8270 9597]

Using sklearn, I get the correct answer:

# sklearn lasso
lasso_sklearn = Lasso(alpha = 0.2, warm_start = True)
lasso_sklearn.coef_ = np.zeros(X_train.shape[1])
lasso_sklearn.fit(X_train, y_train)
coef_sklearn = lasso_sklearn.coef_
print(np.where(lasso_sklearn.coef_ != 0)[0])
# [ 893 4422 4428 5284 5632 5975 6388 7586 8270 9597]

Using pytorch, I get this answer:

# pytorch lasso
class lasso(nn.Module):
    def __init__(self, in_dim):
        super(lasso, self).__init__()
        self.linear = nn.Linear(in_dim,1)
    def forward(self, X):
        return self.linear(X)

def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.zeros_(m.weight)

lasso_pytorch = lasso(X_train.shape[1])
lasso_pytorch.apply(weights_init)

l1_loss = nn.L1Loss(reduction = 'sum')
mse_loss = nn.MSELoss()
optimizer = optim.Adam(lasso_pytorch.parameters(), lr = 0.0001)
alpha = 0.20
n_epoch = 5000
loss_history = []

lasso_pytorch.train()
for epoch in tqdm_notebook(range(n_epoch)):
    optimizer.zero_grad()
    outputs = lasso_pytorch(torch.from_numpy(X_train).float(),)
    loss = 0.5 * mse_loss(outputs, torch.from_numpy(y_train.reshape(-1,1)).float())
    # L1 penalty on every parameter (weight and bias)
    for param in lasso_pytorch.parameters():
        loss += alpha * l1_loss(param, torch.zeros_like(param))

    loss_history.append(loss.item())  # store the scalar, not the graph-holding tensor
    loss.backward()
    optimizer.step()
    
coef_pytorch = lasso_pytorch.linear.weight.detach().numpy().squeeze()
print(np.argsort(-np.abs(coef_pytorch))[:10])
# [5141 2251  902 2848 5002 8925 9328 8084 1888 2208]

Using cvxpy, I also get the correct answer:

# cvxpy lasso
def loss_fn(X, Y, beta):
    return cp.norm2(cp.matmul(X, beta) - Y)**2

def regularizer(beta,):
    return cp.norm1(beta)

def objective_fn(X, Y, beta, alpha):
    return 0.5/(len(X)) * loss_fn(X, Y, beta) + alpha * regularizer(beta)  # sklearn's Lasso objective: 1/(2n) * ||y - Xb||^2 + alpha * ||b||_1

coef = cp.Variable(X_train.shape[1])
coef.value = np.zeros(X_train.shape[1])
alpha = cp.Parameter(nonneg=True)
alpha.value = 0.2
problem = cp.Problem(cp.Minimize(objective_fn(X, y, coef, alpha,)))
problem.solve(solver = cp.ECOS, warm_start = True,)
coef_cvxpy = coef.value
print(np.where(coef.value > 0.01)[0])
# [ 893, 4422, 4428, 5284, 5632, 5975, 6388, 7586, 8270, 9597]

Interesting comparison! I’ve got the correct result only by using the LBFGS optimizer:

lasso_pytorch = nn.Linear(X_train.shape[1], 1, bias=True)

mse_loss = nn.MSELoss(reduction='sum')
optimizer = optim.LBFGS(lasso_pytorch.parameters(), lr=1)
alpha = 0.2
n_epoch = 5000


lasso_pytorch.train()
for epoch in range(n_epoch):
    def closure():
        optimizer.zero_grad()
        outputs = lasso_pytorch(torch.FloatTensor(X_train))
        loss = 0.5 * mse_loss(outputs, torch.FloatTensor(y_train.reshape(-1,1)))

        for param in lasso_pytorch.parameters():
            loss += alpha * param.abs().sum()
        loss.backward()
        return loss
        
    optimizer.step(closure)
    
coef_pytorch = np.array(lasso_pytorch.weight.data).squeeze()
print(np.sort(np.argsort(-np.abs(coef_pytorch))[:10]))

I don’t know if it helps, but I’d be happy to know why other optimizers don’t find the right coefs.


I’ve run into the same issue – I can’t reproduce scikit-learn’s LASSO results in either PyTorch or TensorFlow with most optimizers. In fact, I get very similar (sub-optimal) results from both PyTorch and TensorFlow, which has convinced me this is almost certainly an optimizer problem.

LBFGS is not working on my problem… after the first optimization step I get NaNs as output, and I haven’t found a reliable way to troubleshoot it.
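The only configuration change I have left to try is LBFGS’s built-in strong Wolfe line search together with a smaller learning rate, which should reject steps that blow up the loss. A minimal, untested sketch (reusing lasso_pytorch and the closure from the previous post; the lr and history_size values are guesses):

# Untested: LBFGS with a strong Wolfe line search, which backtracks on steps
# that would increase the loss too much (one common source of NaNs)
optimizer = optim.LBFGS(lasso_pytorch.parameters(), lr=0.1,
                        history_size=10, line_search_fn='strong_wolfe')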

IIRC LBFGS is a deterministic optimization algorithm. My observation has been that Adam and other stochastic optimizers tend not to drive coefficients exactly to zero (the L1 term only contributes a subgradient near zero, so weights hover around zero rather than landing on it) and therefore find sub-optimal solutions. If anyone has any thoughts on how to get Adam or other stochastic optimizers to drive some coefficients closer to zero, I would be very interested.
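One idea I have seen elsewhere but have not verified on this exact problem: handle the L1 term with its proximal operator instead of putting it into the loss, i.e. take an Adam step on the smooth MSE part only, then soft-threshold the weights in place, which can set coordinates exactly to zero. A minimal sketch, assuming the same X_train / y_train as above; the shrinkage amount alpha * lr is only a rough stand-in for the exact proximal step size, since Adam’s per-coordinate steps are adaptive:

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(X_train.shape[1], 1)   # same shape as the models above
nn.init.zeros_(model.weight)
mse = nn.MSELoss()
alpha, lr, n_epoch = 0.2, 1e-3, 5000

X_t = torch.from_numpy(X_train).float()
y_t = torch.from_numpy(y_train.reshape(-1, 1)).float()
optimizer = optim.Adam(model.parameters(), lr=lr)

def soft_threshold_(w, thresh):
    # in-place soft-thresholding: sign(w) * max(|w| - thresh, 0)
    with torch.no_grad():
        w.copy_(w.sign() * (w.abs() - thresh).clamp(min=0.0))

for epoch in range(n_epoch):
    optimizer.zero_grad()
    loss = 0.5 * mse(model(X_t), y_t)    # smooth part only, no L1 in the loss
    loss.backward()
    optimizer.step()
    soft_threshold_(model.weight, alpha * lr)   # proximal step for the L1 penalty

Whether the recovered support then matches sklearn’s on this dataset is something I haven’t checked.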