L1 regularization does not give a sparse solution in neural networks

I’m using PyTorch to build a neural network with L1-norm regularization on each layer. I add the penalty term directly to the loss function. However, while the estimated weights are shrunk, none of them is exactly zero. I’m not sure whether I need to manually apply a soft-thresholding (proximal) step to make them sparse. Any ideas on how to get a sparse solution?
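
For reference, this is the kind of soft-thresholding step I had in mind; the soft_threshold helper below is my own sketch, not something I have verified:

import torch


def soft_threshold(model: torch.nn.Module, threshold: float):
    """Shrink every weight towards zero by `threshold` and set small ones exactly to zero."""
    with torch.no_grad():
        for w in model.parameters():
            if w.dim() > 1:  # weight matrices only; bias vectors are left alone
                w.copy_(w.sign() * torch.clamp(w.abs() - threshold, min=0.0))

I was thinking of calling soft_threshold(model, lam * learning_rate) right after optimizer.step(), which I believe would turn the update into an ISTA-style proximal-gradient step, but I’m not sure that is the right approach.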

My code is pasted below.


from typing import List

import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    """
    Feed-forward neural network, built with the torch.nn API.
    """
    def __init__(self, input_size: int, hidden_size: List[int], output_size: int):
        """
        initialization function
        @param input_size: input data dimension
        @param hidden_size: list of hidden layer sizes, arbitrary length
        @param output_size: output data dimension
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.relu = nn.ReLU()
        # layers
        self.input = nn.Linear(self.input_size, self.hidden_size[0], bias=False)
        self.hiddens = nn.ModuleList([
            nn.Linear(self.hidden_size[h], self.hidden_size[h + 1]) for h in range(len(self.hidden_size) - 1)])
        self.output = nn.Linear(self.hidden_size[-1], self.output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        forward propagation process, required by the nn.Module class
        @param x: the input data
        @return: the network output (raw logits)
        """
        x = self.input(x)
        x = self.relu(x)
        for hidden in self.hiddens:
            x = hidden(x)
            x = self.relu(x)
        # return raw logits: CrossEntropyLoss applies log-softmax internally
        x = self.output(x)
        return x

def estimate(x, y, l1: bool = False, lam: float = None, learning_rate: float = 0.1,
             batch_size: int = 32, epochs: int = 50):
    """
    estimates the neural network model
    @param x: training data
    @param y: training labels
    @param l1: whether to use l1 norm regularization
    @param lam: tuning parameter for the l1 penalty
    @param learning_rate: learning rate
    @param batch_size: batch size
    @param epochs: number of epochs
    @return: the trained model
    """
    input_size = x.shape[1]
    hidden_size = [50, 30, 10]
    output_size = 2
    model = NeuralNet(input_size, hidden_size, output_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # pair each sample with its label so the DataLoader yields (data, label) batches
    trainset = []
    for i in range(x.shape[0]):
        trainset.append([x[i, :], y[i]])
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    for e in range(epochs):
        running_loss = 0
        for data, label in trainloader:
            input_0 = data.view(data.shape[0], -1)
            optimizer.zero_grad()
            output = model(input_0.float())
            loss = torch.nn.CrossEntropyLoss()(output, label.squeeze(1))
            if l1:
                if lam is None:
                    raise ValueError("lam needs to be specified when l1 is True.")
                # L1 penalty on the weight matrices only; bias vectors are not penalized
                for w in model.parameters():
                    if w.dim() > 1:
                        loss = loss + lam * w.norm(1)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"epoch {e}: average loss {running_loss / len(trainloader):.4f}")
    return model

I trained the model with the L1 penalty and printed the weights as follows.

model = estimate(x, y, l1=True, lam=0.01)  # 0.01 is just an example value for lam
for w in model.parameters():
    print(w)
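
To quantify this, I also counted how many weights end up close to zero; the 1e-6 cutoff below is an arbitrary tolerance I picked, not a standard value:

total, near_zero = 0, 0
for w in model.parameters():
    if w.dim() > 1:
        total += w.numel()
        near_zero += (w.abs() < 1e-6).sum().item()
print(f"{near_zero} of {total} weights have magnitude below 1e-6")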