Adding weight decay stops loss from decreasing

I am trying to learn PyTorch by building a perceptron to classify data points. I thought it would be interesting to see the effect of adding weight decay on the model's results.

For some reason, running the code below leads to the loss plateauing after about 5000 epochs:

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

# set random seed
torch.manual_seed(0)
# define perceptron
class Perceptron(nn.Module):
    def __init__(self):
        super(Perceptron,self).__init__()
        self.linear = nn.Linear(in_features=2,out_features=1)
    
    def forward(self,x):
        x = self.linear(x)
        x = torch.sigmoid(x)
        return x
# create test data points
class testData(Dataset):
    def __init__(self):
        super(testData,self).__init__()
        # test_data = [([3,2],1),([1,1],0)]
        test_data = [([3,2],1),([3,4],1),([3,6],1),([1,1],0),([1,3],0),([1,5],0)]
        self.data = test_data
    
    def __getitem__(self,index):
        dp,label = self.data[index]
        dp = torch.FloatTensor(dp)
        label = torch.tensor(label)
        return dp,label
    
    def __len__(self):
        return len(self.data)

def main():
    # epochs at which we plot the model
    epoch_samples = [1000,2000,3000,4000,5000,6000,7000,8000,9000,10000]
    fig,(ax1,ax2) = plt.subplots(nrows=2)
    # plot the data points (class 1 in blue, class 0 in green)
    ax1.scatter([3,3,3],[2,4,6],color='b')
    ax1.scatter([1,1,1],[1,3,5],color='g')
    # instantiate the model, optimiser and dataloader 
    model = Perceptron()
    dataset = testData()
    dataloader = DataLoader(dataset,batch_size=2)
    optimiser = optim.SGD(model.parameters(),lr=1,weight_decay=0.0001)
    # train for 10000 epochs
    for epoch in range(10001):
        total_loss=0
        for idx,batch in enumerate(dataloader):
            dp,label = batch
            preds = model(dp)
            optimiser.zero_grad()
            loss = F.binary_cross_entropy(preds.float(),label.unsqueeze(1).float())
            loss.backward()
            optimiser.step()
            total_loss += loss.item()
        # plot the current decision boundary if this epoch is in the sample list
        if epoch in epoch_samples:
            weights= model.linear.weight.detach().numpy()
            w0 = weights[0,0]
            w1 = weights[0,1]
            bias = model.linear.bias.detach().numpy()[0]
            x = np.linspace(0,5,50)
            y = -((w0*x+bias)/w1)
            # set the alpha coefficient based on the position of the epoch in the list
            a = ((epoch_samples.index(epoch)+1)/len(epoch_samples))
            # plot model
            ax1.plot(x,y,color='r',alpha=a)
            # plot the sum of the absolute weight values against the epoch
            ax2.scatter(epoch,[np.abs(w0)+np.abs(w1)],color='b')
            print(epoch,total_loss) 
    # output the plot
    plt.show()


if __name__ == "__main__":
    main()

Output:

1000 0.009901054669171572
2000 0.008643533918075264
3000 0.008495371788740158
4000 0.008475587470456958
5000 0.008474839036352932
6000 0.008474839036352932
7000 0.008474839036352932
8000 0.008474839036352932
9000 0.008474839036352932
10000 0.008474839036352932

Why does adding weight decay introduce this behaviour? Setting weight_decay to 0 gives me the following output instead:

1000 0.004912275762762874
2000 0.0024755716731306165
3000 0.001655233427300118
4000 0.0012433293013600633
5000 0.000995590366073884
6000 0.0008301999769173563
7000 0.0007119276633602567
8000 0.0006231760416994803
9000 0.0005540735000977293
10000 0.0004988349974155426
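
For context, my understanding (an assumption on my part, not something I have verified in the PyTorch source) is that weight_decay in torch.optim.SGD is plain L2 regularisation folded into the gradient, so each step effectively does w <- w - lr * (dL/dw + weight_decay * w). A minimal sketch of what I think a single parameter update looks like:

import torch

def sgd_step_with_decay(param, lr=1.0, weight_decay=0.0001):
    # assumed behaviour: the decay term is added to the gradient before the step,
    # i.e. w <- w - lr * (dL/dw + weight_decay * w)
    with torch.no_grad():
        param -= lr * (param.grad + weight_decay * param)

If that is right, is the plateau simply the point where the shrinking BCE gradient is exactly balanced by the weight_decay * w term, so the weights stop growing?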