Loss won't converge for OR-gate perceptron in plain Python

Out of curiosity, I am coding a perceptron for the OR gate from scratch in plain Python (using torch only for the sigmoid), but the loss won't converge.

import torch  # used only for the sigmoid activation
class Perceptron:
    def __init__(self):
        self.learning_rate = 0.01
        self.sigmoid = torch.nn.Sigmoid()
        
        # initializing weights
        self.w1, self.w2, self.bias = 0.01, 0.03, 0.05
      
    def predict(self, inputs):
        x1, x2 = inputs
        logits = (x1 * self.w1) + (x2 * self.w2) + self.bias
        predictions = self.sigmoid(torch.tensor(logits))
        return predictions
    
    def fit(self, training_inputs, targets, epochs=10000):
        for epoch in range(epochs):
            loss = 0
            gradient_0 = []
            gradient_1 = []
            gradient_2_w1 = []
            gradient_2_w2 = []
            
            for training_input, target in zip(training_inputs, targets):
                x1, x2 = training_input
                logits = (x1 * self.w1) + (x2 * self.w2) + self.bias
                prediction = self.sigmoid(torch.tensor(logits)).numpy()
                
                # sum of squared residuals, alternatively you can use mean squared error 
                loss += self.calculate_loss(target, prediction)
                
                # Accumulating gradients
                d_loss_and_d_prediction = -2 * (target - prediction)
                d_sigmoid_and_d_logits = logits * (1 - logits)
                gradient_0.append(d_loss_and_d_prediction)
                gradient_1.append(d_sigmoid_and_d_logits)
                gradient_2_w1.append(x1)
                gradient_2_w2.append(x2)
            
            print("loss: ", loss)
            
            # apply one update per training sample, using the factors stored above
            for i in range(len(gradient_0)):
                d_loss_and_d_w1 = gradient_2_w1[i] * gradient_0[i] * gradient_1[i]
                d_loss_and_d_w2 = gradient_2_w2[i] * gradient_0[i] * gradient_1[i]

                # calculate_step_size
                step_size_w1 = d_loss_and_d_w1 * self.learning_rate 
                step_size_w2 = d_loss_and_d_w2 * self.learning_rate

                # update weights
                self.w1 -= step_size_w1
                self.w2 -= step_size_w2

    
    def calculate_loss(self, target, prediction):
        return (target - prediction)**2

model = Perceptron()
training_inputs = [[1., 1.], [1., 0.], [0., 1.], [0., 0.]]
targets = [1., 1., 1., 0.]
model.fit(training_inputs, targets)
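
For reference, the gradient I am trying to accumulate is the chain rule

d_loss/d_w1 = d_loss/d_prediction * d_prediction/d_logits * d_logits/d_w1

which is why gradient_0, gradient_1 and gradient_2_w1 / gradient_2_w2 store the three factors separately, and the second loop multiplies them back together before taking the step.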

However, when I try the same thing with PyTorch, it works:

import torch
import torch.optim as optim

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.FC1 = torch.nn.Linear(2, 1)
        
    def forward(self, training_inputs):
        return torch.sigmoid(self.FC1(training_inputs))
        
        
model = Model()
model.train()

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.0)
training_inputs = torch.tensor([[1, 1], [1, 0], [0, 1], [0, 0]], dtype = torch.float32)
targets = torch.tensor([[1], [1], [1], [0]], dtype = torch.float32)

def calculate_loss(targets, predictions):
    # sum of squared residuals over the whole batch
    loss = 0
    for target, prediction in zip(targets, predictions):
        loss += (target - prediction)**2
    return loss
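
As far as I can tell, this hand-written loop is equivalent to PyTorch's built-in MSE criterion with sum reduction:

criterion = torch.nn.MSELoss(reduction='sum')
loss = criterion(predictions, targets)  # same value as the manual sum above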

epochs = 10000
for epoch in range(epochs):
    optimizer.zero_grad()
    predictions = model(training_inputs)
    loss = calculate_loss(targets, predictions)
    loss.backward()
    optimizer.step()
    print(loss)
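
After training I sanity-check the model by thresholding the sigmoid outputs at 0.5:

with torch.no_grad():
    print(model(training_inputs).round())  # expecting [[1], [1], [1], [0]] for OR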

I would recommend using the same initial values in your PyTorch model via:

with torch.no_grad():
    model.FC1.weight.copy_(torch.tensor([[0.01, 0.03]]))
    model.FC1.bias.copy_(torch.tensor([0.05]))

Afterwards you could compare the loss values as well as the gradients to debug the issue further.
Also, in your plain-Python model you are never updating the bias, which might already prevent convergence. A second bug to check: the derivative of the sigmoid is sigmoid(z) * (1 - sigmoid(z)), so d_sigmoid_and_d_logits should be prediction * (1 - prediction), not logits * (1 - logits).
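
For example, here is a minimal sketch (assuming the model, data and calculate_loss from your PyTorch script are already defined) that prints the autograd gradients, which you can then compare against your accumulated d_loss_and_d_w1 / d_loss_and_d_w2 values:

optimizer.zero_grad()
predictions = model(training_inputs)
loss = calculate_loss(targets, predictions)
loss.backward()

# because the loss is a plain sum, these should equal your per-sample
# gradients summed over the four inputs (once the sigmoid derivative is fixed)
print(model.FC1.weight.grad)
print(model.FC1.bias.grad)

The missing bias update in your manual loop would look something like this (a sketch using your variable names; d_logits/d_bias is 1):

d_loss_and_d_bias = gradient_0[i] * gradient_1[i]
self.bias -= d_loss_and_d_bias * self.learning_rate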