MLP Scratch Implementation for Classification on MNIST

My Model is not learning at all means i think weights are not updating plz help me in this plz
Code goes like

## custum dataset
import torch
import idx2numpy
import numpy as np
import matplotlib.pyplot as plt
    
class DatasetScratch:
    def __init__(self, image_file, label_file):
        self.images = torch.tensor(idx2numpy.convert_from_file(image_file)) # images from idx file --> to tensor
        self.labels = torch.tensor(idx2numpy.convert_from_file(label_file))# corresponding labels from idx file  -->to tensor
        self.length = len(self.labels)

    def __getitem__(self, index):
        return self.images[index], self.labels[index]

    def __len__(self):
        return self.length
    
scratch_dataset_train = DatasetScratch("train-images.idx3-ubyte","train-labels.idx1-ubyte")

## Scratch DataLoader
class DataLoaderScratch:
    def __init__(self,dataset,batchsize,shuffle=True):
        self.dataset  = dataset
        self.batchsize = batchsize
        self.shuffle = shuffle   
    def __iter__(self):
        if self.shuffle:
            indices  = torch.randperm(len(self.dataset))
        else:
            indices =torch.arange(len(self.dataset))
        # Creating batches
        for start_index in range(0,len(self.dataset),self.batchsize):
            end_index = start_index + self.batchsize
            batch_indices = indices[start_index:end_index]
            images,labels = self.dataset[batch_indices]
            #print(images.shape)
            batch = images,labels
            yield batch

scratch_loader = DataLoaderScratch(scratch_dataset_train,64)

Here mine whole Model

import torch
import numpy as np

def sigmoid(x):
    return 1 / (1 + torch.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

def relu(x):
    return torch.max(torch.tensor(0.0), x)

# def relu_derivative(x):
#     ## Creating a mask where the value of output z  = wx+b > 0
#     mask = (x > 0).float() # binary to float
#     return mask
    
def softmax(x):
    exp_values = torch.exp(x - torch.max(x, dim=1, keepdim=True)[0])
    sum_exp_values = torch.sum(exp_values, dim=1, keepdim=True)
    softmax_output = exp_values / sum_exp_values
    return softmax_output
    
    
# def categorical_crossentropy(y_true, y_pred):
#     return torch.nn.functional.nll_loss(torch.nn.functional.log_softmax(y_pred, dim=1), torch.argmax(y_true, dim=1))
# def categorical_crossentropy(y_true, y_pred):
#     epsilon = 1e-15
#     y_pred = torch.clip(y_pred, epsilon, 1 - epsilon)
#     return -torch.sum(y_true * torch.log(y_pred + epsilon)) / len(y_true)

def categorical_crossentropy(y_true, y_pred):
    # Clip y_pred to avoid log(0) issues
    epsilon = 1e-15
    y_pred = torch.clamp(y_pred, epsilon, 1 - epsilon)

    # Calculate cross-entropy loss
    loss = -torch.sum(y_true * torch.log(y_pred)) / len(y_true)

    return loss

# def categorical_crossentropy(y_true, y_pred):
#     return torch.nn.functional.cross_entropy(y_pred, torch.argmax(y_true, dim=1))

def normalize_image_tensor(image_tensor):
    # Convert to floating-point if not already
    image_tensor = image_tensor.float()

    # Perform min-max normalization
    min_value = torch.min(image_tensor)
    max_value = torch.max(image_tensor)
    normalized_tensor = (image_tensor - min_value) / (max_value - min_value)

    return normalized_tensor

def relu_derivative(x):
    return torch.where(x < 0, torch.tensor(0.0), torch.tensor(1.0))   

def accuracy(output, y):
    _, predicted = torch.max(output, 1)
   # print(predicted)
    # Compare the predicted class indices with the true labels
    correct = (predicted == y).float()

    # Calculate the accuracy
    accuracy = correct.sum() / len(correct)

    return accuracy.item()  # Return as a Python float
    

class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers
        # Initialize all weights with Xavier/Glorot initialization
        self.weights = [ torch.randn(layers[i], layers[i+1]) / np.sqrt(layers[i]) for i in range(len(layers)-1)]
        self.biases = [torch.zeros((1, layers[i+1])) for i in range(len(layers)-1)]

        for weights in self.weights:
            print(weights.shape)

    def forward(self, X, activation_fun, softmax):
        self.activations = [X]
        
        self.linear_outputs = []
        for i in range(len(self.layers)-2):
            # Z = X @ w + biases
            self.linear_outputs.append(self.activations[-1] @ self.weights[i] + self.biases[i])
            # a = activation_fun(Z)
            self.activations.append(activation_fun(self.linear_outputs[-1]))

        # Output layer with softmax activation
        self.linear_outputs.append(self.activations[-1] @ self.weights[-1] + self.biases[-1])
        self.activations.append(softmax(self.linear_outputs[-1]))
        return self.activations[-1]
  

    def backward(self, X, y, learning_rate, activation_fun_derivative, clip_value=10.0):
        output_error = self.activations[-1] - y
        output_delta = output_error

        # Update weights and biases for the output layer with gradient clipping
        dwL = (self.activations[-2].T @ output_delta)
        dwL_clipped = torch.clamp(dwL, -clip_value, clip_value)
        self.weights[-1] = self.weights[-1] - learning_rate * dwL_clipped

        dbL = torch.sum(output_delta, axis=0, keepdim=True)
        dbL_clipped = torch.clamp(dbL, -clip_value, clip_value)
        self.biases[-1] = self.biases[-1] - learning_rate * dbL_clipped

        for i in range(len(self.weights)-2, 0, -1):
            hidden_error = output_delta @ self.weights[i+1].T
            hidden_delta = activation_fun_derivative(self.activations[i]) * hidden_error

            # Update weights and biases for hidden layers with gradient clipping
            dwi = self.activations[i].T @ hidden_delta
            dwi_clipped = torch.clamp(dwi, -clip_value, clip_value)
            self.weights[i] = self.weights[i] - learning_rate * dwi_clipped

            dbi = torch.sum(hidden_delta, axis=0, keepdim=True)
            dbi_clipped = torch.clamp(dbi, -clip_value, clip_value)
            self.biases[i] = self.biases[i] - learning_rate * dbi_clipped

            output_delta = hidden_delta

        dw0 = (X.T @ output_delta)
        dw0_clipped = torch.clamp(dw0, -clip_value, clip_value)
        self.weights[0] = self.weights[0] - learning_rate * dw0_clipped

        db0 = torch.sum(output_delta, axis=0, keepdim=True)
        db0_clipped = torch.clamp(db0, -clip_value, clip_value)
        self.biases[0] = self.biases[0] - learning_rate * db0_clipped

        db0 = torch.sum(output_delta, axis=0, keepdim=True)
        self.biases[0] = self.biases[0] - learning_rate * db0  # Update this line


    def train(self,learning_rate, activation_fun, softmax, activation_fun_derivative,epochs,DataLoader,Normalize):
        for epoch in range(epochs):
            total_loss=0
            criterion = categorical_crossentropy
            # i=0
            totalAcc =0
            for X,y in DataLoader:
                X =Normalize(X.view(X.shape[0],-1).float())
                # one hot encoding 
                num_class= 10
                one_hot_encode = torch.eye(num_class)[y.squeeze().long()]
              
                output = self.forward(X, activation_fun, softmax)
                accur = accuracy(output,y.squeeze().long())*100
                print("accurr--> ",accur)
                totalAcc+=accur
                loss= criterion(one_hot_encode,output)
                total_loss+= loss
                # print(f"loss --> {loss:.2f}")
                self.backward(X, one_hot_encode, learning_rate, activation_fun_derivative)
                # i+=1
                # if(i==30):
                #     break
            # print("accuracy -> ",totalAcc/937)
            print("loss ->",total_loss/937)

    # Example usage
input_size = 784  # MNIST input size
hidden_size = 32
output_size = 10
learning_rate = 0.003
epochs = 10

model_scratch = NeuralNetwork([input_size, hidden_size, hidden_size, hidden_size, hidden_size, output_size])
model_scratch.train(learning_rate,relu,softmax,relu_derivative,epochs,scratch_loader,normalize_image_tensor)
def normalize_image_tensor(image_tensor):
    # Convert to floating-point if not already
    image_tensor = image_tensor.float()

    # Perform min-max normalization
    min_value = torch.min(image_tensor)
    max_value = torch.max(image_tensor)
    normalized_tensor = (image_tensor - min_value) / (max_value - min_value)

    return normalized_tensor

please if anyone can find mistakes and point out all those mistakes would be very helpful for me.
Thanks