Changing the for loops in my custom convolution layer

Hello
I have implemented my convolution layer and changed its forward function as shown below. Because I have to perform certain operations on the input matrix and the weights, I have to use several for loops, but this has made the execution of the LeNet-5 network, which is a small network, very slow. How can I remove these for loops?

class MyConv2d1(nn.Conv2d):
    """Conv2d variant whose forward pass replaces the multiplication inside
    the im2col matmul with a stochastic-computing style operation (BISC).

    Relies on names defined elsewhere in the surrounding project:
    ``layer_weights1``, ``layer_bias1``, ``FloatToBinary``,
    ``FloatToBinaryThenInt``, ``BISC``, and ``length``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        # NOTE(review): the paste had `init` — restored to `__init__` so the
        # constructor is actually invoked.
        super(MyConv2d1, self).__init__(in_channels, out_channels, kernel_size,
                                        stride, padding)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # Externally provided weights/biases — presumably shaped
        # (out_channels, in_channels, k, k) and (out_channels,); TODO confirm.
        self.weights = layer_weights1
        self.biases = layer_bias1

    def forward(self, input):
        """Apply the custom convolution to `input` of shape (B, C, H, W)."""
        batch_size, in_channels, in_height, in_width = input.shape

        # Output spatial dimensions for a square kernel (floor division
        # instead of int(.../...) — identical for non-negative operands).
        out_height = (in_height + 2 * self.padding - self.kernel_size) // self.stride + 1
        out_width = (in_width + 2 * self.padding - self.kernel_size) // self.stride + 1

        # Zero-pad all four spatial borders.
        input_padded = torch.nn.functional.pad(
            input, (self.padding, self.padding, self.padding, self.padding))

        # im2col: (B, C*k*k, L) -> (B, L, C*k*k), L = sliding positions.
        unfolded = torch.nn.functional.unfold(
            input_padded, (self.kernel_size, self.kernel_size))
        # BUG FIX: the original made a copy with torch.tensor(...) and then
        # immediately discarded it, so the in-place apply_ mutated the unfold
        # output through the transpose view. Clone explicitly instead.
        inputfold = unfolded.transpose(1, 2).clone()
        inputfold.apply_(FloatToBinary)  # apply_ is in-place and CPU-only

        outputfold = torch.zeros((batch_size, inputfold.shape[1], self.out_channels))

        # Flatten weights to (C*k*k, out_channels). Clone before the in-place
        # apply_ so self.weights is left untouched.
        weightfold = self.weights.view(self.weights.size(0), -1).t().clone()
        weightfold.apply_(FloatToBinaryThenInt)

        # Custom "multiply": take the first |weight| elements of the BISC
        # stochastic stream of each input value and count the non-zeros,
        # signed by the weight's sign.
        for b in range(batch_size):
            for outchannel in range(self.out_channels):
                for i in range(inputfold.shape[1]):
                    acc = 0  # renamed from `sum` to avoid shadowing the builtin
                    for j in range(inputfold.shape[2]):
                        weight_value = abs(int(weightfold[j][outchannel]))
                        # The original also pre-allocated torch.zeros((1, w))
                        # here and immediately overwrote it — dead code removed.
                        stream = BISC(int(inputfold[b][i][j]))
                        part = torch.tensor(stream[0:weight_value])
                        if weightfold[j][outchannel] < 0:
                            acc -= torch.count_nonzero(part)
                        else:
                            acc += torch.count_nonzero(part)
                    outputfold[b][i][outchannel] = acc * (2 ** length)

        # BUG FIX: the original divided *inside* the batch loop, so the whole
        # tensor was scaled down once per batch (batch b ended up divided
        # batch_size - b times). Normalize exactly once, after all batches.
        outputfold /= 2 ** (2 * length)

        # (B, L, out_channels) -> (B, out_channels, L) -> spatial feature map.
        outputfold = outputfold.transpose(1, 2)
        output = torch.nn.functional.fold(outputfold, (out_height, out_width), (1, 1))
        return output

These four for loops perform the work of the matmul function, except that my own function is used in place of the multiplication inside matmul.
Is there a way to replace the multiplication operation in the matmul function with my own function?