How can I get broadcasting benefits?

I have a simple custom NN as below. it works but is very slow. every epoch was accomplished in about 27 minutes !!! how can I vectorize and utilize broadcasting in my implementation?

class MaxSigNet(nn.Module):

    def __init__(self, in_channels, out_channels):
        super(MaxSigNet, self).__init__()
       
        mywheights=torch.tensor([[[[(0), (15/255.0), (3/255.0), (4/255.0), (0)],
               [(37/255.0), (1/255.0), (2/255.0), (4/255.0), (6/255.0)],
               [(65/255.0), (128/255.0), (0), (8/255.0), (9/255.0)],
               [(68/255.0), (64/255.0), (32/255.0), (16/255.0), (15/255.0)],
               [(0), (58/255.0), (37/255.0), (24/255.0), (0)]]]],dtype=torch.float32,requires_grad=True)
                
        mymaps=torch.tensor([[[[0,1,1,1,0],
                               [1,1,1,1,1],
                               [1,1,0,1,1],
                               [1,1,1,1,1],
                               [0,1,1,1,0]]]],dtype=torch.float32,requires_grad=False)
        
        self.weight=nn.Parameter(torch.mul(mywheights,mymaps))
        print("my initial weights are:",self.weight)
        self.map=nn.Parameter(mymaps)
        self.out_channels=out_channels
        self.in_channels=in_channels
        
    def Apply_MaxSig(self,img):  
        imgn=F.pad(input=img, pad=(2,2,2,2), mode='constant',value = 0.0)
        newimgn=torch.zeros_like(imgn)
        img_max=imgn.max()

        for i in range(2,imgn.shape[0]-2):
            for j in range(2,imgn.shape[1]-2):

                a=torch.flatten(torch.mul(self.weight,self.map))
                b=torch.flatten(imgn[i-2:i+3,j-2:j+3])
                
                siga=torch.sigmoid(a)
                
                sigb=torch.sigmoid(b/img_max)
               
                aa=torch.maximum(siga,sigb)
                max5=torch.sum(aa)        
                
                newimgn[i,j]=(24*sigb[12] - max5)
       
        return newimgn[2:-2,2:-2]#undo pad
 
    def forward(self, input_image):
        for i in range(self.out_channels):
            for j in range(self.in_channels):
                
                img=input_image[i,j]
                img=self.Apply_MaxSig(img)
                input_image[i,j]=img                
                
        return input_image

class build_simple_net(nn.Module):
    def __init__(self):
        super().__init__()
        self.s0 = MaxSigNet(1,1)

    def forward(self, inputs):
        return self.s0(inputs)

Hi guys. I solved this. maybe useful for somebody.

def forward(self, input_image):
        B=torch.nn.functional.unfold(torch.sigmoid(input_image), kernel_size=(5,5), padding=2, stride=1)
        B=B.reshape(input_image.shape[0],25,input_image.shape[2] , input_image.shape[3])
        C=torch.sigmoid(self.weight).reshape(1,25,1,1)
        D=torch.maximum(C,B) * self.map.reshape(1,25,1,1)
        E=torch.sum(D,axis=1)
        F=24*B[:,12,...] - E     
        return F.unsqueeze(dim=1)