Alternatives to for-looping over a ModuleList

I have a neural network in which the neurons of a single layer (layer 2 in the example below) are partitioned into n “blocks”. I want to forward-propagate through each block separately, producing n outputs. At present I do this with an `nn.ModuleList` and a for loop over the individual blocks, but this makes the code terribly slow. Is there a more efficient way to do this?

Any help would be appreciated, thanks a lot!

class small_net(nn.Module):
    """Small binarized network whose second layer is split into channel blocks.

    Layer 1 produces 50 feature maps. Layer 2 slices those maps along the
    channel axis into consecutive blocks, runs each block through its own
    conv -> max-pool -> batch-norm -> sign pipeline, and averages the
    per-block results before the fully connected head.

    NOTE(review): ``fixed_block_size(50, 10)`` is defined elsewhere; from its
    use here it is assumed to return a sequence of per-block channel counts
    (each entry is used both as a conv ``in_channels`` and as a slice width).
    NOTE(review): ``fc3`` expects ``100 * 16 * 16`` features, so the input is
    presumably 32x32 spatially -- confirm against the data pipeline.
    """

    def __init__(self, num_classes=10):
        """Build the layers.

        Args:
            num_classes: Number of outputs of the final linear layer.
        """
        super().__init__()

        # Layer 1: binarized conv over the 3 input channels.
        self.bCov1 = BinarizeConv2d(3, 50, kernel_size=3, stride=1, padding=1, bias=True)
        self.bn1 = nn.BatchNorm2d(50, affine=False)
        # Not called in forward(); kept so the module's attributes stay the same.
        self.htan1 = nn.Hardtanh(inplace=True)

        # Layer 2: one conv and one batch-norm per channel block.
        self.layer2_neuron_blocks = fixed_block_size(50, 10)
        self.layer2_convs = nn.ModuleList(
            [
                BinarizeConv2d(block_channels, 100, kernel_size=3, padding=1, bias=True)
                for block_channels in self.layer2_neuron_blocks
            ]
        )
        self.mp2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.layer2_bns = nn.ModuleList(
            [nn.BatchNorm2d(100, affine=False) for _ in self.layer2_neuron_blocks]
        )

        # Layer 3: fully connected over the flattened, block-averaged maps.
        self.fc3 = BinarizeLinear(100 * 16 * 16, 20, bias=True)
        self.bn3 = nn.BatchNorm1d(20, affine=False)
        self.htan3 = nn.Hardtanh(inplace=True)

        # Layer 4: classifier head.
        self.fc4 = BinarizeLinear(20, num_classes, bias=True)
        self.bn4 = nn.BatchNorm1d(num_classes, affine=False)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Input batch with 3 channels (channel axis is dim 1).

        Returns:
            The output of ``bn4`` applied to ``fc4``; it stays on the same
            device as the incoming activations.
        """
        x = self.bCov1(x)
        x = self.bn1(x)
        x = SignActivation.apply(x)

        # Collect per-block outputs in a list and stack them once at the end.
        # This keeps every tensor on the activations' own device/dtype instead
        # of copying each block into a separately allocated buffer, which
        # forces a host<->device transfer per block when running on a GPU.
        block_outputs = []
        start = 0
        for width, conv, bn in zip(
            self.layer2_neuron_blocks, self.layer2_convs, self.layer2_bns
        ):
            stop = start + width
            h = conv(x[:, start:stop, :, :])
            h = self.mp2(h)
            h = bn(h)
            block_outputs.append(SignActivation.apply(h))
            start = stop

        # Average over the block axis (dim 1 of the stacked tensor).
        x2 = torch.stack(block_outputs, dim=1).mean(dim=1)

        # Pin the batch dimension so a spatial-size mismatch raises here
        # rather than being silently folded into the batch axis.
        x = x2.view(x2.size(0), 100 * 16 * 16)

        x = self.fc3(x)
        x = self.bn3(x)
        x = self.htan3(x)

        x = self.fc4(x)
        x = self.bn4(x)

        return x