Basic CNN from scratch - running too slow

I have coded up a basic ConvNet from scratch (1 conv, 1 pool, and 2 fc layers) and it is running too slow. I would like to verify that I am doing this correctly and find out what can be done to speed it up:

The individual conv_forward and pool_forward functions work correctly in NumPy.

import torch
import torch.nn as nn
import torch.nn.functional as f

class ConvNet(nn.Module):
  def __init__(self, _filters, biases, dim_in_fc1, dim_out_fc1, dim_out_fc2):
    super(ConvNet, self).__init__()
    self._filters = nn.Parameter(torch.randn(_filters))
    self.biases = nn.Parameter(torch.randn(biases))
    self.W1 = nn.Parameter(torch.rand(dim_in_fc1, dim_out_fc1, dtype=torch.float))
    self.b1 = nn.Parameter(torch.zeros(dim_out_fc1))
    self.W2 = nn.Parameter(torch.rand(dim_out_fc1, dim_out_fc2, dtype=torch.float))
    self.b2 = nn.Parameter(torch.zeros(dim_out_fc2))
 
  def _conv_forward(self, Activation_prev_layer, _filters, biases, stride_pad_dict):
    # Activation_prev_layer: (batch, channels, height, width)
    m, ch, a, b = Activation_prev_layer.size()
    # _filters: (channels, filter_h, filter_w, num_filters)
    ch, fh, fw, num = _filters.size()

    stride = stride_pad_dict["stride"]
    pad = stride_pad_dict["pad"]

    new_a = (a + 2*pad - fh)//stride + 1
    new_b = (b + 2*pad - fw)//stride + 1

    z = torch.zeros((m, num, new_a, new_b))

    # padding() and conv_step() are user-defined helpers (not shown here)
    Activation_prev_layer_pad = padding(Activation_prev_layer, pad)
    for i in range(m):
      Activation_prev_layer_pad_slice = Activation_prev_layer_pad[i]
      for x in range(new_a):
        for y in range(new_b):
          for n in range(num):
            x_start = stride*x
            x_end = stride*x + fh
            y_start = stride*y
            y_end = stride*y + fw
            z[i, n, x, y] = conv_step(Activation_prev_layer_pad_slice[:, x_start:x_end, y_start:y_end], _filters[:, :, :, n], biases[:, :, :, n])

    cache = (Activation_prev_layer, _filters, biases, stride_pad_dict)

    return z

  def _pool_forward(self, Activation_prev_layer, stride_pool_dict, mode="max"):
    # Activation_prev_layer: (batch, channels, height, width)
    m, ch, a, b = Activation_prev_layer.size()
    stride = stride_pool_dict["stride"]
    pool = stride_pool_dict["pool"]

    new_a = (a-pool)//stride + 1
    new_b = (b-pool)//stride + 1

    z = torch.zeros((m, ch, new_a, new_b))

    for i in range(m):
      Activation_prev_layer_slice = Activation_prev_layer[i]
      for x in range(new_a):
        for y in range(new_b):
          for c in range(ch):
            x_start = stride*x
            x_end = stride*x + pool
            y_start = stride*y
            y_end = stride*y + pool
            Activation_prev_layer_slice_pool = Activation_prev_layer_slice[c, x_start:x_end, y_start:y_end]
            if mode=="max":
              z[i, c, x, y] =  torch.max(Activation_prev_layer_slice_pool)
            elif mode=="average":
              z[i, c, x, y] =  torch.mean(Activation_prev_layer_slice_pool)

    cache = (Activation_prev_layer, stride_pool_dict)

    return z


  def fc(self, x, W, b):
    z = torch.add(torch.mm(x, W), b)
    a = f.log_softmax(z, dim=1)

    return a

  def forward_feed(self, x, stride_pad_dict, stride_pool_dict):
    m = x.size()[0]
    x = f.relu(self._conv_forward(x, self._filters, self.biases, stride_pad_dict))
    x = self._pool_forward(x, stride_pool_dict)
    x = x.view(m, -1)
    x = self.fc(x, self.W1, self.b1)
    x = self.fc(x, self.W2, self.b2)
    return x

Here is how I am running the training loop:

model = ConvNet((1, 3, 3, 8), (1,1,1, 8), 1800,50,10)
stride_pad_dict = {"stride": 1, "pad": 2}
stride_pool_dict = {"stride": 2, "pool":2}

epochs = 10
train = torch.optim.SGD(params=model.parameters(), lr=0.01)

for i in range(epochs):
  for batch_idx, (image, target) in enumerate(mnist_train_loader):
      train.zero_grad()
      pred_y = model.forward_feed(image, stride_pad_dict, stride_pool_dict)
      loss = f.nll_loss(pred_y, target)
      loss.backward()
      train.step()
      print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(i, batch_idx * len(image), len(mnist_train_loader.dataset),100. * batch_idx / len(mnist_train_loader), loss.item()))

Nested loops are expected to be slow.
You could use an im2col approach with matrix multiplications to speed up the manual convolution.
What is your use case for this code?
If it’s demo code to show the underlying operations of conv layers (to students), I wouldn’t care too much about the performance.
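If you do want it faster, a rough sketch of the im2col idea (using torch.nn.functional.unfold, keeping the same (ch, k, k, num) filter layout as in your post, and assuming square filters) could look like this:

import torch
import torch.nn.functional as F

def conv_forward_im2col(x, filters, biases, stride, pad):
    # x: (m, ch, a, b); filters: (ch, k, k, num) as in the post; biases: (1, 1, 1, num)
    m, ch, a, b = x.size()
    k, num = filters.size(1), filters.size(3)
    new_a = (a + 2 * pad - k) // stride + 1
    new_b = (b + 2 * pad - k) // stride + 1

    # im2col: every receptive field becomes one column -> (m, ch*k*k, new_a*new_b)
    cols = F.unfold(x, kernel_size=k, padding=pad, stride=stride)

    # flatten the filters to (num, ch*k*k) so a single matmul covers all windows
    w = filters.permute(3, 0, 1, 2).reshape(num, -1)

    out = w @ cols + biases.reshape(1, num, 1)   # (m, num, new_a*new_b)
    return out.reshape(m, num, new_a, new_b)

This replaces the four nested Python loops with one unfold and one batched matrix multiplication, which is where most of the speedup comes from.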

Yes, it's for demonstration purposes. Thanks for the response. I am new to the community. Is there a way to check out the PyTorch Conv2d implementation in nn.Module?

You can find the native implementation here. Note that, depending on the device and setup, different implementations and backends will be used, as seen here.
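If you just want to verify your manual layer numerically, you could also compare it against the functional API directly. A rough sketch (note that F.conv2d expects weights as (out_channels, in_channels, k, k), so your (ch, k, k, num) filters need a permute):

import torch
import torch.nn.functional as F

x = torch.randn(4, 1, 28, 28)        # a fake MNIST batch
filters = torch.randn(1, 3, 3, 8)    # (ch, k, k, num) layout from the post
biases = torch.randn(1, 1, 1, 8)

# reorder to F.conv2d's (out_channels, in_channels, k, k) weight layout, bias as (out_channels,)
weight = filters.permute(3, 0, 1, 2).contiguous()
bias = biases.reshape(-1)

ref = F.conv2d(x, weight, bias, stride=1, padding=2)

# compare against the manual loop version, e.g.:
# out = model._conv_forward(x, filters, biases, {"stride": 1, "pad": 2})
# print(torch.allclose(out, ref, atol=1e-5))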

Thanks a ton for this :slight_smile:

I think the main reason is that you are running it on the CPU.
To run it on the GPU:

model = model.cuda()

Second, there are a lot of loops, as Peter pointed out.
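Just moving the model is not quite enough for this particular code, though: the input batch and the tensors allocated inside the forward pass (the torch.zeros(...) calls) also have to live on the GPU. A sketch, assuming a CUDA device is available:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
train = torch.optim.SGD(params=model.parameters(), lr=0.01)  # create the optimizer after moving the model

for i in range(epochs):
  for batch_idx, (image, target) in enumerate(mnist_train_loader):
      image, target = image.to(device), target.to(device)  # move the batch to the same device
      train.zero_grad()
      pred_y = model.forward_feed(image, stride_pad_dict, stride_pool_dict)
      loss = f.nll_loss(pred_y, target)
      loss.backward()
      train.step()

# inside _conv_forward / _pool_forward, allocate the outputs on the same device, e.g.
# z = torch.zeros((m, num, new_a, new_b), device=Activation_prev_layer.device)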