Basic CNN from scratch - running too slow

I have coded up a basic ConvNet from scratch (1 conv, 1 pool, and 2 fc layers) and it is running too slow. I would like to verify that I am doing this correctly and find out what can be done to speed it up:

The individual conv_forward and pool_forward functions work correctly in NumPy.

import torch
import torch.nn as nn
import torch.nn.functional as f

class ConvNet(nn.Module):
  def __init__(self, _filters, biases, dim_in_fc1, dim_out_fc1, dim_out_fc2):
    super(ConvNet, self).__init__()
    self._filters = nn.Parameter(torch.randn(_filters))
    self.biases = nn.Parameter(torch.randn(biases))
    self.W1 = nn.Parameter(torch.rand(dim_in_fc1, dim_out_fc1, dtype=torch.float))
    self.b1 = nn.Parameter(torch.zeros(dim_out_fc1))
    self.W2 = nn.Parameter(torch.rand(dim_out_fc1, dim_out_fc2, dtype=torch.float))
    self.b2 = nn.Parameter(torch.zeros(dim_out_fc2))
 
  def _conv_forward(self, Activation_prev_layer, _filters, biases, stride_pad_dict):
    # Activation_prev_layer: (batch, channels, height, width)
    m, ch, a, b = Activation_prev_layer.size()
    # _filters: (channels, filter_h, filter_w, num_filters)
    ch, fh, fw, num = _filters.size()

    stride = stride_pad_dict["stride"]
    pad = stride_pad_dict["pad"]

    new_a = (a + 2*pad - fh)//stride + 1
    new_b = (b + 2*pad - fw)//stride + 1

    z = torch.zeros((m, num, new_a, new_b))

    # padding() and conv_step() are user-defined helpers (not shown here)
    Activation_prev_layer_pad = padding(Activation_prev_layer, pad)
    for i in range(m):
      Activation_prev_layer_pad_slice = Activation_prev_layer_pad[i]
      for x in range(new_a):
        for y in range(new_b):
          for n in range(num):
            x_start = stride*x
            x_end = stride*x + fh
            y_start = stride*y
            y_end = stride*y + fw
            z[i, n, x, y] = conv_step(Activation_prev_layer_pad_slice[:, x_start:x_end, y_start:y_end], _filters[:, :, :, n], biases[:, :, :, n])

    cache = (Activation_prev_layer, _filters, biases, stride_pad_dict)

    return z

  def _pool_forward(self, Activation_prev_layer, stride_pool_dict, mode="max"):
    # Activation_prev_layer: (batch, channels, height, width)
    m, ch, a, b = Activation_prev_layer.size()
    stride = stride_pool_dict["stride"]
    pool = stride_pool_dict["pool"]

    new_a = (a-pool)//stride + 1
    new_b = (b-pool)//stride + 1

    z = torch.zeros((m, ch, new_a, new_b))

    for i in range(m):
      Activation_prev_layer_slice = Activation_prev_layer[i]
      for x in range(new_a):
        for y in range(new_b):
          for c in range(ch):
            x_start = stride*x
            x_end = stride*x + pool
            y_start = stride*y
            y_end = stride*y + pool
            Activation_prev_layer_slice_pool = Activation_prev_layer_slice[c, x_start:x_end, y_start:y_end]
            if mode=="max":
              z[i, c, x, y] =  torch.max(Activation_prev_layer_slice_pool)
            elif mode=="average":
              z[i, c, x, y] =  torch.mean(Activation_prev_layer_slice_pool)

    cache = (Activation_prev_layer, stride_pool_dict)

    return z


  def fc(self, x, W, b):
    z = torch.add(torch.mm(x, W), b)
    a = f.log_softmax(z, dim=1)

    return a

  def forward_feed(self, x, stride_pad_dict, stride_pool_dict):
    m = x.size()[0]
    x = f.relu(self._conv_forward(x, self._filters, self.biases, stride_pad_dict))
    x = self._pool_forward(x, stride_pool_dict)
    x = x.view(m, -1)
    x = self.fc(x, self.W1, self.b1)
    x = self.fc(x, self.W2, self.b2)
    return x

Here is how I am running the training loop:

model = ConvNet((1, 3, 3, 8), (1,1,1, 8), 1800,50,10)
stride_pad_dict = {"stride": 1, "pad": 2}
stride_pool_dict = {"stride": 2, "pool":2}

epochs = 10
train = torch.optim.SGD(params=model.parameters(), lr=0.01)

for i in range(epochs):
  for batch_idx, (image, target) in enumerate(mnist_train_loader):
      train.zero_grad()
      pred_y = model.forward_feed(image, stride_pad_dict, stride_pool_dict)
      loss = f.nll_loss(pred_y, target)
      loss.backward()
      train.step()
      print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(i, batch_idx * len(image), len(mnist_train_loader.dataset),100. * batch_idx / len(mnist_train_loader), loss.item()))

Nested loops are expected to be slow.
You could use an im2col approach with matrix multiplications to speed up the manual convolution.
What is your use case for this code?
If it’s demo code to show the underlying operations of conv layers (to students), I wouldn’t care too much about the performance.
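If you do want it faster, a rough sketch of the im2col idea (using torch.nn.functional.unfold, keeping the same (ch, k, k, num) filter layout as in your post, and assuming square filters) could look like this:

import torch
import torch.nn.functional as F

def conv_forward_im2col(x, filters, biases, stride, pad):
    # x: (m, ch, a, b); filters: (ch, k, k, num) as in the post; biases: (1, 1, 1, num)
    m, ch, a, b = x.size()
    k, num = filters.size(1), filters.size(3)
    new_a = (a + 2 * pad - k) // stride + 1
    new_b = (b + 2 * pad - k) // stride + 1

    # im2col: every receptive field becomes one column -> (m, ch*k*k, new_a*new_b)
    cols = F.unfold(x, kernel_size=k, padding=pad, stride=stride)

    # flatten the filters to (num, ch*k*k) so a single matmul covers all windows
    w = filters.permute(3, 0, 1, 2).reshape(num, -1)

    out = w @ cols + biases.reshape(1, num, 1)   # (m, num, new_a*new_b)
    return out.reshape(m, num, new_a, new_b)

This replaces the four nested Python loops with one unfold and one batched matrix multiplication, which is where most of the speedup comes from.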

Yes, it's for demonstration purposes. Thanks for the response. I am new to the community. Is there a way to check out the PyTorch Conv2d implementation in nn.Module?

You can find the native implementation here. Note that, depending on the device and setup, different implementations and backends will be used, as seen here.
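If you just want to verify your manual layer numerically, you could also compare it against the functional API directly. A rough sketch (note that F.conv2d expects weights as (out_channels, in_channels, k, k), so your (ch, k, k, num) filters need a permute):

import torch
import torch.nn.functional as F

x = torch.randn(4, 1, 28, 28)        # a fake MNIST batch
filters = torch.randn(1, 3, 3, 8)    # (ch, k, k, num) layout from the post
biases = torch.randn(1, 1, 1, 8)

# reorder to F.conv2d's (out_channels, in_channels, k, k) weight layout, bias as (out_channels,)
weight = filters.permute(3, 0, 1, 2).contiguous()
bias = biases.reshape(-1)

ref = F.conv2d(x, weight, bias, stride=1, padding=2)

# compare against the manual loop version, e.g.:
# out = model._conv_forward(x, filters, biases, {"stride": 1, "pad": 2})
# print(torch.allclose(out, ref, atol=1e-5))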

Thanks a ton for this :slight_smile:

I think the main reason is that you are running it on the CPU.
To run it on the GPU:

model = model.cuda()

Second, there are a lot of loops, as Peter pointed out.
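Just moving the model is not quite enough for this particular code, though: the input batch and the tensors allocated inside the forward pass (the torch.zeros(...) calls) also have to live on the GPU. A sketch, assuming a CUDA device is available:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
train = torch.optim.SGD(params=model.parameters(), lr=0.01)  # create the optimizer after moving the model

for i in range(epochs):
  for batch_idx, (image, target) in enumerate(mnist_train_loader):
      image, target = image.to(device), target.to(device)  # move the batch to the same device
      train.zero_grad()
      pred_y = model.forward_feed(image, stride_pad_dict, stride_pool_dict)
      loss = f.nll_loss(pred_y, target)
      loss.backward()
      train.step()

# inside _conv_forward / _pool_forward, allocate the outputs on the same device, e.g.
# z = torch.zeros((m, num, new_a, new_b), device=Activation_prev_layer.device)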