Hello, I am implementing the depthwise convolution used in MobileNet through matrix multiplication.
I need to use the unfold function because of some window-wise operations that come after this implementation.
When I compared the result of nn.functional.conv2d with the result of my implementation, I found a small difference.
Why does this difference occur, and how can I eliminate it?
First, I split the input tensor and the weight tensor along the channel dimension and performed the operations per channel.
For each channel, I unfolded the input tensor and performed a matrix multiplication between each window and the corresponding flattened weight tensor.
Then I folded the single-channel output back into its spatial shape and concatenated the channels into the final output tensor.
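For reference, this is the unfold layout that the per-channel step relies on (a minimal shape check; the sizes here are toy values I picked arbitrarily):

```python
import torch
import torch.nn.functional as F

# One channel of a 5x5 input, unfolded with a 3x3 window and padding 1:
# each column of the result is one flattened 3x3 window.
x = torch.randn(2, 1, 5, 5)                    # [bsz, 1, H, W]
x_unf = F.unfold(x, kernel_size=3, padding=1)  # [bsz, 3*3, L], L = number of windows
print(x_unf.shape)                             # torch.Size([2, 9, 25])
```

The matmul then contracts the 9-element window dimension against the flattened 3x3 kernel, producing one value per window.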
The following is the depthwise convolution function I implemented.
```python
import torch
import torch.nn.functional as F


def depthwise_conv2d_matmul(input, weight, bias=None, stride=1, padding=0, dilation=1):
    bsz, channels, h, w = input.shape
    k_channels, _, k_h, k_w = weight.shape
    assert h == w, 'Input tensor must be square'
    assert channels == k_channels, 'Number of input channels and kernel channels must match'
    assert k_h == k_w, 'Kernel must be square'
    input_size = h
    kernel_size = k_h

    # Split the input tensor and weight tensor along the channel dimension
    input_splits = input.split(1, dim=1)
    weight_splits = weight.split(1, dim=0)

    output_splits = []
    for i in range(channels):
        # Unfold the input, input_unf.shape: torch.Size([bsz, kernel_size*kernel_size, window_size])
        input_unf = F.unfold(input_splits[i], weight_splits[i].shape[-2:],
                             dilation=dilation, padding=padding, stride=stride)

        # Perform depth-wise convolution
        # input_unf.transpose(1, 2) shape: torch.Size([bsz, window_size, kernel_size*kernel_size])
        # weight_splits[i].view(weight_splits[i].shape[0], -1).t() shape: torch.Size([kernel_size*kernel_size, 1])
        # out_unf.shape: torch.Size([bsz, 1, window_size])
        out_unf = input_unf.transpose(1, 2).matmul(
            weight_splits[i].view(weight_splits[i].shape[0], -1).t()
        ).transpose(1, 2)

        # If bias is not None, add bias
        if bias is not None:
            out_unf += bias[i].view(1, -1, 1)

        # Fold the output tensor and add it to the list of output splits
        out_size = (input_size + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1
        combined_out = F.fold(out_unf, out_size, (1, 1))
        output_splits.append(combined_out)

    # Concatenate the output splits along the channel dimension to get the final output
    return torch.cat(output_splits, dim=1)


if __name__ == '__main__':
    input = torch.randn(2, 3, 5, 5)
    weight = torch.randn(3, 1, 3, 3)

    # Perform depth-wise convolution using matrix multiplication
    output = depthwise_conv2d_matmul(input, weight, stride=1, padding=1)

    # Verify the result by comparing with torch.nn.functional.conv2d
    output_builtin = F.conv2d(input, weight, groups=input.shape[1], stride=1, padding=1)

    # Outputs a small number close to 0 if the implementation is correct
    print((output - output_builtin).abs().max())
    # print(output - output_builtin)
```
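My guess (an assumption I have not verified) is that the remaining gap comes from float32 accumulation order, since unfold + matmul sums the kernel products in a different order than conv2d does. One way to check this rather than a logic bug would be to repeat the comparison in double precision; if the implementation is mathematically equivalent, the maximum difference should shrink by many orders of magnitude:

```python
# Sanity check: rerun the comparison in float64.
# Assumes depthwise_conv2d_matmul from above is in scope.
import torch
import torch.nn.functional as F

input64 = torch.randn(2, 3, 5, 5, dtype=torch.float64)
weight64 = torch.randn(3, 1, 3, 3, dtype=torch.float64)

output64 = depthwise_conv2d_matmul(input64, weight64, stride=1, padding=1)
output64_builtin = F.conv2d(input64, weight64, groups=input64.shape[1], stride=1, padding=1)

# Expected: a much smaller residual than in float32 if the only source
# of the difference is floating-point rounding.
print((output64 - output64_builtin).abs().max())
```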