I am measuring the time difference between a normal 2D convolution (nn.Conv2d) with a 3x3 kernel and its decomposed version: a 1x3 convolution followed by a 3x1 convolution. I wrote a simple test (below), but the results look inaccurate and unstable. Could you look at my code and the profiler output and explain why the first convolution often takes so much time? Note that if I call Net2 first, then its first convolution is also very slow. How can I make the measurement reliable? Thanks!
import torch
import torch.nn as nn


class Net1(nn.Module):
    def __init__(self):
        super(Net1, self).__init__()
        # Standard 3x3 convolution
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1, stride=1)

    def forward(self, x):
        x = self.conv1(x)
        return x


class My_Conv(nn.Module):
    def __init__(self):
        super(My_Conv, self).__init__()
        # 3x3 convolution decomposed into a 1x3 followed by a 3x1
        self.conv1_3 = nn.Conv2d(1, 16, kernel_size=(1, 3), padding=(0, 1), stride=1)
        self.conv3_1 = nn.Conv2d(16, 16, kernel_size=(3, 1), padding=(1, 0), stride=1)

    def forward(self, x):
        x = self.conv3_1(self.conv1_3(x))
        return x


class Net2(nn.Module):
    def __init__(self):
        super(Net2, self).__init__()
        self.conv1 = My_Conv()

    def forward(self, x):
        x = self.conv1(x)
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net1 = Net1().to(device)
net2 = Net2().to(device)
input = torch.randn((1, 1, 32, 32), requires_grad=True).to(device)

with torch.autograd.profiler.profile(use_cuda=True) as prof:
    out1 = net1(input)
    out2 = net2(input)
    # out.backward()
print(prof)
And this is the profiler log:
--------------------- --------------- --------------- --------------- --------------- ---------------
Name CPU time CUDA time Calls CPU total CUDA total
--------------------- --------------- --------------- --------------- --------------- ---------------
conv2d 1850.410us 1820.672us 1 1850.410us 1820.672us
convolution 1836.967us 1815.552us 1 1836.967us 1815.552us
_convolution 1827.180us 1809.408us 1 1827.180us 1809.408us
contiguous 4.810us 3.072us 1 4.810us 3.072us
empty 2.631us 2.048us 1 2.631us 2.048us
cudnn_convolution 1798.872us 1794.048us 1 1798.872us 1794.048us
conv2d 71.387us 43.008us 1 71.387us 43.008us
convolution 67.828us 38.912us 1 67.828us 38.912us
_convolution 64.698us 32.768us 1 64.698us 32.768us
contiguous 1.621us 3.072us 1 1.621us 3.072us
empty 2.248us 3.072us 1 2.248us 3.072us
cudnn_convolution 51.926us 16.416us 1 51.926us 16.416us
conv2d 49.707us 47.104us 1 49.707us 47.104us
convolution 46.475us 43.008us 1 46.475us 43.008us
_convolution 43.718us 37.888us 1 43.718us 37.888us
contiguous 1.502us 2.048us 1 1.502us 2.048us
empty 1.982us 2.048us 1 1.982us 2.048us
cudnn_convolution 32.296us 21.504us 1 32.296us 21.504us
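To show what I mean by "reliable", below is a minimal sketch of what I was thinking of trying: a few warm-up forward passes plus torch.cuda.synchronize() around the profiled region. This is only my assumption that the huge first timing comes from one-time CUDA/cuDNN initialization being attributed to the first call; I have not confirmed it.

# Sketch only (my assumption): warm up both nets so that one-time CUDA/cuDNN
# setup cost is not counted against the first measured convolution.
with torch.no_grad():
    for _ in range(10):  # arbitrary number of warm-up iterations
        net1(input)
        net2(input)
if torch.cuda.is_available():
    torch.cuda.synchronize()  # wait for all queued GPU work to finish

with torch.autograd.profiler.profile(use_cuda=True) as prof:
    out1 = net1(input)
    out2 = net2(input)
if torch.cuda.is_available():
    torch.cuda.synchronize()
print(prof)

Would something like this give stable numbers, or is there a more standard way to benchmark these two layers?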