Thanks for the information.
Timing a single iteration can still give biased results.
I’ve enabled cudnn benchmarking in the script, but feel free to disable it:
import torch
import torch.nn.functional as F
import time
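# let cuDNN benchmark the available conv algorithms and cache the fastest one for these shapes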
torch.backends.cudnn.benchmark = True
nb_iter = 1000
# dummy image
img = torch.randn([1, 1, 512, 768])
# average kernel
aveKernel = torch.ones([1, 2, 2]) / 2 ** 2
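# reshape to [out_channels, in_channels/groups, kH, kW], the weight layout conv2d expects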
aveKernel = aveKernel[:, None, :, :]
# one channel CPU
tic = time.time()
for _ in range(nb_iter):
    ave_img1 = F.conv2d(img, aveKernel, padding=1, groups=1)
toc = time.time() - tic
print('CPU conv2d one channel time:', toc)
# one channel GPU
img = img.cuda()
aveKernel = aveKernel.cuda()
# warmup
for _ in range(50):
    out = F.conv2d(img, aveKernel, padding=1, groups=1)
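# block until the warmup kernels have finished so the timer starts from an idle GPU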
torch.cuda.synchronize()
tic = time.time()
for _ in range(nb_iter):
    ave_img1 = F.conv2d(img, aveKernel, padding=1, groups=1)
torch.cuda.synchronize()
toc = time.time() - tic
print('GPU conv2d one channel time:', toc)
# Three channel GPU
img = torch.randn([1, 3, 512, 768])
aveKernel = torch.ones([3, 2, 2]) / 2 ** 2
aveKernel = aveKernel[:, None, :, :]
img = img.cuda()
aveKernel = aveKernel.cuda()
# warmup
for _ in range(50):
    out = F.conv2d(img, aveKernel, padding=1, groups=3)
torch.cuda.synchronize()
tic = time.time()
for _ in range(nb_iter):
    ave_img1 = F.conv2d(img, aveKernel, padding=1, groups=3)
torch.cuda.synchronize()
toc = time.time() - tic
print('GPU conv2d three channel time:', toc)
> CPU conv2d one channel time: 4.251241207122803
> GPU conv2d one channel time: 0.04510688781738281
> GPU conv2d three channel time: 0.032311201095581055
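If you'd rather skip the manual warmup and synchronization bookkeeping, torch.utils.benchmark.Timer handles both for you (it runs a warmup and synchronizes CUDA around the timed region). A minimal sketch for the three-channel case, reusing the same shapes as above:

import torch
import torch.nn.functional as F
import torch.utils.benchmark as benchmark

img = torch.randn([1, 3, 512, 768], device='cuda')
aveKernel = (torch.ones([3, 2, 2]) / 2 ** 2)[:, None, :, :].cuda()

# Timer warms up and synchronizes CUDA internally, so no manual
# torch.cuda.synchronize() calls are needed around the measurement
t = benchmark.Timer(
    stmt='F.conv2d(img, aveKernel, padding=1, groups=3)',
    globals={'F': F, 'img': img, 'aveKernel': aveKernel},
)
print(t.timeit(1000))  # mean time per iteration

It also reports some basic statistics, which makes comparing the CPU and GPU runs a bit more robust than a single wall-clock total.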