Hi,
I was trying to use streams to speed up calling multiple Conv2d modules on the same GPU.
My code is below.
It doesn’t appear to run any quicker than the sequential version. A previous question, asked last year, was about using streams, and one suggestion there was that all ops should be run on non-default streams. I tried to do this, but my attempts don’t seem to have helped.
Is there any obvious problem with my approach?
Thanks
import torch
import torch.nn as nn
class ParallelDilatedConv(nn.Module):
    """Run several dilated 3x3 convolutions on one input, spread over CUDA streams.

    Convolution ``i`` uses ``dilation = padding = 2**i`` and is issued on
    stream ``i % num_streams``.  The per-convolution outputs are concatenated
    with ``torch.cat`` along dimension 0.

    Args:
        num_dilations: Number of convolutions to create.
        num_streams: Number of CUDA streams to distribute the convolutions
            over.  With ``0`` streams, without CUDA, or for a non-CUDA input,
            the convolutions simply run one after another on the caller's
            stream.
    """

    def __init__(self, num_dilations, num_streams):
        super(ParallelDilatedConv, self).__init__()
        self.m = num_dilations
        # Streams can only be created when CUDA is usable; an empty list makes
        # forward() fall back to plain sequential execution.
        if torch.cuda.is_available():
            self.streams = [torch.cuda.Stream() for _ in range(num_streams)]
        else:
            self.streams = []
        self.module = nn.ModuleList(
            [
                nn.Conv2d(1, 1, (3, 3), dilation=2**i, padding=2**i)
                for i in range(num_dilations)
            ]
        )

    def forward(self, input):
        """Apply every convolution to ``input`` and concatenate the results.

        Args:
            input: Tensor accepted by ``nn.Conv2d(1, 1, ...)``.

        Returns:
            ``torch.cat`` of the ``self.m`` convolution outputs (dimension 0).
        """
        convs = [self.module[i] for i in range(self.m)]

        # Nothing to overlap: no streams were created or the data is not on a GPU.
        if not self.streams or not input.is_cuda:
            return torch.cat([conv(input) for conv in convs])

        caller_stream = torch.cuda.current_stream(input.device)

        # Side streams must not start before the work that produces `input`
        # (queued on the caller's stream) has finished.
        for stream in self.streams:
            stream.wait_stream(caller_stream)

        res = []
        for i, conv in enumerate(convs):
            with torch.cuda.stream(self.streams[i % len(self.streams)]):
                res.append(conv(input))

        # The concatenation runs on the caller's stream, so it has to wait for
        # every side stream; otherwise it may read half-written outputs.
        for stream in self.streams:
            caller_stream.wait_stream(stream)

        return torch.cat(res)
class DilatedConv(nn.Module):
    """Sequential baseline: a bank of dilated 3x3 convolutions on one input.

    Convolution ``k`` is ``nn.Conv2d(1, 1, (3, 3))`` with
    ``dilation = padding = 2**k``; the outputs are joined with ``torch.cat``.

    Args:
        num_dilations: How many convolutions the bank holds.
    """

    def __init__(self, num_dilations):
        super(DilatedConv, self).__init__()
        self.m = num_dilations
        layers = []
        for k in range(self.m):
            rate = 2**k
            layers.append(nn.Conv2d(1, 1, (3, 3), dilation=rate, padding=rate))
        self.module = nn.ModuleList(layers)

    def forward(self, input):
        """Return the concatenation of every convolution applied to ``input``."""
        outputs = [self.module[k](input) for k in range(self.m)]
        return torch.cat(outputs)
def time_loop(mod, num_iter, outstr, inp=None):
    """Time ``num_iter`` forward passes of ``mod`` and print the result.

    Each iteration calls ``mod(inp)`` and copies the output to the CPU with
    ``.cpu()``.

    Args:
        mod: Callable whose result has a ``.cpu()`` method.
        num_iter: Number of forward passes to time.
        outstr: Format string with one ``{}`` placeholder for the elapsed
            seconds.
        inp: Input passed to ``mod``.  When omitted, the module-level global
            ``im`` is used.

    Returns:
        The elapsed wall-clock time in seconds.
    """
    # Imported here so the function also works when this file is imported
    # rather than run as a script.
    import time

    if inp is None:
        inp = im  # script-level benchmark tensor

    # Finish any GPU work queued earlier so it is not charged to this run.
    if getattr(inp, "is_cuda", False):
        torch.cuda.synchronize(inp.device)

    start = time.perf_counter()
    for _ in range(num_iter):
        mod(inp).cpu()
    elapsed = time.perf_counter() - start

    print(outstr.format(elapsed))
    return elapsed
if __name__ == '__main__':
    # time_loop may read the module-level names `time` and `im`, so both are
    # bound here at script level.
    import time

    num_iter = 10
    num_conv = 6
    device = 'cuda:0'
    num_streams = 6

    # Run the whole benchmark on a non-default stream.
    # NOTE(review): the pasted source lost its indentation; this assumes every
    # statement after `with` was meant to be inside it — confirm.
    s = torch.cuda.Stream()
    with torch.cuda.stream(s):
        im = torch.rand(1000, 1, 200, 200, device=device)

        # `.share_memory()` was dropped: it is a no-op for CUDA tensors.
        mod = DilatedConv(num_conv).to(device)
        # Warm-up pass so one-time CUDA/cuDNN initialisation is not timed.
        mod(im)
        torch.cuda.synchronize()
        time_loop(mod, num_iter, 'Sequential took {}')

        mod = ParallelDilatedConv(num_conv, num_streams).to(device)
        mod(im)
        torch.cuda.synchronize()
        time_loop(mod, num_iter, 'Parallel took {}')