Hi all, currently I’m trying to implement Group_Convolution by dividing one Convolution into 4 sub-parts and send them to 4 different GPUs to reduce the inference time of the model. I use multiple threading to expect them to run concurrently. However, when I check the operations inside GPU by nvidia-smi command, the data still transfers from GPU0 to GPU 1,2,3 and implement sequentially, not parallel. Can you help me to correct this? Thank you
Here is my code for group convolution:
class Residual(nn.Module):
def __init__(self, in_channels, out_channels, dev0, dev1, dev2, dev3, down_sample = False, decouple = False):
super(Residual, self).__init__()
self.dev0 = dev0
self.dev1 = dev1
self.dev2 = dev2
self.dev3 = dev3
self.down_sample = down_sample
self.decouple = decouple
# Try testing with hardcode for threading case (test for conv(258, 512))
self.y0 = torch.zeros((100, 128, 7, 7), device = self.dev0)
self.y1 = torch.zeros((100, 128, 7, 7), device = self.dev1)
self.y2 = torch.zeros((100, 128, 7, 7), device = self.dev2)
self.y3 = torch.zeros((100, 128, 7, 7), device = self.dev3)
# End hardcode (testing purpose)
if (in_channels == out_channels):
if (self.decouple): # Check Grouped Convolution or NOT
self.conv1a = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =1, padding=1, bias = False).to(self.dev0)
self.conv1b = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =1, padding=1, bias = False).to(self.dev1)
self.conv1c = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =1, padding=1, bias = False).to(self.dev2)
self.conv1d = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =1, padding=1, bias = False).to(self.dev3)
else:
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size = 3, stride =1, padding = 1, bias = False).to(self.dev0)
else:
if (self.decouple):
self.conv1a = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =2, padding=1, bias = False).to(self.dev0)
self.conv1b = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =2, padding=1, bias = False).to(self.dev1)
self.conv1c = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =2, padding=1, bias = False).to(self.dev2)
self.conv1d = nn.Conv2d(int(in_channels/4), int(out_channels/4), kernel_size = 3, stride =2, padding=1, bias = False).to(self.dev3)
else:
self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size = 3, stride =2, padding = 1, bias = False).to(self.dev0)
...
def Group_Conv(self, device, in_tensor, out_tensor):
out_tensor = nn.Conv2d(in_channel, out_channel, kernel_size = 3, stride =2, padding =1, bias = False).to(device)
def forward(self, x):
if (self.decouple):
a = torch.chunk(x, 4, dim = 1) # Devide feature maps into 4 sub-part following the channel
# GConv() function for 4 devices concurrently
Thread(target = self.Group_Conv(self.dev0, a[0], self.y0)).start()
Thread(target = self.Group_Conv(self.dev1, a[1], self.y1)).start()
Thread(target = self.Group_Conv(self.dev2, a[2], self.y2)).start()
Thread(target = self.Group_Conv(self.dev3, a[3], self.y3)).start()
out = torch.cat([self.y0, self.y1.to(self.dev0), self.y2.to(self.dev0), self.y3.to(self.dev0)], dim = 1)
else:
out = self.conv1(x)
....