Trouble implementing convolution in parallel on multiple GPUs

Hi all, I am currently trying to implement grouped convolution by dividing one convolution into 4 sub-parts and sending them to 4 different GPUs, in order to reduce the model's inference time. I use multiple threads and expect the sub-convolutions to run concurrently. However, when I check GPU activity with the nvidia-smi command, the data is still transferred from GPU 0 to GPUs 1, 2 and 3 and the convolutions execute sequentially, not in parallel. Can you help me correct this? Thank you.
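To make my intention concrete, this is the pattern I am trying to achieve, written as a standalone sketch (it assumes 4 visible GPUs; the shapes are the same test values I hard-coded in the class below):

import torch
import torch.nn as nn
from threading import Thread

# Standalone sketch: split one convolution into 4 branches, one branch per GPU
devices = [torch.device(f'cuda:{i}') for i in range(4)]
convs = [nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, bias=False).to(d) for d in devices]

x = torch.randn(100, 256, 14, 14, device=devices[0])
chunks = torch.chunk(x, 4, dim=1)  # 4 tensors of shape (100, 64, 14, 14)
outputs = [None] * 4

def run_branch(i):
    # each thread moves its slice to its own GPU and runs that GPU's convolution
    outputs[i] = convs[i](chunks[i].to(devices[i]))

threads = [Thread(target=run_branch, args=(i,)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# gather the 4 partial results back on GPU 0
out = torch.cat([o.to(devices[0]) for o in outputs], dim=1)  # (100, 512, 7, 7)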

Here is my code for the grouped convolution:

import torch
import torch.nn as nn
from threading import Thread

class Residual(nn.Module):
    def __init__(self, in_channels, out_channels, dev0, dev1, dev2, dev3, down_sample=False, decouple=False):
        super(Residual, self).__init__()
        self.dev0 = dev0
        self.dev1 = dev1
        self.dev2 = dev2
        self.dev3 = dev3

        self.down_sample = down_sample
        self.decouple = decouple

        # Hard-coded output buffers for the threading test (for the conv(256, 512) case)
        self.y0 = torch.zeros((100, 128, 7, 7), device=self.dev0)
        self.y1 = torch.zeros((100, 128, 7, 7), device=self.dev1)
        self.y2 = torch.zeros((100, 128, 7, 7), device=self.dev2)
        self.y3 = torch.zeros((100, 128, 7, 7), device=self.dev3)
        # End of hard-coded test buffers

        if in_channels == out_channels:
            if self.decouple:  # grouped convolution: one quarter of the channels per GPU
                self.conv1a = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev0)
                self.conv1b = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev1)
                self.conv1c = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev2)
                self.conv1d = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev3)
            else:
                self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev0)
        else:
            if self.decouple:
                self.conv1a = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev0)
                self.conv1b = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev1)
                self.conv1c = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev2)
                self.conv1d = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev3)
            else:
                self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev0)
                                            ...
    def Group_Conv(self, conv, in_tensor, out_tensor):
        # Run this branch's convolution on its own GPU and write into the pre-allocated buffer
        out_tensor.copy_(conv(in_tensor.to(conv.weight.device)))

    def forward(self, x):
        if self.decouple:
            a = torch.chunk(x, 4, dim=1)  # divide the feature maps into 4 sub-parts along the channel dimension
            # run Group_Conv() on the 4 devices concurrently
            threads = [
                Thread(target=self.Group_Conv, args=(self.conv1a, a[0], self.y0)),
                Thread(target=self.Group_Conv, args=(self.conv1b, a[1], self.y1)),
                Thread(target=self.Group_Conv, args=(self.conv1c, a[2], self.y2)),
                Thread(target=self.Group_Conv, args=(self.conv1d, a[3], self.y3)),
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join()  # wait for all branches before gathering the results

            out = torch.cat([self.y0, self.y1.to(self.dev0), self.y2.to(self.dev0), self.y3.to(self.dev0)], dim=1)
              
        else:
            out = self.conv1(x)
                                            ....
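For completeness, this is roughly how I construct and call the block for the conv(256, 512) test case (just my test setup, matching the hard-coded (100, 128, 7, 7) buffers above):

block = Residual(256, 512, 'cuda:0', 'cuda:1', 'cuda:2', 'cuda:3', decouple=True)
x = torch.randn(100, 256, 14, 14, device='cuda:0')
out = block(x)  # expected shape: (100, 512, 7, 7) on cuda:0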