Why does model parallel training on two GPUs cause a deadlock?

I tried to split MobileNetV2 into three parts and train them on two GPUs (one acts as the client, the other as the server).
I use torch.distributed with the NCCL backend, and dist.isend / dist.recv as my communication APIs.
Here is my code (you can run it on your machine):


import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import torch.nn.functional as F
from torchvision.models import mobilenet_v2
class Reshape1(nn.Module):
    def __init__(self):
        super(Reshape1, self).__init__()
        pass

    def forward(self, x):
        out = F.relu(x)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        return out
def main_worker(rank,world_size,args):
    dist.init_process_group(backend="nccl", init_method='tcp://18.25.6.30:9001',
                            world_size=world_size, rank=rank)
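    # rank 0 ("client") runs the first and the last stage, rank 1 ("server") runs the middle stage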
    if rank == 0:
        model = mobilenet_v2(pretrained=True)
        model.classifier[-1] = nn.Linear(1280, 10)
        layer1 = [model.features[0]]
        layer3 = [Reshape1(), model.classifier]
        
        layer1 = nn.Sequential(*layer1)
        layer3 = nn.Sequential(*layer3)
        layer1 = layer1.to(0)
        layer3 = layer3.to(0)
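        # one batch of 32 images, chunked into two micro-batches of 16 below;
        # recv1/recv2 are buffers for the [16, 1280, 7, 7] activations sent back by rank 1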
        input = torch.rand([32,3,224,224]).to(rank).requires_grad_()
        recv1 = torch.zeros([16,1280,7,7]).to(rank).requires_grad_()
        recv2 = torch.zeros([16,1280,7,7]).to(rank).requires_grad_()
        # bandwidth_avg = 0.0
        input = input.chunk(2)
        # recv = recv.chunk(2)
        # for chunk in range(2):
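        # micro-batch loop unrolled by hand: push both chunks to rank 1 first, then wait for its results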
        output = layer1(input[0])
        dist.isend(output,1)
        output = layer1(input[1])
        dist.isend(output,1)
        print("client iter 1")
        dist.recv(recv1,1)
        print("client recv")
        output = layer3(recv1)
        print("cal over")
        dist.recv(recv2,1)
        output = layer3(recv2)

    else:
        model = mobilenet_v2(pretrained=True)
        layer2 = [model.features[1:]]
        layer2 = nn.Sequential(*layer2)
        layer2 = layer2.to(rank)
        # input = torch.rand([32,32,112,112]).to(rank).type(torch.float16)
        recv1 = torch.rand([16,32,112,112]).to(rank).requires_grad_()
        recv2 = torch.rand([16,32,112,112]).to(rank).requires_grad_()
        # input = input.chunk(2)
        # recv = recv.chunk(2)
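        # receive each [16, 32, 112, 112] activation from rank 0, run the middle stage, and send the result back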
        dist.recv(recv1,0)
        output = layer2(recv1)
        dist.isend(output,0)
        dist.recv(recv2,0)
        output = layer2(recv2)
        dist.isend(output,0)
    print(rank)

def main():
    # args = parser.parse_args()
    mp.spawn(main_worker, nprocs=2, args=(2, 2))


if __name__ == '__main__':
    main()

The rank 1 process runs to completion and exits, but the rank 0 process does not.

I can draw a timeline of what happens:
time line --------------------------------------------------------------------------------------------------------->
gpu0:  input[0]->layer1->output(isend) | input[1]->layer1->output(isend)    | recv1(recv)->hangs(x)(:rage:)->layer3
gpu1:                                  | recv1(recv)->layer2->output(isend) | recv2(recv)->layer2->output(isend) over (:smile:)
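To make the pattern easier to reason about, here is a stripped-down sketch of just this send/recv ordering, with the model layers replaced by dummy tensors (the 127.0.0.1:9001 rendezvous address is only a placeholder; adjust it to your setup):

import torch
import torch.multiprocessing as mp
import torch.distributed as dist

def comm_worker(rank, world_size):
    dist.init_process_group(backend="nccl", init_method="tcp://127.0.0.1:9001",
                            world_size=world_size, rank=rank)
    torch.cuda.set_device(rank)
    if rank == 0:
        # same order as in the full script: two isends first, then the two recvs
        # (the isend work handles are dropped, exactly like above)
        for i in range(2):
            act = torch.rand([16, 32, 112, 112], device="cuda")
            dist.isend(act, 1)
            print(f"rank 0: queued send {i}")
        result = torch.zeros([16, 1280, 7, 7], device="cuda")
        for i in range(2):
            dist.recv(result, 1)
            print(f"rank 0: got result {i}")
    else:
        # same order as in the full script: recv -> "compute" -> isend, twice
        act = torch.zeros([16, 32, 112, 112], device="cuda")
        for i in range(2):
            dist.recv(act, 0)
            result = torch.rand([16, 1280, 7, 7], device="cuda")
            dist.isend(result, 0)
            print(f"rank 1: answered chunk {i}")
    print(f"rank {rank} done")

if __name__ == "__main__":
    mp.spawn(comm_worker, nprocs=2, args=(2,))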