Got NaN loss when using dist.send and dist.recv

I have been trying to use a TCP connection and mp.spawn to run model parallelism across two GPUs. But after a few batches (about 4) on CIFAR10, the outputs of the two sub-models become NaN. Why does this happen? When I train on a single GPU, everything is fine.
Code for send and recv:

import torch
import torch.distributed as dist
from torch import autograd


class RemoteSend(autograd.Function):
    @staticmethod
    def forward(ctx, input: torch.Tensor, to_rank: int, from_rank: int, self_rank: int):
        # Send dim, then size, then the tensor itself, so the receiving rank
        # knows how large a buffer to allocate before the final recv.
        dist.send((torch.tensor(input.dim()) * torch.tensor(1.0)).to(self_rank), to_rank)
        dist.send((torch.tensor(input.size()) * torch.tensor(1.0)).to(self_rank), to_rank)
        print("forward send size", torch.tensor(input.size()) * 1.0)
        dist.send(input, to_rank)
        to_rank = torch.tensor(to_rank)
        from_rank = torch.tensor(from_rank)
        ctx.save_for_backward(to_rank, from_rank)
        return input

    @staticmethod
    def backward(ctx, grad_output):
        to_rank, from_rank = ctx.saved_tensors
        # Block until the downstream rank sends back the gradient for the
        # activation that was sent in forward, then pass it upstream.
        dist.recv(grad_output, int(from_rank))
        # One gradient (or None) per forward argument.
        return grad_output, None, None, None


class RemoteReceive(autograd.Function):
    @staticmethod
    def forward(ctx, to_rank: int = 0, from_rank: int = 1, self_rank: int = 0):
        # Receive dim, then size, then allocate a matching buffer and
        # receive the actual activation tensor into it.
        dim = torch.tensor(1.0).to(self_rank)
        dist.recv(dim, from_rank)
        size = torch.rand(int(dim)).to(self_rank)
        dist.recv(size, from_rank)
        x = torch.zeros(tuple(size.int())).to(self_rank)
        dist.recv(x, from_rank)
        to_rank = torch.tensor(to_rank)
        from_rank = torch.tensor(from_rank)
        ctx.save_for_backward(to_rank, from_rank)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        to_rank, from_rank = ctx.saved_tensors
        # Mirror of the forward handshake: send dim, size, then the gradient
        # back to the rank that produced the activation.
        dist.send(torch.tensor(grad_output.dim() * 1.0), int(to_rank))
        dist.send(torch.tensor(grad_output.size()) * 1.0, int(to_rank))
        dist.send(grad_output, int(to_rank))
        # No gradients for the three int arguments of forward.
        return None, None, None

When applied, these functions send tensors to the other GPU in the forward pass and receive the corresponding gradients in the backward pass.
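In case it helps to see what the three sends/recvs are doing, here is the same dim -> size -> data handshake in isolation, as a minimal CPU-only sketch. The gloo backend, address/port, and tensor shape here are arbitrary choices just for the sketch:

# Minimal, CPU-only sketch of the dim -> size -> data handshake used by
# RemoteSend/RemoteReceive (gloo backend so it runs without GPUs; the
# address/port and the payload shape are arbitrary).
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def handshake(rank, world_size):
    dist.init_process_group(backend='gloo', init_method='tcp://127.0.0.1:29501',
                            world_size=world_size, rank=rank)
    if rank == 0:
        payload = torch.randn(16, 32, 4, 4)
        dist.send(torch.tensor([float(payload.dim())]), 1)               # 1) number of dims
        dist.send(torch.tensor(payload.size(), dtype=torch.float32), 1)  # 2) the shape
        dist.send(payload, 1)                                            # 3) the data itself
    else:
        dim = torch.zeros(1)
        dist.recv(dim, 0)
        size = torch.zeros(int(dim.item()))
        dist.recv(size, 0)
        buf = torch.zeros(*size.long().tolist())                         # allocate matching buffer
        dist.recv(buf, 0)
        print("received tensor of shape", tuple(buf.shape))
    dist.destroy_process_group()


if __name__ == '__main__':
    mp.spawn(handshake, args=(2,), nprocs=2)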
The training code is below.

import torch
import torch.nn as nn
import torch.distributed as dist
import torchvision
import torchvision.transforms as transforms


def main_worker(gpu, world_size):
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:1224',
                            world_size=2, rank=gpu)
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=16,
                                              shuffle=True, num_workers=12)

    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=16,
                                             shuffle=False, num_workers=12)
    model = torchvision.models.MobileNetV2(num_classes=10)
    if gpu == 0:
        # Rank 0 runs the first part of the model and ships its activations to rank 1.
        model1 = model.features[0:5]
        criterion = nn.CrossEntropyLoss().cuda(0)
        optimizer1 = torch.optim.SGD(model1.parameters(), lr=0.1, weight_decay=1e-4)
        model1.cuda(0)
        for i, (images, target) in enumerate(trainloader):
            images = images.cuda(0, non_blocking=True)
            output = model1(images)
            RemoteSend.apply(output, 1, 1, 0)
            optimizer1.zero_grad()
            output.backward(output)
            optimizer1.step()
    if gpu == 1:
        # Rank 1 receives the activations, runs the rest of the model, and computes the loss.
        # Reshape(-1, 1280) is a custom module that flattens the feature map for the classifier.
        model2 = nn.Sequential(model.features[5:], Reshape(-1, 1280), model.classifier)
        criterion = nn.CrossEntropyLoss().cuda(1)
        optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.1, weight_decay=1e-4)
        model2.cuda(1)
        for i, (images, target) in enumerate(trainloader):
            target = target.cuda(1, non_blocking=True)
            input = RemoteReceive.apply(0, 0, 1)
            input = input.cuda(1)
            output = model2(input)
            loss = criterion(output, target)
            optimizer2.zero_grad()
            loss.backward()
            optimizer2.step()
            print(loss)

I divided MobileNetV2 into two parts.
For the first 4 batches the model gets normal results; after that, the loss and the model output become NaN.
Here are the settings:
Dataset: CIFAR10
lr: 0.1
batch size: 512
image size: 32*32
I don't believe these settings are the cause of the NaN, since training with them on a single GPU works fine.
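(In case it is useful for debugging, a small finiteness check like the sketch below, called right after the forward pass in either loop, is one way to find the first batch where the activations blow up. torch.isfinite and anomaly detection are standard PyTorch utilities; the helper name here is just an example.)

# Example helper for locating the first non-finite tensor.
import torch

torch.autograd.set_detect_anomaly(True)  # reports the op that produced a NaN/Inf gradient


def check_finite(name, tensor, step):
    # Print a warning as soon as a tensor contains NaN or Inf values.
    if not torch.isfinite(tensor).all():
        print(f"{name} became non-finite at batch {step}")
        return False
    return True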

Thanks for posting @Maxwell_Albert. Looking into your problem, I couldn't find anything suspicious yet, but I think this might be related to how the backward pass runs across the two GPUs.

Since you want to do model parallelism across two GPUs with the model split in two, maybe you want to consider using RPC? RPC and Distributed Autograd hook up the autograd graph across processes implicitly, without the user writing custom send/recv functions, which might resolve the NaN loss issue you are seeing. Getting Started with Distributed RPC Framework — PyTorch Tutorials 1.10.1+cu102 documentation
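A rough, untested sketch of that layout, with tiny stand-in layers instead of your actual MobileNetV2 split and arbitrary worker names, address, and port, might look like this:

# Rough sketch of RPC-based model parallelism. Assumptions: two processes
# named "worker0"/"worker1", stand-in nn.Linear layers instead of the real
# model halves, arbitrary master address/port, CPU tensors for simplicity.
import os
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed.autograd as dist_autograd
import torch.distributed.rpc as rpc
from torch.distributed.optim import DistributedOptimizer
from torch.distributed.rpc import RRef


class Part2(nn.Module):
    # Stand-in for features[5:] + Reshape + classifier; it is constructed on
    # worker1 and accessed from worker0 through an RRef.
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(1280, 10)

    def forward(self, x):
        return self.net(x)

    def parameter_rrefs(self):
        return [RRef(p) for p in self.parameters()]


def run_trainer():
    part1 = nn.Linear(32, 1280)                       # stand-in for features[0:5]
    part2_rref = rpc.remote("worker1", Part2)         # remote half of the model
    param_rrefs = [RRef(p) for p in part1.parameters()]
    param_rrefs += part2_rref.rpc_sync().parameter_rrefs()
    opt = DistributedOptimizer(torch.optim.SGD, param_rrefs, lr=0.1)

    for step in range(4):                             # toy data instead of CIFAR10
        images = torch.randn(16, 32)
        target = torch.randint(0, 10, (16,))
        with dist_autograd.context() as ctx:
            mid = part1(images)
            out = part2_rref.rpc_sync().forward(mid)  # runs on worker1
            loss = nn.functional.cross_entropy(out, target)
            dist_autograd.backward(ctx, [loss])       # backward spans both workers
            opt.step(ctx)
            print(step, loss.item())


def run(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    if rank == 0:
        run_trainer()                                 # worker1 only serves RPCs
    rpc.shutdown()                                    # blocks until all workers are done


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)

The key part is the dist_autograd.context() block: inside it, RPC records the cross-process send/recv pairs, so dist_autograd.backward can route gradients back to worker1 and DistributedOptimizer can update the parameters of both halves.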

Thank you, I found my problem. The reason is that I passed the output itself as the gradient argument to output.backward().
And thanks for the suggestion, but RPC is memory-costly: it copies all parts of the model, so it is not pure model parallelism.