Why do I get bad performance when I use DDP?

Here is my code:

    # Imports needed by this snippet (shown here for completeness)
    import os

    import torch
    import torch.distributed as dist
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler

    if opt.dist_gpu:
        rank = int(os.getenv('RANK', 0))
        world_size = int(os.getenv('WORLD_SIZE', 1))
        local_rank = int(os.getenv('LOCAL_RANK', 0))

        print('rank is %d, world_size is %d'%(rank, world_size))

        torch.cuda.set_device(local_rank)
        dist.init_process_group(
            backend="nccl",
            init_method=opt.dist_url,
            world_size=world_size,
            rank=rank,
        )
        dist.barrier()
        opt.gpu_ids = range(world_size)
        print("cfg.gpu_ids:", opt.gpu_ids)
        device = torch.device(f'cuda:{local_rank}' if torch.cuda.is_available() else 'cpu')
    if opt.dist_gpu:
        train_sampler = DistributedSampler(dataset=opencood_train_dataset, shuffle=True)
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, hypes['train_params']['batch_size'], drop_last=True)

        train_loader = DataLoader(opencood_train_dataset,
                                  num_workers=4,
                                  collate_fn=opencood_train_dataset.collate_batch_train,
                                  pin_memory=True,
                                  prefetch_factor=2,
                                  batch_sampler=train_batch_sampler)  

        val_sampler = DistributedSampler(dataset=opencood_validate_dataset, shuffle=False)  
        val_loader = DataLoader(opencood_validate_dataset,
                                batch_size=hypes['train_params']['batch_size'],
                                num_workers=4,
                                collate_fn=opencood_train_dataset.collate_batch_train,  # val loader reuses the train dataset's collate_fn
                                pin_memory=True,
                                drop_last=False,
                                prefetch_factor=2,
                                sampler=val_sampler) 
    if opt.dist_gpu:
        print('distributed training')
        model = model.cuda(local_rank)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], find_unused_parameters=True)

It’s so strange: when I train on a single GPU the performance is normal, but with DDP training the model performs much worse. I don’t know why.
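
One difference between the two runs that I’m aware of: with DistributedSampler every one of the world_size processes draws its own shard of the dataset, so each optimizer step effectively consumes world_size × batch_size samples, and an epoch has roughly 1/world_size as many iterations as the single-GPU run. A minimal sketch to print those numbers, using only variables already defined in the snippet above:

    # With DistributedSampler, each rank reads its own ~1/world_size shard of the data.
    per_gpu_batch = hypes['train_params']['batch_size']
    global_batch = per_gpu_batch * world_size      # samples consumed per optimizer step
    steps_per_epoch = len(train_loader)            # roughly 1/world_size of the single-GPU value
    if rank == 0:
        print(f"per-GPU batch: {per_gpu_batch}, global batch: {global_batch}, "
              f"steps per epoch: {steps_per_epoch}")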
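
To rule out a broken gradient sync, here is a small check that could be run after a few optimizer steps. It is only a sketch: check_replicas_in_sync is an illustrative helper (not part of my training code), and it assumes dist is initialized as above and model is the DDP-wrapped module:

    import torch
    import torch.distributed as dist

    @torch.no_grad()
    def check_replicas_in_sync(model, rank, world_size):
        # Illustrative helper, not part of the training script above.
        # Sum all parameters into one scalar per rank and gather the scalars;
        # after gradient averaging + optimizer.step() they should match across ranks.
        local = torch.cat([p.detach().flatten() for p in model.parameters()]).sum()
        gathered = [torch.zeros_like(local) for _ in range(world_size)]
        dist.all_gather(gathered, local)
        if rank == 0:
            print("parameter checksum per rank:", [g.item() for g in gathered])

Identical values on every rank would at least rule out a broken parameter broadcast or gradient averaging.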