Duplicate GPU detected : rank 1 and rank 0 both on CUDA device 1a000 when calling dist.all_gather_object on CPU dictionaries

I am running multi-GPU validation code in which each GPU validates a different subset of the data. Each process ends up with a dictionary of results on the CPU, and I want the rank 0 process to combine these dictionaries and write them to disk.
I cannot do this because it fails with "Duplicate GPU detected". I only get the error when I try to gather the CPU dicts; if I remove that line, everything works. I see the same error even if I do the gather while the data is still on the GPU.
My metric logger also uses dist.reduce(), but on the tensor itself, and it has no issues (a simplified sketch of what it does is below).
How do I combine the dictionaries? I am using 2 GPUs in this example.
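
For reference, the metric reduction that works does roughly this (a simplified sketch, not the actual MetricLogger code; the function and variable names here are made up):

import torch
import torch.distributed as dist

def reduce_metric(running_loss: float, device: torch.device) -> torch.Tensor:
    # each rank holds its metric as a tensor on its own GPU and reduces it onto rank 0
    loss_tensor = torch.tensor(running_loss, device=device)
    dist.reduce(loss_tensor, dst=0, op=dist.ReduceOp.SUM)
    return loss_tensor
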
Please find below some of the code:

def validate_dataset(
    model,
    data_loader,
    device,
    results_file,
    print_freq,
    cats,
    rank,
    world_size,
    output_dir,
):
    metric_logger = utils.MetricLogger(delimiter="  ")
    len_dataset = len(data_loader)
    first = True

    with torch.no_grad():
        model.eval()

        for idx, (images, targets, metadata) in enumerate(
            metric_logger.log_every(data_loader, print_freq, "Test")
        ):
            images = images.to(device)
            batch_size = len(images)

            # filter down to just what the loss needs
            limited_targets = [
                {
                    k: v.to(device)
                    for k, v in t.items()
                    if k
                    in (
                        "is_unknown",
                        "labels",
                        "class_targets",
                        "boxes",
                        "focus_scores",
                    )
                }
                for t in targets
            ]

            _, detections = model(images, limited_targets)

            coco_detections = utils.post_process_coco(detections, metadata)
            combined_detections = [None for _ in range(world_size)]
            torch.distributed.all_gather_object(combined_detections, coco_detections)

            if rank == 0:
                for coco_det in coco_detections:
                    if not first:
                        results_file.write(',')
                    else:
                        first = False

                    # Convert the data to a JSON string
                    json_str = json.dumps(coco_det)
                    # Write the JSON string to the file
                    results_file.write(json_str)

            dist.barrier()

If I comment out these two lines:

# combined_detections = [None for _ in range(world_size)]
# torch.distributed.all_gather_object(combined_detections, coco_detections)

it works fine.
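
To make the pattern concrete, a minimal standalone version of what I am trying to do looks roughly like this (a simplified sketch; the dict contents and the port are made up, the real code is above):

# minimal sketch of the failing pattern: gather plain CPU dicts across ranks
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

    # each rank builds a list of plain CPU dicts, like the per-batch COCO detections
    local_results = [{"rank": rank, "score": 0.5 * rank}]

    gathered = [None for _ in range(world_size)]
    dist.all_gather_object(gathered, local_results)  # the call that fails in my full code

    if rank == 0:
        print(gathered)
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)
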

The parts of the code where distributed training is set up are here:

    # handle args
    args, unknown = parser.parse_known_args(argv)
    if len(unknown) > 0:
        logger.info("====== unused args ======\n")
        for x in unknown:
            logger.info(x)
        logger.info("\n====== unused args ======")

    # get gpu world details
    logger.info("Num GPUS Available: {}", torch.cuda.device_count())
    logger.info(
        "GPUS Available: {}",
        [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
    )
    world_size = len(args.gpus)

    if world_size > 1:
        # multi-process runs can be pretty annoying to debug, so only spawn when needed
        torch.multiprocessing.spawn(
            main,
            args=(args, world_size),
            nprocs=world_size,
        )
    else:
        return main(0, args, world_size)


def main(rank, args, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(args.master_port)
    torch.distributed.init_process_group(
        backend="nccl", rank=rank, world_size=world_size
    )
    device = args.gpus[rank]

    log_glob = setup_logging(args.output_dir, job_type="test")
    if rank == 0:
        mkdir(args.output_dir)
        # write config to output
        config_file = render_config(args, os.path.join(args.output_dir, "test.ini"))

    # use fixed seed for all workers
    fixed_seed(42)

    ...............

    model.to(device)

    # setup ddp
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[device],
        output_device=device,
        static_graph=True,
        gradient_as_bucket_view=True,
    )

    # load the model
    checkpoint = load_weights(args.model_weights)
    model = load_model_weights(model, checkpoint["model"])

    # run evaluation
    results_file = None
    if rank == 0:
        results_file = open("results.json", 'w')
        results_file.write('[')

    validate_dataset(
        model,
        data_loader,
        device,
        results_file,
        rank=rank,
        world_size=world_size,
        output_dir=args.output_dir,
        print_freq=args.print_freq,
        cats=cats,
    )
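
For completeness, the relevant command-line arguments are roughly these (an approximate sketch; the real parser has more options and different defaults), so args.gpus is a list of device indices and device = args.gpus[rank] is just an int:

# approximate sketch of the arguments referenced above (defaults are made up)
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--gpus", type=int, nargs="+", default=[0, 1],
                    help="CUDA device indices, one per spawned process")
parser.add_argument("--master_port", type=int, default=29500)
parser.add_argument("--output_dir", type=str, default="./output")
parser.add_argument("--print_freq", type=int, default=50)
parser.add_argument("--model_weights", type=str, default="weights.pth")
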

def post_process_coco(detections, metadata):
    coco_dets = []

    for image_dets, image_meta in zip(detections, metadata):
        for box_idx in range(len(image_dets["boxes"])):
            curr_meta = image_meta[-1].frames[-1]
            coco_dets.append({
                "image_id": curr_meta.image_id,
                "bbox": image_dets["boxes"][box_idx].cpu().numpy().tolist(),
                "score": image_dets["scores"][box_idx].cpu().numpy().tolist(),
                "img_filename": curr_meta.file_name,
                "raw_img_filename": curr_meta.raw_file_name,
                "classification": image_dets["labels"][box_idx].cpu().numpy().tolist(),
                "category": image_dets["labels"][box_idx].cpu().numpy().tolist(),
                "sharpness": image_dets["sharpness"][box_idx].cpu().numpy().tolist(),
            })

    return coco_dets
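
For clarity, with 2 GPUs I expect combined_detections after the gather to be a list with one list of these dicts per rank, and what I want rank 0 to end up doing is essentially this (a sketch of the intent using a hypothetical helper, not code from my script):

import json


def write_gathered_detections(combined_detections, results_file, first=True):
    # combined_detections: output of all_gather_object, i.e. one list of COCO-style dicts per rank
    for per_rank_dets in combined_detections:
        for coco_det in per_rank_dets:
            if not first:
                results_file.write(',')
            first = False
            results_file.write(json.dumps(coco_det))
    return first  # so the caller can keep tracking whether a comma is needed
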

Thank you in advance!