DDP with RuntimeError: Detected mismatch between collectives on ranks

Hi, I have a problem running my model with DDP on 6 GPUs. It fails with RuntimeError: Detected mismatch between collectives on ranks. Rank 4 is running inconsistent collective:
To get a more detailed report I run with TORCH_DISTRIBUTED_DEBUG="DETAIL".
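For reference, a minimal sketch of how I enable the flag (I set the variable in the environment of the launch; setting it in Python before the process group is created, as below, should be equivalent, since the variable is read when the process group is built):

import os
import torch.distributed as dist

# enable per-collective fingerprint checks (the source of the
# "CollectiveFingerPrint" lines in the error below); set it before
# init_process_group so the debug wrapper is installed when the
# process group is created
os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
dist.init_process_group(backend="nccl")

With that enabled, the full error output is: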

INFO:root:Reducer buckets have been rebuilt in this iteration.
Traceback (most recent call last):
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
    train(model,
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
    train(model,
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
    epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
    grad_scaler.scale(loss).backward()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
    epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 188, in train_epoch
    dist.barrier()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
    train(model,
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
    epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
    grad_scaler.scale(loss).backward()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
    train(model,
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
    epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
    grad_scaler.scale(loss).backward()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
    train(model,
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
    epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
    grad_scaler.scale(loss).backward()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
    train(model,
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
    epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
  File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
    grad_scaler.scale(loss).backward()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    work = default_pg.barrier(opts=opts)
RuntimeError: Detected mismatch between collectives on ranks. Rank 1 is running inconsistent collective: CollectiveFingerPrint(OpType=BARRIER
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 2 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 4 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 3 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 0 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 5 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1724344) of binary: /home/dngusdnr1/anaconda3/envs/se3_113/bin/python
Traceback (most recent call last):
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/run.py", line 728, in <module>
    main()
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
    return f(*args, **kwargs)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/run.py", line 724, in main
    run(args)
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/run.py", line 715, in run
    elastic_launch(
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
se3_transformer/runtime/tmp_work.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2022-12-16_13:37:41
  host      : nova005.seoklab.org
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 1724345)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2022-12-16_13:37:41
  host      : nova005.seoklab.org
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 1724346)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2022-12-16_13:37:41
  host      : nova005.seoklab.org
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 1724347)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
  time      : 2022-12-16_13:37:41
  host      : nova005.seoklab.org
  rank      : 4 (local_rank: 4)
  exitcode  : 1 (pid: 1724348)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[5]:
  time      : 2022-12-16_13:37:41
  host      : nova005.seoklab.org
  rank      : 5 (local_rank: 5)
  exitcode  : 1 (pid: 1724349)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2022-12-16_13:37:41
  host      : nova005.seoklab.org
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1724344)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Only rank 1 is in dist.barrier(), while the other ranks are in grad_scaler.scale(loss).backward() in the train_epoch function.

My code is structured like this:

def train_epoch(...):
    epoch_loss = initialize_epoch_loss()
    loss_fn=HU_Loss(se3_config.loss)
    print ("Lentgh of dataloader!",get_local_rank(),len(train_dataloader)) # check all local_rank have equal number of batch
    for i, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), unit='batch',
                         desc=f'Epoch {epoch_idx}', disable=(True or local_rank != 0)):
        node_dat,dist_dat = to_cuda(batch[:2])
        #t1=time.time()
        with torch.no_grad():
            test=gen_graph(node_dat,dist_dat)
        for callback in callbacks:
            callback.on_batch_start()
        local_rank=get_local_rank()
        dist.barrier()
        print(f'Going {local_rank} {i} ')  # check that each process is working on the same step
        with torch.cuda.amp.autocast(enabled=args.amp):
            pred = model(...)
            loss,loss_dic=loss_fn(...)
            loss = loss/args.accumulate_grad_batches
        grad_scaler.scale(loss).backward()  # the other ranks are here when the error is raised
        epoch_loss=update_epoch_loss(epoch_loss,loss_dic)
        # gradient accumulation
        if (i + 1) % args.accumulate_grad_batches == 0 or (i + 1) == len(train_dataloader):
            if args.gradient_clip:
                grad_scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
        grad_scaler.step(optimizer)
        grad_scaler.update()
        optimizer.zero_grad()
        dist.barrier()  # this is the barrier mentioned in the error log, which rank 1 is in when the error is raised
    epoch_loss=finalize_epoch_loss(epoch_loss)
    return epoch_loss

When I print the length of the dataloader for each local_rank, I can confirm that every local_rank has the same number of batches:

Length of dataloader! 1 5285
Length of dataloader! 0 5285
Length of dataloader! 5 5285
Length of dataloader! 2 5285
Length of dataloader! 3 5285
Length of dataloader! 4 5285
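
The same check can also be done collectively, so that every rank compares its value against all the others in one place; a minimal sketch (check_dataloader_lengths is a hypothetical helper, not part of my script, and assumes the default process group is already initialized):

import torch.distributed as dist

def check_dataloader_lengths(train_dataloader):
    # gather every rank's dataloader length onto all ranks and compare
    world_size = dist.get_world_size()
    lengths = [None] * world_size
    dist.all_gather_object(lengths, len(train_dataloader))
    if dist.get_rank() == 0:
        print("dataloader lengths per rank:", lengths)
    assert len(set(lengths)) == 1, f"ranks disagree: {lengths}"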

And there is no other error message, such as a NaN loss.
Where should I start to solve this problem?

Hey @Hyeonuk_Woo, does your model/program launch any collective communications on its own (apart from the ones launched automatically by DDP)? DDP should have already normalized all of its communications, so this error seems to suggest that there are other comms launched outside of DDP.
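
For example, something shaped like the code below would produce exactly the BARRIER vs ALLREDUCE mismatch in your log: on a given iteration some ranks issue an extra collective while the rest go straight into backward(), where DDP launches its own ALLREDUCE on the gradient buckets. This is a made-up sketch with hypothetical names, not your code:

import torch
import torch.distributed as dist

# "skip" stands for any per-rank, data-dependent condition
def problematic_step(loss, skip):
    if skip:                     # condition can differ between ranks
        dist.barrier()           # only some ranks issue this BARRIER ...
        return
    loss.backward()              # ... while the rest reach DDP's ALLREDUCE
                                 # on the gradient buckets inside backward()

# a safe version agrees on the decision first, so every rank issues the
# same sequence of collectives
def safe_step(loss, skip):
    flag = torch.tensor([1.0 if skip else 0.0], device=loss.device)
    dist.all_reduce(flag, op=dist.ReduceOp.MAX)  # every rank sees the same value
    if flag.item() > 0:
        return                   # every rank skips together
    loss.backward()

Anything of this shape (an extra dist.* call inside the loss function or gen_graph, or a continue/exception path that lets one rank skip backward() or reach the trailing dist.barrier() with a different collective history) would trip the DETAIL-mode check, so I would start by searching the model and loss code for collective calls that DDP does not know about.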