Hi, I have a problem running my model with DDP on 6 GPUs. Training fails with RuntimeError: Detected mismatch between collectives on ranks. Rank 4 is running inconsistent collective:
The full error message (with TORCH_DISTRIBUTED_DEBUG=DETAIL) is below.
INFO:root:Reducer buckets have been rebuilt in this iteration.
Traceback (most recent call last):
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
Traceback (most recent call last):
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 307, in <module>
train(model,
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
train(model,
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
grad_scaler.scale(loss).backward()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 188, in train_epoch
dist.barrier()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
train(model,
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
grad_scaler.scale(loss).backward()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
train(model,
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
grad_scaler.scale(loss).backward()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
train(model,
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
grad_scaler.scale(loss).backward()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
train(model,
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 229, in train
epoch_loss=train_epoch(model,train_dataloader, loss_fn, epoch_idx, grad_scaler, optimizer, local_rank, callbacks, args)
File "/home/dngusdnr1/lib/thanks_seeun/graph_gen_on_gpu/se3_transformer/runtime/tmp_work.py", line 178, in train_epoch
grad_scaler.scale(loss).backward()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/_tensor.py", line 363, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
work = default_pg.barrier(opts=opts)
RuntimeError: Detected mismatch between collectives on ranks. Rank 1 is running inconsistent collective: CollectiveFingerPrint(OpType=BARRIER
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 2 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 4 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 3 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 0 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Detected mismatch between collectives on ranks. Rank 5 is running inconsistent collective: CollectiveFingerPrint(OpType=ALLREDUCE, TensorShape=[284972], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1724344) of binary: /home/dngusdnr1/anaconda3/envs/se3_113/bin/python
Traceback (most recent call last):
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/run.py", line 728, in <module>
main()
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/run.py", line 724, in main
run(args)
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/run.py", line 715, in run
elastic_launch(
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/dngusdnr1/anaconda3/envs/se3_113/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
se3_transformer/runtime/tmp_work.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2022-12-16_13:37:41
host : nova005.seoklab.org
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1724345)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2022-12-16_13:37:41
host : nova005.seoklab.org
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 1724346)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2022-12-16_13:37:41
host : nova005.seoklab.org
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 1724347)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
time : 2022-12-16_13:37:41
host : nova005.seoklab.org
rank : 4 (local_rank: 4)
exitcode : 1 (pid: 1724348)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[5]:
time : 2022-12-16_13:37:41
host : nova005.seoklab.org
rank : 5 (local_rank: 5)
exitcode : 1 (pid: 1724349)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2022-12-16_13:37:41
host : nova005.seoklab.org
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1724344)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Only rank 1 is in dist.barrier(); all the other ranks are in grad_scaler.scale(loss).backward(), both inside the train_epoch function.
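If I understand the DETAIL checker correctly, it compares the collective that every rank is about to issue at the same position in the sequence, so a toy script like the one below (not my real code, just my understanding of the check; gloo backend only so it runs without GPUs) should fail with the same kind of message when launched with TORCH_DISTRIBUTED_DEBUG=DETAIL and torchrun --nproc_per_node=6:

# toy_mismatch.py -- deliberately issue different collectives on different ranks
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")  # gloo just so this toy runs without GPUs
rank = dist.get_rank()
tensor = torch.ones(4)

if rank == 1:
    dist.barrier()           # rank 1 issues a BARRIER ...
else:
    dist.all_reduce(tensor)  # ... while every other rank issues an ALLREDUCE

# With TORCH_DISTRIBUTED_DEBUG=DETAIL this should abort with
# "Detected mismatch between collectives on ranks", like in my log.

So it looks like my ranks have somehow drifted apart in their collective sequence during training.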
My training loop is structured like this:
def train_epoch(...):
    epoch_loss = initialize_epoch_loss()
    loss_fn = HU_Loss(se3_config.loss)
    print("Lentgh of dataloader!", get_local_rank(), len(train_dataloader))  # check that every local_rank has the same number of batches
    for i, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), unit='batch',
                         desc=f'Epoch {epoch_idx}', disable=(True or local_rank != 0)):
        node_dat, dist_dat = to_cuda(batch[:2])
        #t1=time.time()
        with torch.no_grad():
            test = gen_graph(node_dat, dist_dat)
        for callback in callbacks:
            callback.on_batch_start()
        local_rank = get_local_rank()
        dist.barrier()
        print(f'Going {local_rank} {i} ')  # check that each process is working on the same step
        with torch.cuda.amp.autocast(enabled=args.amp):
            pred = model(...)
            loss, loss_dic = loss_fn(...)
            loss = loss / args.accumulate_grad_batches
        grad_scaler.scale(loss).backward()  # the other ranks were here when the error was raised
        epoch_loss = update_epoch_loss(epoch_loss, loss_dic)
        # gradient accumulation
        if (i + 1) % args.accumulate_grad_batches == 0 or (i + 1) == len(train_dataloader):
            if args.gradient_clip:
                grad_scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            grad_scaler.step(optimizer)
            grad_scaler.update()
            optimizer.zero_grad()
            dist.barrier()  # this is the barrier from the error log, the one rank 1 was waiting at
    epoch_loss = finalize_epoch_loss(epoch_loss)
    return epoch_loss
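One thing I am unsure about is the interaction between gradient accumulation and DDP: in my loop every backward() triggers the gradient all-reduce, even on the intermediate accumulation steps. I do not know whether that is related to the mismatch, but for reference this is the no_sync() pattern I was considering instead (a sketch based on my loop above, not tested, and it assumes model is the DDP-wrapped module):

from contextlib import nullcontext

for i, batch in enumerate(train_dataloader):
    is_update_step = (i + 1) % args.accumulate_grad_batches == 0 or (i + 1) == len(train_dataloader)
    # model.no_sync() skips the gradient all-reduce on intermediate micro-batches;
    # only the backward of the update step synchronizes gradients across ranks.
    sync_ctx = nullcontext() if is_update_step else model.no_sync()
    with sync_ctx:
        with torch.cuda.amp.autocast(enabled=args.amp):
            pred = model(...)
            loss, loss_dic = loss_fn(...)
            loss = loss / args.accumulate_grad_batches
        grad_scaler.scale(loss).backward()
    if is_update_step:
        grad_scaler.step(optimizer)
        grad_scaler.update()
        optimizer.zero_grad()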
When I print the length of the dataloader for each local_rank, I can confirm that every rank has the same number of batches:
Lentgh of dataloader! 1 5285
Lentgh of dataloader! 0 5285
Lentgh of dataloader! 5 5285
Lentgh of dataloader! 2 5285
Lentgh of dataloader! 3 5285
Lentgh of dataloader! 4 5285
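Since interleaved prints are easy to misread, I also plan to gather the per-rank lengths explicitly with all_gather_object (a sketch, not in my script yet; it assumes the process group is already initialized):

import torch.distributed as dist

local_len = len(train_dataloader)
all_lens = [None] * dist.get_world_size()
dist.all_gather_object(all_lens, local_len)  # every rank receives every rank's length
if dist.get_rank() == 0:
    print("per-rank dataloader lengths:", all_lens)
assert len(set(all_lens)) == 1, f"dataloader lengths differ across ranks: {all_lens}"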
There is also no other error message, such as a NaN loss.
Where should I start to debug this problem?