Running the standard FSDP T5 example from pytorch/examples (distributed/FSDP/T5_training.py) on a single four-GPU node fails with the error below. Any ideas what the issue could be?
Torch version: 2.0.1+cu117
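For context, I'm launching it with torchrun, essentially the command from the example's README if I recall it correctly (single node, one process per GPU), with no other changes to the example code:

    torchrun --nnodes 1 --nproc_per_node 4 T5_training.py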
Error:
--> current date and time of run = 2023-10-12-03:54:25_PM
r0 Training Epoch: 0%| | 0/94 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/ubuntu/efs/git/pytorch-examples/distributed/FSDP/T5_training.py", line 319, in <module>
    fsdp_main(args)
  File "/home/ubuntu/efs/git/pytorch-examples/distributed/FSDP/T5_training.py", line 243, in fsdp_main
    train_accuracy = train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1)
  File "/home/ubuntu/efs/git/pytorch-examples/distributed/FSDP/utils/train_utils.py", line 50, in train
    output = model(input_ids=batch["source_ids"],attention_mask=batch["source_mask"],labels=batch["target_ids"] )
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1709, in forward
    encoder_outputs = self.encoder(
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1123, in forward
    layer_outputs = layer_module(
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 748, in forward
    output = self._fsdp_wrapped_module(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 695, in forward
    self_attention_outputs = self.layer[0](
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 602, in forward
    attention_output = self.SelfAttention(
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 561, in forward
    scores += position_bias_masked
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/_tensor.py", line 1295, in __torch_function__
    ret = func(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/_tensor/api.py", line 228, in __torch_dispatch__
    raise RuntimeError(
RuntimeError: aten.add_.Tensor: got mixed distributed and non-distributed tensors.
[ranks 1-3 print the identical traceback, interleaved with the above; trimmed here for readability]
r0 Training Epoch: 0%| | 0/94 [00:01<?, ?it/s]
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 61805) of binary: /home/ubuntu/miniconda3/envs/pytorch/bin/python
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/pytorch/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/ubuntu/miniconda3/envs/pytorch/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
T5_training.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2023-10-12_15:54:31
host : xxxx
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 61806)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2023-10-12_15:54:31
host : xxxx
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 61807)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2023-10-12_15:54:31
host : xxxx
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 61808)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-10-12_15:54:31
host : xxxx
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 61805)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
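In case it helps narrow things down: the line that fails in modeling_t5.py is the in-place add of the relative position bias onto the attention scores (scores += position_bias_masked), and the RuntimeError is raised from DTensor dispatch in torch/distributed/_tensor/api.py. My reading is that one of the two operands ends up as a DTensor while the other is a plain torch.Tensor. Here is a minimal sketch of my own (not code from the example, and using the private torch.distributed._tensor API as it exists in 2.0.1) that reproduces the same message on a single process:

    import os
    import torch
    import torch.distributed as dist
    # Private API in 2.0.1; the import path may differ in other releases.
    from torch.distributed._tensor import DeviceMesh, Replicate, distribute_tensor

    # One-process "distributed" group, just enough to construct a DTensor on CPU.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    mesh = DeviceMesh("cpu", [0])
    dtensor = distribute_tensor(torch.ones(4, 4), mesh, [Replicate()])  # DTensor
    plain = torch.zeros(4, 4)                                           # regular torch.Tensor

    # An in-place add mixing a DTensor with a plain Tensor raises the same error as the T5 run:
    # RuntimeError: aten.add_.Tensor: got mixed distributed and non-distributed tensors.
    dtensor += plain

I don't see where a DTensor would be coming from in the stock example, though, which is why I'm stuck.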