Hi @ptrblck, I have been stuck on the following error for a week now. Could you please help me figure out what is going wrong? Here is the log:
enqueue.cc:115 NCCL WARN Cuda failure 'named symbol not found'
user-0:2395:2560 [0] enqueue.cc:128 NCCL WARN Cuda failure 'named symbol not found'
user-0:2395:2560 [0] NCCL INFO init.cc:1369 -> 1
user-0:2395:2560 [0] NCCL INFO group.cc:65 -> 1 [Async thread]
user-0:2395:2395 [0] NCCL INFO group.cc:406 -> 1
user-0:2395:2395 [0] NCCL INFO group.cc:96 -> 1
2024-03-01 10:04:51,604 [INFO] - s3_checkpoint_io.py:200 - Entering teardown, waiting for all jobs to finish, rank 0
2024-03-01 10:04:51,617 [INFO] - s3_checkpoint_io.py:203 - executor shut down after 0.01 seconds, rank 0
Error executing job with overrides: ['user=user' ]
File "/train.py", line 312, in <module>
main()
File "/usr/local/lib/python3.10/dist-packages/hydra/main.py", line 94, in decorated_main
_run_hydra(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
_run_app(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
run_and_report(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 223, in run_and_report
raise ex
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
return func()
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
lambda: hydra.run(
File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
_ = ret.return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
raise self._return_value
File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
ret.return_value = task_function(task_cfg)
File "/train.py", line 268, in main
trainer.fit(model=model_module, datamodule=data_module)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 532, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py", line 42, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 571, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 939, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1069, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1192, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/ddp.py", line 292, in broadcast
torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2716, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2019, in broadcast
work = default_pg.broadcast([tensor], opts)
Traceback (most recent call last):
  File "/train.py", line 312, in <module>
    main()
  File "/usr/local/lib/python3.10/dist-packages/hydra/main.py", line 94, in decorated_main
    _run_hydra(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 394, in _run_hydra
    _run_app(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 457, in _run_app
    run_and_report(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 223, in run_and_report
    raise ex
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 220, in run_and_report
    return func()
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/utils.py", line 458, in <lambda>
    lambda: hydra.run(
  File "/usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py", line 132, in run
    _ = ret.return_value
  File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 260, in return_value
    raise self._return_value
  File "/usr/local/lib/python3.10/dist-packages/hydra/core/utils.py", line 186, in run_job
    ret.return_value = task_function(task_cfg)
  File "/train.py", line 268, in main
    trainer.fit(model=model_module, datamodule=data_module)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 532, in fit
    call._call_and_handle_interrupt(
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
    return function(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 571, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 939, in _run
    self.__setup_profiler()
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1069, in __setup_profiler
    self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1192, in log_dir
    dirpath = self.strategy.broadcast(dirpath)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/ddp.py", line 292, in broadcast
    torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2716, in broadcast_object_list
    broadcast(object_sizes_tensor, src=src, group=group)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2019, in broadcast
    work = default_pg.broadcast([tensor], opts)
torch.distributed.DistBackendError: NCCL error in: /opt/pytorch/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1359, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.18.5
ncclUnhandledCudaError: Call to CUDA function failed.
Last error:
Cuda failure 'named symbol not found'
Exception raised from getNCCLComm at /opt/pytorch/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1359 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xae (0x7fd5895b14ee in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0xbacd1a (0x7fd532c67d1a in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::broadcast(std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::BroadcastOptions const&) + 0x4a8 (0x7fd532f08398 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0x4e9c7a0 (0x7fd57fbc97a0 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #4: <unknown function> + 0x4eae2e3 (0x7fd57fbdb2e3 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0x4eb7811 (0x7fd57fbe4811 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0x4ec8971 (0x7fd57fbf5971 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xc459a5 (0x7fd5871439a5 in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
frame #8: <unknown function> + 0x3c47cf (0x7fd5868c27cf in /usr/local/lib/python3.10/dist-packages/torch/lib/libtorch_python.so)
<omitting python frames>
Error in training on pod user-0, ip address, instance id i-07b659df0fac77efe. Exception: NCCL error in: /opt/pytorch/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1359, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.18.5
[... the same ncclUnhandledCudaError message and stack trace repeat here verbatim; trimmed for brevity ...]
Traceback (most recent call last):
  File "/train.py", line 329, in <module>
    raise e
  File "/train.py", line 312, in <module>
    main()
  [... remainder identical to the traceback and NCCL error above; trimmed ...]
user-0:2395:2395 [0] NCCL INFO comm 0x55f5f56e42d0 rank 0 nranks 0 cudaDev 0 busId 0 - Abort COMPLETE
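For reference, here is a minimal sketch of the version check I can run inside the training container. As I understand it, `Cuda failure 'named symbol not found'` often indicates a mismatch between the CUDA runtime the container was built against and the host driver, so that seems worth ruling out first:

```python
# Version-check sketch: run inside the same container/pod that logged the error.
import subprocess

import torch

print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)      # CUDA runtime torch was compiled against
print("NCCL:", torch.cuda.nccl.version())          # should match the 2.18.5 in the error
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))

# Host driver version, for comparison with the runtime above
print("driver:", subprocess.run(
    ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"],
    capture_output=True, text=True,
).stdout.strip())
```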
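Also, the stack shows the failure happens on the very first NCCL collective (Lightning broadcasting the log dir before training even starts), so it should be reproducible without Lightning/Hydra. Below is a minimal repro sketch of that same broadcast; `repro.py` is just a placeholder name, and it assumes a `torchrun` launch so the rendezvous env vars are set:

```python
# Bare torch.distributed repro of the failing call; launch with e.g.
#   NCCL_DEBUG=INFO torchrun --nproc_per_node=2 repro.py
import os

import torch
import torch.distributed as dist


def main():
    # torchrun provides RANK / WORLD_SIZE / LOCAL_RANK / MASTER_ADDR / MASTER_PORT
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

    # Same collective that fails in the trace: Strategy.broadcast(log_dir)
    # goes through torch.distributed.broadcast_object_list under the hood.
    obj = ["/tmp/log_dir" if dist.get_rank() == 0 else None]
    dist.broadcast_object_list(obj, src=0)
    print(f"rank {dist.get_rank()} received: {obj[0]}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

If this fails with the same `Cuda failure 'named symbol not found'`, I assume the problem is in the container's CUDA/NCCL setup rather than in my training code.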
Appreciate your help on this!