I got this error when training reaches 95% of Epoch 0, at the point where the validation set is loaded to evaluate performance:
Epoch 0: 95%|████████████████ 6<00:11, 4.30it/s, loss=nan]
ERROR: Unexpected segmentation fault encountered in worker.
ERROR: Unexpected segmentation fault encountered in worker.
ERROR: Unexpected segmentation fault encountered in worker.
ERROR: Unexpected segmentation fault encountered in worker.
ERROR: Unexpected segmentation fault encountered in worker.
ERROR: Unexpected segmentation fault encountered in worker.
Traceback (most recent call last):
File "/home/se/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1120, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/lib/python3.10/queue.py", line 180, in get
self.not_empty.wait(remaining)
File "/home/lib/python3.10/threading.py", line 324, in wait
gotit = waiter.acquire(True, timeout)
File "/home/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 1202763) is killed by signal: Segmentation fault.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1103, in _run
results = self._run_stage()
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1182, in _run_stage
self._run_train()
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1205, in _run_train
self.fit_loop.run()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.on_advance_end()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 250, in on_advance_end
self._run_validation()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 308, in _run_validation
self.val_loop.run()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 121, in advance
batch = next(data_fetcher)
File "/home/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 184, in __next__
return self.fetching_function()
File "/home/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 265, in fetching_function
self._fetch_next_batch(self.dataloader_iter)
File "/home/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 280, in _fetch_next_batch
batch = next(iterator)
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1316, in _next_data
idx, data = self._get_data()
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1272, in _get_data
success, data = self._try_get_data()
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1133, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 1202763) exited unexpectedly
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/bin/casanovo", line 8, in <module>
sys.exit(main())
File "/home/lib/python3.10/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/home/lib/python3.10/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/home/lib/python3.10/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/lib/python3.10/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/home/lib/python3.10/site-packages/casanovo/casanovo.py", line 256, in main
model_runner.train(peak_path, peak_feature, peak_path_val, peak_feature_val, model, config)
File "/home/lib/python3.10/site-packages/casanovo/denovo/model_runner.py", line 320, in train
trainer.fit(
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 608, in fit
call._call_and_handle_interrupt(
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 59, in _call_and_handle_interrupt
trainer.strategy.reconciliate_processes(traceback.format_exc())
File "/home/lib/python3.10/site-packages/pytorch_lightning/strategies/ddp.py", line 460, in reconciliate_processes
raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 1
Traceback (most recent call last):
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1120, in _try_get_data
data = self._data_queue.get(timeout=timeout)
File "/home/lib/python3.10/queue.py", line 180, in get
self.not_empty.wait(remaining)
File "/home/lib/python3.10/threading.py", line 324, in wait
gotit = waiter.acquire(True, timeout)
File "/home/lib/python3.10/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 1202763) is killed by signal: Segmentation fault.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 650, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1103, in _run
results = self._run_stage()
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1182, in _run_stage
self._run_train()
File "/home/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1205, in _run_train
self.fit_loop.run()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 200, in run
self.on_advance_end()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 250, in on_advance_end
self._run_validation()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 308, in _run_validation
self.val_loop.run()
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 152, in advance
dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/home/lib/python3.10/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 121, in advance
batch = next(data_fetcher)
File "/home/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 184, in __next__
return self.fetching_function()
File "/home/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 265, in fetching_function
self._fetch_next_batch(self.dataloader_iter)
File "/home/lib/python3.10/site-packages/pytorch_lightning/utilities/fetching.py", line 280, in _fetch_next_batch
batch = next(iterator)
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self._next_data()
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1316, in _next_data
idx, data = self._get_data()
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1272, in _get_data
success, data = self._try_get_data()
File "/home/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1133, in _try_get_data
raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
RuntimeError: DataLoader worker (pid(s) 1202763) exited unexpectedly
Killed
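To narrow this down, I tried isolating the worker crash with a minimal sketch like the one below. This is plain PyTorch, not Casanovo's actual dataset classes, and ProbeDataset is just a hypothetical stand-in: with num_workers=0 the data loading runs in the main process, so a crash in __getitem__ surfaces as a normal Python traceback instead of the opaque "DataLoader worker ... killed by signal" message.

```python
import torch
from torch.utils.data import DataLoader, Dataset


class ProbeDataset(Dataset):
    """Hypothetical stand-in; swap in the real validation dataset."""

    def __init__(self, n: int = 1000):
        self.n = n

    def __len__(self) -> int:
        return self.n

    def __getitem__(self, idx: int):
        # Replace with the actual spectrum-loading code under suspicion.
        return torch.zeros(8)


# num_workers=0 keeps loading in the main process: a segfaulting
# __getitem__ now crashes right here with a usable traceback.
loader = DataLoader(ProbeDataset(), batch_size=32, num_workers=0)

for i, batch in enumerate(loader):
    if i % 10 == 0:
        print(f"batch {i} ok, shape={tuple(batch.shape)}")
```

If the loop completes cleanly with num_workers=0 but segfaults once workers are enabled, that usually points at a native library in the loading path that is not safe inside forked worker subprocesses.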
Surprisingly, after this error occurs all GPUs become invisible: torch.cuda.is_available() returns False, and the GPUs can no longer be found at all.
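For completeness, this is the quick check I run after the crash (my own diagnostic snippet, not part of Casanovo) to see whether only PyTorch or the driver itself has lost the devices:

```python
import subprocess

import torch

print("torch.cuda.is_available():", torch.cuda.is_available())
print("torch.cuda.device_count():", torch.cuda.device_count())

# Ask the driver directly; if nvidia-smi fails too, the problem is
# below PyTorch (driver/device state), not in the Python process.
try:
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=name,memory.used", "--format=csv"],
        capture_output=True,
        text=True,
        timeout=30,
    )
    print(out.stdout or out.stderr)
except FileNotFoundError:
    print("nvidia-smi not found on PATH")
```

In my case even nvidia-smi stops responding, which suggests the device state is wedged at the driver level and needs a driver reload or a reboot of the node before the GPUs reappear.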