Situation:
My PyTorch setup acted weird after changing my GPU (1070Ti > A6000 > 1070Ti). The message “RuntimeError: CUDA error: unknown error” occurred randomly during the training process. Sometimes the error message can be different, such as “RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED”. Everything used to work before; after I switched to the A6000 the problem occurred, and after switching back to the 1070Ti the problem is still there.
Spec:
OS: WSL2
nvidia driver: 551.86
GPU: 1070Ti, A6000
Methods that have been tried:
changing driver with clean install
changing cudatoolkit 11.1, 10.2
recreating the conda environment
Some Python tracebacks:
Traceback (most recent call last):
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
self.train_loop.run_training_epoch()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 550, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 718, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 485, in optimizer_step
model_ref.optimizer_step(
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1298, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 286, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 144, in __optimizer_step
optimizer.step(closure=closure, *args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/optim/sgd.py", line 87, in step
loss = closure()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 708, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 816, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 842, in backward
result.closure_loss = self.trainer.accelerator_backend.backward(
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 109, in backward
model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1162, in backward
loss.backward(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/tensor.py", line 245, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/autograd/__init__.py", line 145, in backward
Variable._execution_engine.run_backward(
RuntimeError: CUDA error: unknown error
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./src/main.py", line 161, in <module>
main(args)
File "./src/main.py", line 98, in main
trainer.fit(model, datamodule=dm)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
results = self.accelerator_backend.train()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 57, in train
return self.train_or_test()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 589, in train
self.train_loop.on_train_end()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 156, in on_train_end
self.check_checkpoint_callback(should_update=True, is_last=True)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 190, in check_checkpoint_callback
cb.on_validation_end(self.trainer, model)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 204, in on_validation_end
self.save_checkpoint(trainer, pl_module)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 254, in save_checkpoint
self._save_last_checkpoint(trainer, pl_module, monitor_candidates)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 569, in _save_last_checkpoint
self._save_model(last_filepath, trainer, pl_module)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 362, in _save_model
self.save_function(filepath, self.save_weights_only)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/properties.py", line 265, in save_checkpoint
self.checkpoint_connector.save_checkpoint(filepath, weights_only)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 398, in save_checkpoint
atomic_save(checkpoint, filepath)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/utilities/cloud_io.py", line 63, in atomic_save
torch.save(checkpoint, bytesbuffer)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/serialization.py", line 372, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/serialization.py", line 488, in _save
storage = storage.cpu()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/storage.py", line 72, in cpu
return self.type(getattr(torch, self.__class__.__name__))
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/cuda/__init__.py", line 496, in type
return super(_CudaBase, self).type(*args, **kwargs) # type: ignore[misc]
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/_utils.py", line 46, in _type
return dtype(self.size()).copy_(self, non_blocking)
RuntimeError: CUDA error: unknown error
Traceback (most recent call last):
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
self.train_loop.run_training_epoch()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 550, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 718, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 485, in optimizer_step
model_ref.optimizer_step(
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1298, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 286, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 144, in __optimizer_step
optimizer.step(closure=closure, *args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/optim/sgd.py", line 87, in step
loss = closure()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 708, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 816, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 842, in backward
result.closure_loss = self.trainer.accelerator_backend.backward(
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 109, in backward
model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1162, in backward
loss.backward(*args, **kwargs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/tensor.py", line 245, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/autograd/__init__.py", line 145, in backward
Variable._execution_engine.run_backward(
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.
# Standalone repro snippet auto-emitted by PyTorch alongside the cuDNN error
# report: a single Conv2d forward + backward on CUDA that mirrors the failing
# op's configuration (dtype, shapes, backend flags). Kept verbatim — it is a
# quote of diagnostic output, so the statements must not be altered.
import torch
# Backend flags captured from the failing run — reproduced exactly as reported.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
# Input matches the failing convolution's NCHW shape: [1, 18, 107, 140].
data = torch.randn([1, 18, 107, 140], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(18, 18, kernel_size=[3, 3], padding=[1, 1], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
# The backward pass is where the original error surfaced (see tracebacks above).
out.backward(torch.randn_like(out))
# Block until all queued kernels finish so any asynchronous CUDA error is
# raised here rather than at some later, unrelated call site.
torch.cuda.synchronize()
ConvolutionParams
data_type = CUDNN_DATA_FLOAT
padding = [1, 1, 0]
stride = [1, 1, 0]
dilation = [1, 1, 0]
groups = 1
deterministic = false
allow_tf32 = true
input: TensorDescriptor 0x55f6de3ed470
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 1, 18, 107, 140,
strideA = 269640, 14980, 140, 1,
output: TensorDescriptor 0x55f6de74e300
type = CUDNN_DATA_FLOAT
nbDims = 4
dimA = 1, 18, 107, 140,
strideA = 269640, 14980, 140, 1,
weight: FilterDescriptor 0x7f4914128650
type = CUDNN_DATA_FLOAT
tensor_format = CUDNN_TENSOR_NCHW
nbDims = 4
dimA = 18, 18, 3, 3,
Pointer addresses:
input: 0x814469200
output: 0x804cce000
weight: 0x80219ba00
Additional pointer addresses:
grad_output: 0x804cce000
grad_input: 0x814469200
Backward data algorithm: 4
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./src/main.py", line 161, in <module>
main(args)
File "./src/main.py", line 98, in main
trainer.fit(model, datamodule=dm)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
results = self.accelerator_backend.train()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 57, in train
return self.train_or_test()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 589, in train
self.train_loop.on_train_end()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 156, in on_train_end
self.check_checkpoint_callback(should_update=True, is_last=True)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 190, in check_checkpoint_callback
cb.on_validation_end(self.trainer, model)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 204, in on_validation_end
self.save_checkpoint(trainer, pl_module)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 254, in save_checkpoint
self._save_last_checkpoint(trainer, pl_module, monitor_candidates)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 569, in _save_last_checkpoint
self._save_model(last_filepath, trainer, pl_module)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 362, in _save_model
self.save_function(filepath, self.save_weights_only)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/properties.py", line 265, in save_checkpoint
self.checkpoint_connector.save_checkpoint(filepath, weights_only)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 398, in save_checkpoint
atomic_save(checkpoint, filepath)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/pytorch_lightning/utilities/cloud_io.py", line 63, in atomic_save
torch.save(checkpoint, bytesbuffer)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/serialization.py", line 372, in save
_save(obj, opened_zipfile, pickle_module, pickle_protocol)
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/serialization.py", line 488, in _save
storage = storage.cpu()
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/storage.py", line 72, in cpu
return self.type(getattr(torch, self.__class__.__name__))
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/cuda/__init__.py", line 496, in type
return super(_CudaBase, self).type(*args, **kwargs) # type: ignore[misc]
File "/home/mizuyoru/miniconda3/envs/demtnew/lib/python3.8/site-packages/torch/_utils.py", line 46, in _type
return dtype(self.size()).copy_(self, non_blocking)
RuntimeError: CUDA error: unknown error