With CUDA_LAUNCH_BLOCKING=1 set, the error message shows up as below.
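For reference, I set the flag before the first CUDA call; a minimal sketch of how I enable it from Python (exporting it in the shell before launching the script works equally well):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before CUDA initializes

import torch  # imported afterwards, so every kernel launch is synchronous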
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([512, 256, 32, 32], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(256, 256, kernel_size=[3, 3], padding=[1, 1], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()
ConvolutionParams
    data_type = CUDNN_DATA_FLOAT
    padding = [1, 1, 0]
    stride = [1, 1, 0]
    dilation = [1, 1, 0]
    groups = 1
    deterministic = true
    allow_tf32 = true
input: TensorDescriptor 0x9e26660
    type = CUDNN_DATA_FLOAT
    nbDims = 4
    dimA = 512, 256, 32, 32,
    strideA = 262144, 1024, 32, 1,
output: TensorDescriptor 0x9e26260
    type = CUDNN_DATA_FLOAT
    nbDims = 4
    dimA = 512, 256, 32, 32,
    strideA = 262144, 1024, 32, 1,
weight: FilterDescriptor 0x9e7c850
    type = CUDNN_DATA_FLOAT
    tensor_format = CUDNN_TENSOR_NCHW
    nbDims = 4
    dimA = 256, 256, 3, 3,
Pointer addresses:
    input: 0x7fd9ba000000
    output: 0x7fd9da000000
    weight: 0x7fdb28120000
Forward algorithm: 6
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 499, in fit
    self.dispatch()
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 546, in dispatch
    self.accelerator.start_training(self)
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 73, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 114, in start_training
    self._results = trainer.run_train()
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 671, in run_train
    self.train_loop.on_train_end()
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 134, in on_train_end
    self.check_checkpoint_callback(should_update=True, is_last=True)
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 164, in check_checkpoint_callback
    cb.on_validation_end(self.trainer, model)
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 212, in on_validation_end
    self.save_checkpoint(trainer, pl_module)
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 253, in save_checkpoint
    monitor_candidates = self._monitor_candidates(trainer)
  File "/home/tangke/.local/lib/python3.6/site-packages/pytorch_lightning/callbacks/model_checkpoint.py", line 515, in _monitor_candidates
    monitor_candidates = deepcopy(trainer.logger_connector.callback_metrics)
  File "/usr/lib/python3.6/copy.py", line 150, in deepcopy
    y = copier(x, memo)
  File "/usr/lib/python3.6/copy.py", line 240, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "/usr/lib/python3.6/copy.py", line 161, in deepcopy
    y = copier(memo)
  File "/home/tangke/.local/lib/python3.6/site-packages/torch-1.9.0-py3.6-linux-x86_64.egg/torch/_tensor.py", line 67, in __deepcopy__
    new_storage = self.storage().__deepcopy__(memo)
  File "/home/tangke/.local/lib/python3.6/site-packages/torch-1.9.0-py3.6-linux-x86_64.egg/torch/storage.py", line 48, in __deepcopy__
    new_storage = self.clone()
  File "/home/tangke/.local/lib/python3.6/site-packages/torch-1.9.0-py3.6-linux-x86_64.egg/torch/storage.py", line 64, in clone
    return type(self)(self.size()).copy_(self)
RuntimeError: CUDA error: an illegal memory access was encountered
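The call that fails is ModelCheckpoint._monitor_candidates deep-copying callback_metrics: Tensor.__deepcopy__ ends in storage().clone(), a device-side copy, so any pending asynchronous illegal access gets reported there rather than at the kernel that caused it. A minimal sketch of that pattern as I understand it (the "val_loss" key is hypothetical), plus the workaround I am trying, logging detached CPU values so the callback never touches CUDA storage:

from copy import deepcopy

import torch

# Mirrors trainer.logger_connector.callback_metrics holding a live CUDA tensor.
metrics = {"val_loss": torch.randn(1, device="cuda")}

# deepcopy -> Tensor.__deepcopy__ -> storage().clone() launches a device copy;
# this is the call site where the traceback above raises.
copied = deepcopy(metrics)

# Workaround sketch inside a LightningModule step: log a plain CPU value.
#     self.log("val_loss", loss.detach().cpu())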