Getting CUDNN_STATUS_EXECUTION_FAILED while using LSTM

username1 · January 12, 2018, 4:42pm

Hello everyone, I am experiencing some problems with recurrent neural networks.
They seem to be CUDA-related, since the error disappears when CUDA is not used.

Here is a toy example of my code:

import torch 
import torch.nn as nn
from torch.autograd import Variable

import argparse

# Settings for the toy run, exposed as attributes (opt.ntokens, opt.cuda, ...).
opt = argparse.Namespace(
    ntokens=81,
    batch_size=8,
    imgW=1758,
    imgH=119,
    workers=2,
    cuda=True,
)

class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a linear projection to ``nOut`` features.

    ``forward`` unpacks the LSTM output as ``(T, b, h)`` and returns a tensor
    of shape ``(T, b, nOut)``.
    """

    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()

        # bidirectional=True, hence the nHidden * 2 input features of the linear layer below.
        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        self.embedding = nn.Linear(nHidden * 2, nOut)
        
    def forward(self, input):
        # Only the output sequence is used; the second return value (hidden state) is discarded.
        recurrent, _ = self.rnn(input)
        # presumably (time steps, batch, features) -- nn.LSTM is built without batch_first
        T, b, h = recurrent.size()
        # Merge the first two dims so the linear layer is applied to a 2-D tensor.
        t_rec = recurrent.view(T * b, h)

        output = self.embedding(t_rec)  # [T * b, nOut]
        # Split the merged leading dim back into (T, b).
        output = output.view(T, b, -1)

        return output



class ModelPretrain(nn.Module):
    """Toy model: an embedding layer followed by two stacked BidirectionalLSTM blocks.

    ``opt.ntokens`` (module-level ``opt``) is used both as the embedding
    vocabulary size and as the output size of the last block.
    """
    def __init__(self):
        super(ModelPretrain, self).__init__()
        nh = 128
        self.input_size = 128
        self.encoder = nn.Embedding(opt.ntokens, self.input_size)
        self.decoder = nn.Sequential(
            BidirectionalLSTM(128, nh, nh),
            BidirectionalLSTM(nh, nh, opt.ntokens))
    def forward(self, x):
        # The numbered prints trace the tensor size after each stage.
        print('1',x.size())
        x = self.encoder(x)
        print('2',x.size())
        # Swap the first two dims; in the posted run [8, 1758, 128] becomes [1758, 8, 128].
        # NOTE(review): permute() returns a view, so x is no longer contiguous here.
        # A reply in this thread suggests x.permute(1, 0, 2).contiguous() before the LSTM.
        x = x.permute(1, 0, 2)
        print('3',x.size())
        x = self.decoder(x)
        print('4',x.size())
        return x

# Dummy input: a flat LongTensor of batch_size * imgW elements, all set to 0.
gpu_text = torch.LongTensor(opt.batch_size * opt.imgW).fill_(0)
gpu_text = Variable(gpu_text)


model = ModelPretrain()

# Move both the input and the model to the GPU when opt.cuda is set.
if opt.cuda:
   gpu_text = gpu_text.cuda()
   model.cuda()

# Reshape the flat input to (batch_size, -1) and run a single forward pass.
res = model(gpu_text.view(opt.batch_size, -1))
print(res)

Here is the error message:

1 torch.Size([8, 1758])
2 torch.Size([8, 1758, 128])
3 torch.Size([1758, 8, 128])
---------------------------------------------------------------------------
CuDNNError                                Traceback (most recent call last)
~/x/d/crnn.pytorch2/toy.py in <module>()
     63 model.cuda()
     64 
---> 65 res = model(gpu_text.view(opt.batch_size, -1))
     66 print(res)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/d/crnn.pytorch2/toy.py in forward(self, x)
     51         x = x.permute(1, 0, 2)
     52         print('3',x.size())
---> 53         x = self.decoder(x)
     54         print('4',x.size())
     55         return x

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
     65     def forward(self, input):
     66         for module in self._modules.values():
---> 67             input = module(input)
     68         return input
     69 

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/d/crnn.pytorch2/toy.py in forward(self, input)
     24         
     25     def forward(self, input):
---> 26         recurrent, _ = self.rnn(input)
     27         T, b, h = recurrent.size()
     28         t_rec = recurrent.view(T * b, h)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    167             flat_weight=flat_weight
    168         )
--> 169         output, hidden = func(input, self.all_weights, hx)
    170         if is_packed:
    171             output = PackedSequence(output, batch_sizes)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward(input, *fargs, **fkwargs)
    383             return hack_onnx_rnn((input,) + fargs, output, args, kwargs)
    384         else:
--> 385             return func(input, *fargs, **fkwargs)
    386 
    387     return forward

~/x/miniconda3/lib/python3.6/site-packages/torch/autograd/function.py in _do_forward(self, *input)
    326         self._nested_input = input
    327         flat_input = tuple(_iter_variables(input))
--> 328         flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
    329         nested_output = self._nested_output
    330         nested_variables = _unflatten(flat_output, self._nested_output)

~/x/miniconda3/lib/python3.6/site-packages/torch/autograd/function.py in forward(self, *args)
    348     def forward(self, *args):
    349         nested_tensors = _map_variable_tensor(self._nested_input)
--> 350         result = self.forward_extended(*nested_tensors)
    351         del self._nested_input
    352         self._nested_output = result

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward_extended(self, input, weight, hx)
    292             hy = tuple(h.new() for h in hx)
    293 
--> 294         cudnn.rnn.forward(self, input, hx, weight, output, hy)
    295 
    296         self.save_for_backward(input, hx, weight, output)

~/x/miniconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in forward(fn, input, hx, weight, output, hy)
    303                 fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
    304                 ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
--> 305                 ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
    306             ))
    307         else:  # inference

~/x/miniconda3/lib/python3.6/site-packages/torch/backends/cudnn/__init__.py in check_error(status)
    281 def check_error(status):
    282     if status is not 0:
--> 283         raise CuDNNError(status)
    284 
    285 

CuDNNError: 8: b'CUDNN_STATUS_EXECUTION_FAILED'

Here are the cuda version and the GPU card

000 0 $ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Sep__1_21:08:03_CDT_2017
Cuda compilation tools, release 9.0, V9.0.176
000 0 $ nvidia-smi
Fri Jan 12 19:38:11 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 1070    Off  | 00000000:01:00.0  On |                  N/A |
| 33%   33C    P8     8W / 190W |     42MiB /  8110MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1003      G   /usr/lib/xorg/Xorg                            40MiB |
+-----------------------------------------------------------------------------+

The torch version:

In [10]: torch.__version__
Out[10]: '0.3.0.post4'

SimonW · January 12, 2018, 8:25pm

Interesting, let’s first see if the code runs without cudnn. Could you try torch.backends.cudnn.enabled=False? This runs the RNN code that isn’t from cudnn. It will be slower compared to cudnn, but is a good thing to try and see if there is anything wrong with our inputs.

ngimel · January 14, 2018, 4:34am

Try adding .contiguous() call after .permute():

x=x.permute(1,0,2).contiguous()

username1 · January 14, 2018, 10:35am

Very weird: the error in the toy program disappeared, but now this happens when I run the main NN code
(with or without cudnn):

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/THC/generic/THCTensorCopy.c line=20 error=6 : the launch timed out and was terminated
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~/x/d/crnn.pytorch2/main.py in <module>()
    341 try:
    342     engine.train(opt.display_interval, opt.val_interval,
--> 343                  opt.save_interval, opt.epochs)
    344 except KeyboardInterrupt:
    345     engine.save()

~/x/d/crnn.pytorch2/engine.py in train(self, display_interval, val_interval, save_interval, epochs)
    123                 self.model.train()
    124 
--> 125                 cost, preds, skip = self.exec_batch(self.model, batch)
    126                 if skip: continue
    127 

~/x/d/crnn.pytorch2/main.py in exec_batch(model, batch)
    211 
    212 def exec_batch(model, batch):
--> 213     utils.loadData(image, batch.images)
    214     t, l = converter.encode(batch.labels_text)
    215     utils.loadData(text, t)

~/x/d/crnn.pytorch2/utils.py in loadData(v, data)
    148 
    149 def loadData(v, data):
--> 150     v.data.resize_(data.size()).copy_(data)
    151 
    152 

RuntimeError: cuda runtime error (6) : the launch timed out and was terminated at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/THC/generic/THCTensorCopy.c:20

Sometimes it looks like this

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/THC/generic/THCStorage.cu line=58 error=6 : the launch timed out and was terminated
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
~/x/d/crnn.pytorch2/main.py in <module>()
    116 
    117 #gpu_text = gpu_text.cuda()
--> 118 image = image.cuda()
    119 #pretrain = pretrain.cuda()
    120 ctc_criterion = ctc_criterion.cuda()

~/x/miniconda3/lib/python3.6/site-packages/torch/autograd/variable.py in cuda(self, device, async)
    296 
    297     def cuda(self, device=None, async=False):
--> 298         return CudaTransfer.apply(self, device, async)
    299 
    300     def cpu(self):

~/x/miniconda3/lib/python3.6/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, i, device, async)
    199             return i.cuda(device, async=async)
    200         else:
--> 201             return i.cuda(async=async)
    202 
    203     @staticmethod

~/x/miniconda3/lib/python3.6/site-packages/torch/_utils.py in _cuda(self, device, async)
     67         else:
     68             new_type = getattr(torch.cuda, self.__class__.__name__)
---> 69             return new_type(self.size()).copy_(self, async)
     70 
     71 

~/x/miniconda3/lib/python3.6/site-packages/torch/cuda/__init__.py in _lazy_new(cls, *args, **kwargs)
    359     # We need this method only for lazy init, so we can remove it
    360     del _CudaBase.__new__
--> 361     return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
    362 
    363 

RuntimeError: cuda runtime error (6) : the launch timed out and was terminated at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/THC/generic/THCStorage.cu:58

UPD:
Okay, now, with cudnn enabled, it sometimes runs one or two iterations and crashes with this error:

        [    0][  10/1502] | Loss: 621.920557 |                      : system of recording temperatures .
---------------------------------------------------------------------------
CuDNNError                                Traceback (most recent call last)
~/x/d/crnn.pytorch2/main.py in <module>()
    341 try:
    342     engine.train(opt.display_interval, opt.val_interval,
--> 343                  opt.save_interval, opt.epochs)
    344 except KeyboardInterrupt:
    345     engine.save()

~/x/d/crnn.pytorch2/engine.py in train(self, display_interval, val_interval, save_interval, epochs)
    123                 self.model.train()
    124 
--> 125                 cost, preds, skip = self.exec_batch(self.model, batch)
    126                 if skip: continue
    127 

~/x/d/crnn.pytorch2/main.py in exec_batch(model, batch)
    215     utils.loadData(text, t)
    216     utils.loadData(length, l)
--> 217     preds, cost, skip = criterion(batch)
    218     return cost, preds, skip
    219 

~/x/d/crnn.pytorch2/main.py in plain_crit(batch)
    224 
    225 def plain_crit(batch):
--> 226     return _ctc_crit(model(image))
    227 
    228 def dpret_crit(batch):

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/d/crnn.pytorch2/models/gcrnn.py in forward(self, x)
    134         x = x.permute(2, 0, 1).contiguous()  # [w, b, c]
    135 
--> 136         output = self.decoder(x)
    137 
    138         return output

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
     65     def forward(self, input):
     66         for module in self._modules.values():
---> 67             input = module(input)
     68         return input
     69 

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/d/crnn.pytorch2/models/gcrnn.py in forward(self, input)
     68 
     69     def forward(self, input):
---> 70         recurrent, _ = self.rnn(input)
     71         T, b, h = recurrent.size()
     72         t_rec = recurrent.view(T * b, h)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    323         for hook in self._forward_pre_hooks.values():
    324             hook(self, input)
--> 325         result = self.forward(*input, **kwargs)
    326         for hook in self._forward_hooks.values():
    327             hook_result = hook(self, input, result)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    167             flat_weight=flat_weight
    168         )
--> 169         output, hidden = func(input, self.all_weights, hx)
    170         if is_packed:
    171             output = PackedSequence(output, batch_sizes)

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward(input, *fargs, **fkwargs)
    383             return hack_onnx_rnn((input,) + fargs, output, args, kwargs)
    384         else:
--> 385             return func(input, *fargs, **fkwargs)
    386 
    387     return forward

~/x/miniconda3/lib/python3.6/site-packages/torch/autograd/function.py in _do_forward(self, *input)
    326         self._nested_input = input
    327         flat_input = tuple(_iter_variables(input))
--> 328         flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
    329         nested_output = self._nested_output
    330         nested_variables = _unflatten(flat_output, self._nested_output)

~/x/miniconda3/lib/python3.6/site-packages/torch/autograd/function.py in forward(self, *args)
    348     def forward(self, *args):
    349         nested_tensors = _map_variable_tensor(self._nested_input)
--> 350         result = self.forward_extended(*nested_tensors)
    351         del self._nested_input
    352         self._nested_output = result

~/x/miniconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward_extended(self, input, weight, hx)
    292             hy = tuple(h.new() for h in hx)
    293 
--> 294         cudnn.rnn.forward(self, input, hx, weight, output, hy)
    295 
    296         self.save_for_backward(input, hx, weight, output)

~/x/miniconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in forward(fn, input, hx, weight, output, hy)
    303                 fn.cy_desc, ctypes.c_void_p(cy.data_ptr()) if cx is not None else None,
    304                 ctypes.c_void_p(workspace.data_ptr()), workspace.size(0),
--> 305                 ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
    306             ))
    307         else:  # inference

~/x/miniconda3/lib/python3.6/site-packages/torch/backends/cudnn/__init__.py in check_error(status)
    281 def check_error(status):
    282     if status is not 0:
--> 283         raise CuDNNError(status)
    284 
    285 

CuDNNError: 8: b'CUDNN_STATUS_EXECUTION_FAILED'

UPD:
The code that was crashing was the code responsible for pre-training the network. If I use the pretrained weights, all of the above-mentioned errors happen; if I do not, training proceeds, but it is quite slow. I also found out that the computer beeps while training the network (not with the internal beeper, but quite softly), so most probably it is a hardware problem. What is the best way to diagnose it? The beeping sound seems to come from the video card or the motherboard.

username1 · January 14, 2018, 3:02pm

Okay, this might not be the appropriate place to write this, but here’s what happened.

I called a friend, and he advised me to run some GPU benchmarks. He also advised me to check whether the video card had slipped out of the PCI slot; he said that this had sometimes happened to him. So I disassembled the computer and checked: all was OK. I reassembled it and started the training again: all was OK – the program did not crash, but the sound continued. After some time I noticed that the GPU fan was off (the card is water cooled): I had forgotten to wire the cooler back. I immediately turned the machine off, plugged in the GPU fan, and switched it on again. I did not check the temperature, but it felt really hot in there.

Now the beeping sound is gone and everything is working, knock on wood three times.

Thanks for your time, @SimonW and @ngimel; I hope it will not happen again.

kenares · February 3, 2019, 12:40pm

Hi, I’m having the exact same issue. I disabled cudnn and all went well, but the process is slow.
What should I do? Any hint on where I should start?
Thanks