CUDNN_STATUS_INTERNAL_ERROR on Windows using GRU

I am building a Siamese GRU autoencoder and am encountering an error when trying to train. The error occurs frequently, with both synthetic and real data. I have searched for information on this issue and found suggestions to use cuDNN 7.6, but there is no equivalent conda package for Windows. Any suggestions would be greatly appreciated, thanks!

The Error:

cuda
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-7-4751c89df448> in <module>
     94 
     95 
---> 96         score, y_pred1,y_pred2 = model(x1,x2) #train
     97         optimizer.zero_grad()
     98 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

<ipython-input-7-4751c89df448> in forward(self, seq1, seq2)
     51         h1 = X2.new_zeros(self.num_layers*2,N,self.embedding_dim)
     52         h1.normal_()
---> 53         y2, _ = self.decoder(encoded2,h1)
     54         y2 =  y2[:seqLen2]
     55 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    530             result = self._slow_forward(*input, **kwargs)
    531         else:
--> 532             result = self.forward(*input, **kwargs)
    533         for hook in self._forward_hooks.values():
    534             hook_result = hook(self, input, result)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\nn\modules\rnn.py in forward(self, input, hx)
    714         if batch_sizes is None:
    715             result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 716                              self.dropout, self.training, self.bidirectional, self.batch_first)
    717         else:
    718             result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,

RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR

The code, with synthetic data:

import torch
import torch.nn as nn
import torch.optim as optim
import torchnet as tnt


class GRUModel(nn.Module):
    def __init__(self, embedding_dim, input_dim, latent_dim, num_layers):
        super(GRUModel, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        
        self.embedding = nn.Embedding(input_dim,embedding_dim)

        self.encoder = nn.GRU(self.embedding_dim, self.latent_dim, self.num_layers, bidirectional = True)

        self.decoder = nn.GRU(self.latent_dim, self.embedding_dim, self.num_layers, bidirectional = True)

    def forward(self, seq1, seq2 ):
        # Encode
        X1 = self.embedding(seq1).float().transpose(0,1)
        X2 = self.embedding(seq2).float().transpose(0,1)
        
        
        seqLen1,N,C = X1.shape
        seqLen2,N,C = X2.shape
        
        h0 = X1.new_zeros(self.num_layers*2,N,self.latent_dim)
        h0.normal_()
        
        _, last_hidden1 = self.encoder(X1,h0)
        encoded1 = last_hidden1[-1].repeat((len(X1),1,1))
        
        h0 = X2.new_zeros(self.num_layers*2,N,self.latent_dim)
        h0.normal_()
        
        _, last_hidden2 = self.encoder(X2,h0)
        encoded2 = last_hidden2[-1].repeat((len(X2),1,1))
        
        score = nn.CosineSimilarity()(last_hidden1[-1],last_hidden2[-1])

        # Decode
                
        h1 = X1.new_zeros(self.num_layers*2,N,self.embedding_dim)
        h1.normal_()
        y1, _ = self.decoder(encoded1,h1)
        y1 =  y1[:seqLen1]
        
        h1 = X2.new_zeros(self.num_layers*2,N,self.embedding_dim)
        h1.normal_()
        y2, _ = self.decoder(encoded2,h1)
        y2 =  y2[:seqLen2]

        
        #we have a reconstruction in both directions. We can try multiple ways to combine the output, we first try average
        y1 = y1.view(seqLen1,N,C,2).mean(-1)
        y2 = y2.view(seqLen2,N,C,2).mean(-1)
        
        l_first = nn.functional.l1_loss(y1.squeeze(),X1.squeeze())
        l_second = nn.functional.l1_loss(y2.squeeze(),X2.squeeze())
        
        return score, l_first, l_second


device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model = GRUModel(embedding_dim=50,input_dim=22, latent_dim=16, num_layers=2)
model.to(device)

loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters())



x1 = torch.randint(low=0,high=22,size=(1,250)).long().to(device)

x2 = torch.randint(low=0,high=22,size=(1,200)).long().to(device)
meter = tnt.meter.AverageValueMeter()
meter_val = tnt.meter.AverageValueMeter()
timemeter = tnt.meter.TimeMeter(0)
for epoch in range(50):
    timemeter.reset()
    model.train()
    #for (x1,x2) in trainloader:
    for idx in range(5):
        
        s1 = torch.randint(100,4000,(1,))
        x1 = torch.randint(low=0,high=22,size=(1,s1)).long().to(device)
        s1 = torch.randint(100,4000,(1,))
        x2 = torch.randint(low=0,high=22,size=(1,s1)).long().to(device)

        x1 = x1.to(device)
        x2 = x2.to(device)


        score, y_pred1,y_pred2 = model(x1,x2) #train
        optimizer.zero_grad()

        loss = (1-score) + y_pred1 + y_pred2
        loss.backward()
        optimizer.step()
        meter.add(loss.item())
        #print(loss.item())
    model.eval()
    #for (x1,x2) in valloader:
    for idx in range(5):
        
        s1 = torch.randint(100,4000,(1,))
        x1 = torch.randint(low=0,high=22,size=(1,s1)).long().to(device)
        s1 = torch.randint(100,4000,(1,))
        x2 = torch.randint(low=0,high=22,size=(1,s1)).long().to(device)


        x1 = x1.to(device)
        x2 = x2.to(device)


        score, y_pred1,y_pred2 = model(x1,x2) #eval

        loss = (1-score) + y_pred1 + y_pred2

        meter_val.add(loss.item())
        #print(loss.item())
    print("EPOCH {}, TRAINING_LOSS {}, VALIDATION_LOSS {}, DURATION {}s".format(epoch,meter.value(),meter_val.value(),timemeter.value()))
    meter.reset()
    meter_val.reset()

My System:

On windows 10, using a conda install of pytorch:

pytorch 1.4.0 py3.7_cuda101_cudnn7_0 pytorch
cudatoolkit 10.1.243 h74a9793_0

nvidia-smi:
Sun Mar 29 11:14:53 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 442.50       Driver Version: 442.50       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Quadro P2000       WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   51C    P8    N/A /  N/A |    486MiB /  4096MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0     35288      C   ...ta\Local\Continuum\anaconda3\python.exe N/A      |
+-----------------------------------------------------------------------------+

Could you please take a look? cc @ngimel

How long does it take on your system to yield this error?
I can successfully run the code for all 50 epochs on Windows 10 using PyTorch 1.5.0.dev20200331, CUDA 10.1, and cudnn 7603.

In some cases it takes about 40 epochs; in other cases it is immediate. I am not sure whether it only happens with longer “sequence” lengths. In one case, where it failed immediately, the sequences were 1626 and 1633 in length, respectively.

I suspect the issue may be solved by a more up-to-date cudnn installation. For the environment you ran the code in, @ptrblck, is there a conda package for that PyTorch/cudnn combination? My laptop runs a Quadro and has some unique administrative roadblocks to installing anything. If there isn’t a conda package, do you have instructions for setting that up in a conda environment, or would it require building from source? Thank you very much in advance for the help.

I just used the conda package for the nightly build:

conda install pytorch cudatoolkit=10.1 -c pytorch-nightly -c defaults -c conda-forge

Could you try to install it and rerun the code?
Let me know if you are still running into this issue and I’ll try to run the code for more epochs.
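
Once it’s installed, something like this should confirm which CUDA and cudnn versions the binary actually picked up:

import torch
import torch.backends.cudnn

print(torch.__version__)               # PyTorch build, e.g. a 1.5.0 nightly
print(torch.version.cuda)              # CUDA version the binary was compiled with
print(torch.backends.cudnn.version())  # cudnn version, e.g. 7603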

Sorry for the delay. I am unfortunately still getting the error, and sometimes I get CUDNN_STATUS_EXECUTION_FAILED.

I created two PyTorch environments: the first used the exact command you posted for the PyTorch nightly build; the second was created by cloning the PyTorch nightly branch and compiling from source against cudatoolkit 10.2 and cuDNN 7.6.5. PyTorch takes a while to build, and the Windows build instructions were non-intuitive in certain areas, hence the delay in my response.

Both environments result in similar error profiles.

Here is a gist for the errors from the pytorch nightly environment: pytorch-nightly-cuDNN-GRU-errors

Does setting the following variables help?

import torch.backends.cudnn
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Still similar errors, tested in the following environments:

pytorch nightly v1.5, cudatoolkit 10.1, installed using conda
pytorch nightly v1.5, cudatoolkit 10.2, installed using conda
pytorch v1.4, cudatoolkit 10.1, installed using conda
pytorch v1.6, cudatoolkit 10.2, cudnn7.6.5, installed from source

The number of successful epochs varied from 1 to 6.

Could you observe the memory usage via nvidia-smi?
We’ve seen CUDNN_STATUS_EXECUTION_FAILED being raised in the past due to an OOM during the cudnn call.

I think that might actually be the problem, but I can’t confirm it quite yet.

I have an NVIDIA Quadro P2000 (which notoriously has low memory). nvidia-smi just gives N/A for the GPU usage, as I don’t have the Windows Display Driver Model (WDDM), and the -dmi flag doesn’t work because my actual display is connected to the GPU. I have been researching ways to measure GPU usage for running code, but haven’t quite found one (gpustat doesn’t work either, as _curses is not supported on Windows for Python 3.7). Any suggestions? Or should I just look at Task Manager…
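
The only other option I can think of is querying NVML directly from Python with the pynvml package, which should report the same device-wide numbers as nvidia-smi. I haven’t verified how well it behaves under Windows/WDDM, so this is just a sketch:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# device-wide memory as seen by the driver, not just the PyTorch caching allocator
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
print('Used: {:.0f} / {:.0f} MiB'.format(info.used / 1024**2, info.total / 1024**2))
pynvml.nvmlShutdown()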

As an aside, I am starting to dislike DL training on Windows…

Using the following code:

print('Allocated:', torch.cuda.memory_allocated(0)/1024.0**3, 'GB')
print('Cached:   ', torch.cuda.memory_cached(0)/1024.0**3, 'GB')

I get the following after the crash:

#before training:
Allocated: 0.00029277801513671875 GB
Cached:    0.001953125 GB

# training, and after failure
Allocated: 0.002089977264404297 GB
Cached:    0.060546875 GB

This is well below the maximum memory of my GPU.

Could you add these memory stat prints into your training loop and see if they come dangerously close to the limit?
The memory stats might not reflect the usage during training if you have already hit the OOM and thus potentially triggered the cudnn error.
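
Something like this small helper, called right after the forward/backward pass of each batch, would be enough (just a sketch; max_memory_allocated additionally tracks the peak usage between batches):

import torch

def print_mem_stats(tag):
    # current, peak, and cached memory of the PyTorch allocator on device 0
    print(tag)
    print('Allocated:    ', torch.cuda.memory_allocated(0) / 1024.0**3, 'GB')
    print('Max allocated:', torch.cuda.max_memory_allocated(0) / 1024.0**3, 'GB')
    print('Cached:       ', torch.cuda.memory_cached(0) / 1024.0**3, 'GB')

# e.g. inside the training and validation loops:
# print_mem_stats('Training Epoch {}, after Batch {}'.format(epoch, idx))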

This is the result:

Quadro P2000
Memory Usage Training Epoch 0, after Batch 0:
Allocated: 0.0012102127075195312 GB
Cached:    0.025390625 GB
Quadro P2000
Memory Usage Training Epoch 0, after Batch 1:
Allocated: 0.001216888427734375 GB
Cached:    0.0390625 GB
Quadro P2000
Memory Usage Training Epoch 0, after Batch 2:
Allocated: 0.0012154579162597656 GB
Cached:    0.0390625 GB
Quadro P2000
Memory Usage Training Epoch 0, after Batch 3:
Allocated: 0.0012187957763671875 GB
Cached:    0.0390625 GB
Quadro P2000
Memory Usage Training Epoch 0, after Batch 4:
Allocated: 0.0012211799621582031 GB
Cached:    0.0546875 GB
Memory Usage Validation Epoch 0, after Batch 0:
Allocated: 0.0059795379638671875 GB
Cached:    0.0546875 GB
Memory Usage Validation Epoch 0, after Batch 1:
Allocated: 0.007592201232910156 GB
Cached:    0.056640625 GB
Memory Usage Validation Epoch 0, after Batch 2:
Allocated: 0.0030107498168945312 GB
Cached:    0.056640625 GB
Memory Usage Validation Epoch 0, after Batch 3:
Allocated: 0.005007266998291016 GB
Cached:    0.056640625 GB
Memory Usage Validation Epoch 0, after Batch 4:
Allocated: 0.003980159759521484 GB
Cached:    0.056640625 GB
EPOCH 0, TRAINING_LOSS (2.2453632831573485, 0.1989474530886867), VALIDATION_LOSS (2.2376667261123657, 0.40168538297584394), DURATION 2.966729164123535s
Quadro P2000
Memory Usage Training Epoch 1, after Batch 0:
Allocated: 0.0012125968933105469 GB
Cached:    0.056640625 GB
Quadro P2000
Memory Usage Training Epoch 1, after Batch 1:
Allocated: 0.001220703125 GB
Cached:    0.056640625 GB
Quadro P2000
Memory Usage Training Epoch 1, after Batch 2:
Allocated: 0.0012502670288085938 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 1, after Batch 3:
Allocated: 0.0012288093566894531 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 1, after Batch 4:
Allocated: 0.0012106895446777344 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 1, after Batch 0:
Allocated: 0.005021572113037109 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 1, after Batch 1:
Allocated: 0.005797863006591797 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 1, after Batch 2:
Allocated: 0.0034432411193847656 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 1, after Batch 3:
Allocated: 0.005297183990478516 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 1, after Batch 4:
Allocated: 0.0065517425537109375 GB
Cached:    0.09375 GB
EPOCH 1, TRAINING_LOSS (2.2856598854064942, 0.3632478665416056), VALIDATION_LOSS (2.1910706043243406, 0.1504089364643012), DURATION 3.4313414096832275s
Quadro P2000
Memory Usage Training Epoch 2, after Batch 0:
Allocated: 0.0012302398681640625 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 2, after Batch 1:
Allocated: 0.0012173652648925781 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 2, after Batch 2:
Allocated: 0.0012311935424804688 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 2, after Batch 3:
Allocated: 0.0012059211730957031 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 2, after Batch 4:
Allocated: 0.0012116432189941406 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 2, after Batch 0:
Allocated: 0.005806446075439453 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 2, after Batch 1:
Allocated: 0.00406646728515625 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 2, after Batch 2:
Allocated: 0.006463527679443359 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 2, after Batch 3:
Allocated: 0.002354145050048828 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 2, after Batch 4:
Allocated: 0.004904747009277344 GB
Cached:    0.09375 GB
EPOCH 2, TRAINING_LOSS (2.33304762840271, 0.16655649927292412), VALIDATION_LOSS (2.1567287921905516, 0.2492085923908138), DURATION 2.8533482551574707s
Quadro P2000
Memory Usage Training Epoch 3, after Batch 0:
Allocated: 0.0012235641479492188 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 3, after Batch 1:
Allocated: 0.0012049674987792969 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 3, after Batch 2:
Allocated: 0.0012340545654296875 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 3, after Batch 3:
Allocated: 0.0012340545654296875 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 3, after Batch 4:
Allocated: 0.0012211799621582031 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 3, after Batch 0:
Allocated: 0.005489826202392578 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 3, after Batch 1:
Allocated: 0.0030603408813476562 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 3, after Batch 2:
Allocated: 0.004867076873779297 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 3, after Batch 3:
Allocated: 0.004198551177978516 GB
Cached:    0.09375 GB
Memory Usage Validation Epoch 3, after Batch 4:
Allocated: 0.0038518905639648438 GB
Cached:    0.09375 GB
EPOCH 3, TRAINING_LOSS (1.9486108303070069, 0.10505638131828297), VALIDATION_LOSS (2.0732335805892945, 0.15639371664337126), DURATION 3.158864974975586s
Quadro P2000
Memory Usage Training Epoch 4, after Batch 0:
Allocated: 0.0012073516845703125 GB
Cached:    0.09375 GB
Quadro P2000
Memory Usage Training Epoch 4, after Batch 1:
Allocated: 0.0012192726135253906 GB
Cached:    0.09375 GB

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-5-c38815eca75e> in <module>
     17 
     18 
---> 19         score, y_pred1,y_pred2 = model(x1,x2) #train
     20         optimizer.zero_grad()
     21 

~\AppData\Local\Continuum\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    556             result = self._slow_forward(*input, **kwargs)
    557         else:
--> 558             result = self.forward(*input, **kwargs)
    559         for hook in self._forward_hooks.values():
    560             hook_result = hook(self, input, result)

<ipython-input-2-e9430b51afae> in forward(self, seq1, seq2)
     25         h0.normal_()
     26 
---> 27         _, last_hidden1 = self.encoder(X1,h0)
     28         encoded1 = last_hidden1[-1].repeat((len(X1),1,1))
     29 

~\AppData\Local\Continuum\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py in __call__(self, *input, **kwargs)
    556             result = self._slow_forward(*input, **kwargs)
    557         else:
--> 558             result = self.forward(*input, **kwargs)
    559         for hook in self._forward_hooks.values():
    560             hook_result = hook(self, input, result)

~\AppData\Local\Continuum\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\rnn.py in forward(self, input, hx)
    725         if batch_sizes is None:
    726             result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,
--> 727                              self.dropout, self.training, self.bidirectional, self.batch_first)
    728         else:
    729             result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias,

RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR


Thanks for the information.
I let the script run for 2400 epochs on a P100, but still couldn’t reproduce this error.
Unfortunately, I don’t have access to a Windows machine with this GPU to rerun the code.

Could you, just for the sake of debugging, disable cudnn via torch.backends.cudnn.enabled = False and rerun the code?
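
I.e. just once at the top of the script, before the model is created:

import torch
import torch.backends.cudnn

# force the GRU to fall back to PyTorch's native implementation instead of cudnn
torch.backends.cudnn.enabled = False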

It is running without dying, though very, very slowly (2-4 min per epoch).

Hi, has this problem been solved? When I tested several environments, this error occurred only in some specific ones:

  1. Windows OS (but I tested only Windows 10)
  2. Using an RNN
  3. An RTX graphics card

I had this problem with various CUDA versions, and I noticed that it appeared in TensorFlow 2.x as well as PyTorch. In my opinion, CUDA on Windows may have a problem with GPU memory management when running RNNs in parallel.