RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR

My input shape, without the batch dimension, is 87 x 61 x 73 x 61.

Here, 87 is the number of time points and [61, 73, 61] is the spatial size of each 3D volume.
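
For reference, here is a minimal sketch (my own check, assuming the Conv3d/MaxPool3d settings from the code below) of why the LSTM's input_size works out to 8 * 30 * 36 * 30 per time point:

import torch
import torch.nn as nn

# One time point reshaped to (N, channels, D, H, W), as done in forward()
frame = torch.randn(1, 1, 61, 73, 61)

cnn = nn.Sequential(
    nn.Conv3d(1, 4, kernel_size=5, padding=2, bias=False),  # kernel 5, padding 2 -> size preserved
    nn.Conv3d(4, 8, kernel_size=5, padding=2, bias=False),  # size preserved
    nn.MaxPool3d(kernel_size=2, stride=2),                   # (61, 73, 61) -> (30, 36, 30)
)

out = cnn(frame)
print(out.shape)          # torch.Size([1, 8, 30, 36, 30])
print(8 * 30 * 36 * 30)   # 86400, the LSTM input_size per time step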

I am trying to run the code below on 4 GPUs (one sub-module per GPU).

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as D


class cnn_lstm(torch.nn.Module):

    def __init__(self):
        super(cnn_lstm, self).__init__()
        self.cnn1 = nn.Sequential(
            nn.Conv3d( 1, 4, kernel_size=(5,5,5), padding=(2, 2, 2), bias=False),
            nn.ReLU(inplace=True),
        ).cuda(0)
        
        self.cnn2 = nn.Sequential(
            nn.Conv3d( 4, 8, kernel_size=(5,5,5) , padding=(2, 2, 2), bias=False),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=2),
            nn.ReLU(inplace=True),
        ).cuda(1)
        self.rnn = nn.LSTM(input_size=8 * 30 * 36 * 30, hidden_size=500, num_layers=3, batch_first=True).cuda(2)
        self.classifier = nn.Sequential(
            #nn.Dropout(),
            nn.Linear(500, 100),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(100, classes),
        ).cuda(3)

    def forward(self, x):
        # x: (batch, timesteps, D, H, W) = (batch, 87, 61, 73, 61)
        batch_size, timesteps, C, H, W = x.size()
        # Fold time into the batch dimension and add a channel dimension of 1
        c_in = x.contiguous().view(batch_size * timesteps, C, H, W)
        c_in = c_in.view(-1, 1, C, H, W).float()

        c_out = self.cnn1(c_in.cuda(0))
        c_out = self.cnn2(c_out.cuda(1))
        c_out = c_out.view(-1, 8 * 30 * 36 * 30)

        # Restore (batch, timesteps, features) for the LSTM
        r_in = c_out.view(batch_size, timesteps, -1)
        r_out, (h_n, h_c) = self.rnn(r_in.cuda(2))

        # Classify using the last time step's output
        r_out2 = self.classifier(r_out[:, -1, :].cuda(3))
        return F.log_softmax(r_out2, dim=1)


trainloader = D.DataLoader(ds_train, batch_size=2, shuffle=True, num_workers=0)
validloader = D.DataLoader(ds_valid, batch_size=2, shuffle=True, num_workers=0)
testloader = D.DataLoader(ds_test, batch_size=2, shuffle=False, num_workers=0)

model = cnn_lstm()
torch.cuda.empty_cache()
optimizer = optim.SGD(model.parameters(), lr=0.001)

train_losses = []
valid_losses = []
avg_train_losses = []
avg_valid_losses = [] 
	 
for epoch in range(n_epochs):
    running_loss = 0
    model.train()
    for data, label in trainloader:
        data = data.float()
        y_hat = model(data)
        data = data.cpu()
        loss = criterion(y_hat.cuda(3), label.cuda(3)).cuda(3)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

I get this error:

    loss.backward()
  File "/home/.../lib/python3.6/site-packages/torch/tensor.py", line 118, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/.../lib/python3.6/site-packages/torch/autograd/__init__.py", line 93, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR

Torch Version: 1.2.0
CUDA Version: 10.1.168

GPU Information:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla P100-PCIE...  On   | 00000000:03:00.0 Off |                    0 |
| N/A   27C    P0    25W / 250W |      0MiB / 12198MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:04:00.0 Off |                    0 |
| N/A   30C    P0    23W / 250W |      0MiB / 12198MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla P100-PCIE...  On   | 00000000:82:00.0 Off |                    0 |
| N/A   30C    P0    25W / 250W |      0MiB / 12198MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla P100-PCIE...  On   | 00000000:83:00.0 Off |                    0 |
| N/A   27C    P0    25W / 250W |      0MiB / 12198MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

Please help: how can I fix the error CUDNN_STATUS_MAPPING_ERROR?

I see the same error log. Strangely, when I disabled cuDNN the error disappeared and training ran successfully, so I think the bug may lie in cuDNN.
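
In case it helps, disabling cuDNN globally looks like this (a minimal sketch; torch.backends.cudnn.enabled is the standard switch and should be set before building the model):

import torch

# Fall back to PyTorch's native (non-cuDNN) kernels for conv and LSTM ops.
# Usually slower, but it side-steps cuDNN-specific failures like this one.
torch.backends.cudnn.enabled = False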

I tried to reproduce this error using this code:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


class cnn_lstm(torch.nn.Module):

    def __init__(self):
        super(cnn_lstm, self).__init__()
        self.cnn1 = nn.Sequential(
            nn.Conv3d( 1, 4, kernel_size=(5,5,5), padding=(2, 2, 2), bias=False),
            nn.ReLU(inplace=True),
        ).cuda(0)
        
        self.cnn2 = nn.Sequential(
            nn.Conv3d( 4, 8, kernel_size=(5,5,5) , padding=(2, 2, 2), bias=False),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=2),
            nn.ReLU(inplace=True),
        ).cuda(1)
        self.rnn = nn.LSTM(input_size=8 * 30 * 36 * 30, hidden_size=500, num_layers=3, batch_first=True).cuda(2)
        self.classifier = nn.Sequential(
            #nn.Dropout(),
            nn.Linear(500, 100),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(100, classes),
        ).cuda(3)

    def forward(self, x):
        batch_size, timesteps, C, H, W = x.size()
        c_in = x.contiguous().view(batch_size * timesteps, C, H, W)
        c_in = c_in.view(-1, 1, C, H, W).float()
       
        c_out = self.cnn1(c_in.cuda(0))
        c_out = self.cnn2(c_out.cuda(1))
        c_out = c_out.view(-1, 8 * 30 * 36 * 30)
      
        r_in = c_out.view(batch_size, timesteps, -1)
        r_out, (h_n, h_c) = self.rnn(r_in.cuda(2))

        r_out2 = self.classifier(r_out[:, -1, :].cuda(3))
        return F.log_softmax(r_out2, dim=1)


classes = 10
N = 1
ds_train = TensorDataset(
    torch.randn(N, 87, 61, 73, 61),
    torch.randint(0, classes, (N,))
)

trainloader = DataLoader(ds_train, batch_size=2, shuffle=True, num_workers=0)


model = cnn_lstm()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

train_losses = []
avg_train_losses = []

n_epochs = 1
for epoch in range(n_epochs):
    running_loss = 0
    model.train()
    for data, label in trainloader:
        data = data.float()
        y_hat = model(data)
        data = data.cpu()
        loss = criterion(y_hat.cuda(3), label.cuda(3)).cuda(3)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

but it works fine with PyTorch built from master and cuDNN 7.6.5.32.
Could you update PyTorch to the latest stable release and let me know how to reproduce this issue?
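
For reference, a quick way to report the versions and GPU count in play (standard PyTorch attributes, nothing specific to this thread):

import torch

print(torch.__version__)                # PyTorch version
print(torch.version.cuda)               # CUDA version PyTorch was built against
print(torch.backends.cudnn.version())   # cuDNN version, e.g. 7605 for 7.6.5
print(torch.cuda.device_count())        # number of visible GPUs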

For anyone still hitting this error: I ran into the same problem today. The cause was that I was using multiple GPUs; I "fixed" it by limiting the number of GPUs to 1. So this probably has something to do with data being spread across different GPUs.
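
One way to apply that workaround (a sketch of my setup, not the only option) is to expose a single physical GPU before CUDA is initialized and keep the whole model on it:

import os

# Make only one physical GPU visible; must be set before torch initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

print(torch.cuda.device_count())  # 1

# With a single visible device, keep the whole model and every batch on it,
# e.g. change each .cuda(0)/.cuda(1)/.cuda(2)/.cuda(3) call in cnn_lstm to .cuda(0)
# so no tensors are spread across GPUs.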