RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR during backpropagation

I am still relatively new to building NNs, and I am currently trying to build a CNN-GRU model.

My model:

import torch.nn as nn

class DNN(nn.Module):
    def __init__(self):
        super(DNN, self).__init__()
        self.conv_block = nn.Sequential(
            nn.Conv2d(
                in_channels = 1,         #Number of input channels; spectrograms will be treated as grayscale images
                out_channels = 16,       #Number of filters in convolutional layer
                kernel_size = 5,         #Standard value for this process
                stride = 2,
            ),
            nn.BatchNorm2d(16),
            nn.AvgPool2d(
                kernel_size = 2, 
                stride = 2
            ),
            nn.BatchNorm2d(16),
            nn.LeakyReLU(),
            nn.Flatten(start_dim=1)
        )

        self.layernorm_block = nn.Sequential(
            nn.LayerNorm(500),
            nn.LeakyReLU(),
        )
        
        self.gru_1 = nn.GRU(16 * 4 * 124, 500, num_layers = 1)
        self.gru_2 = nn.GRU(500, 500, num_layers = 1)
        self.gru_3 = nn.GRU(500, 500, num_layers = 1)
        
        self.linear = nn.Linear(500,1)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, input_data):          #Telling PyTorch how to pass data from one layer to the next
        x = self.conv_block(input_data)
        x, h0 = self.gru_1(x)
        x = self.layernorm_block(x)
        x, h0 = self.gru_2(x, h0)
        x = self.layernorm_block(x)
        x, h0 = self.gru_3(x, h0)
        x = self.layernorm_block(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

I am getting this error during loss.backward(). From reading other posts it might be a memory problem, but I haven’t really found any answers, so I was wondering if someone understands what is happening and how to solve it.

C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\cuda\Loss.cu:240: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
C:\cb\pytorch_1000000000000\work\aten\src\ATen\native\cuda\Loss.cu:240: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
[... the same assertion failure is repeated for threads [2,0,0] through [19,0,0] ...]
Traceback (most recent call last):
  File "...\Desktop\Code\Special Topics Project\DNN.py", line 212, in <module>
    train(network, train_dataloader, loss_fn, optimizer, device, EPOCHS)            #Training the model
  File "...\Desktop\Code\Special Topics Project\DNN.py", line 164, in train
    train_single_epoch(model, data_loader, loss_fn, optimizer, device)
  File "...\Code\Special Topics Project\DNN.py", line 156, in train_single_epoch
    loss.backward()
  File "...\anaconda3\lib\site-packages\torch\_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "...\anaconda3\lib\site-packages\torch\autograd\__init__.py", line 200, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR

This is my system and environment information.

PyTorch version: 2.0.0
Is debug build: False
CUDA used to build PyTorch: 11.7
ROCM used to build PyTorch: N/A

OS: Microsoft Windows 10 Pro
GCC version: (Rev6, Built by MSYS2 project) 12.2.0
Clang version: Could not collect
CMake version: Could not collect
Libc version: N/A

Python version: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)] (64-bit runtime)
Python platform: Windows-10-10.0.19045-SP0
Is CUDA available: True
CUDA runtime version: 11.7.99
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: GPU 0: NVIDIA GeForce RTX 3060
Nvidia driver version: 535.98
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True

CPU:
Architecture=9
CurrentClockSpeed=3500
DeviceID=CPU0
Family=205
L2CacheSize=8192
L2CacheSpeed=
Manufacturer=GenuineIntel
MaxClockSpeed=3500
Name=13th Gen Intel(R) Core(TM) i5-13600K
ProcessorType=3
Revision=

Versions of relevant libraries:
[pip3] numpy==1.24.3
[pip3] torch==2.0.0
[pip3] torchaudio==2.0.0
[pip3] torchvision==0.15.0
[conda] blas                      1.0                         mkl
[conda] mkl                       2021.4.0           haa95532_640
[conda] mkl-service               2.4.0            py39h2bbff1b_0
[conda] mkl_fft                   1.3.1            py39h277e83a_0
[conda] mkl_random                1.2.2            py39hf11a4ad_0
[conda] numpy                     1.24.3           py39hf95b240_0
[conda] numpy-base                1.24.3           py39h005ec55_0
[conda] pytorch                   2.0.0           py3.9_cuda11.7_cudnn8_0    pytorch
[conda] pytorch-cuda              11.7                 h67b0de4_0    pytorch
[conda] pytorch-mutex             1.0                        cuda    pytorch
[conda] torchaudio                2.0.0                    pypi_0    pypi
[conda] torchvision               0.15.0                   pypi_0    pypi

P.S.: As mentioned, I am new to building NNs and this is my first try at a hybrid model, so any suggestions are welcome.

The cuDNN issue is just a downstream victim here, as your code already fails in the loss calculation.
I guess you are using nn.CrossEntropyLoss or nn.NLLLoss, so make sure the target contains values in the range [0, nb_classes - 1].
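
A quick way to confirm this is to validate the targets right before computing the loss. This is a minimal sketch (not from the original post): prediction, target, and loss_fn stand in for whatever your training loop uses, and nb_classes is the number of output logits of your model:

assert target.min() >= 0 and target.max() < nb_classes, \
    f"target values out of bounds: {target.unique().tolist()}"    #Fails before the opaque device-side assert can trigger
loss = loss_fn(prediction, target)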

Yes, I am using nn.CrossEntropyLoss. I kind of understand, but at the same time I don’t: are you referring to the actual shape of the target tensor from the dataloader? I just realized I didn’t include my training function, but my data has 2 classes. My target tensor has a shape of [20], as my batch size is 20, and each value is a class label.

import numpy as np
from torch.utils.data import DataLoader, SubsetRandomSampler

def create_data_loader(dataset, test_split, batch_size):
    random_seed= 42
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(test_split * dataset_size))
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, test_indices = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_indices)
    test_sampler = SubsetRandomSampler(test_indices)
    train_dataloader = DataLoader(dataset, batch_size = batch_size, sampler = train_sampler)
    test_dataloader = DataLoader(dataset, batch_size = batch_size, sampler = test_sampler)
    return train_dataloader, test_dataloader

def train_single_epoch(model, data_loader, loss_fn, optimizer, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        #Calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        #Backpropagate error and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"loss: {loss.item()}")          #Note: this prints only the loss of the last batch in the epoch

def train(model, data_loader, loss_fn, optimizer, device, epochs):           #Training the model
    for i in range(epochs):
        print(f"Epoch {i + 1}")
        train_single_epoch(model, data_loader, loss_fn, optimizer, device)
        print("--------------------------")
    print("Finished training")

No, I’m asking about the actual values of the target as these are out of bounds as seen here:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

batch_size, nb_classes = 16, 10
device = "cuda"
output = torch.randn(batch_size, nb_classes, device=device, requires_grad=True)
target = torch.randint(0, nb_classes, (batch_size,), device=device)

# works
loss = criterion(output, target)

# fails since target is out of bounds
target[0] = 10
loss = criterion(output, target)
print(loss)
# RuntimeError: CUDA error: device-side assert triggered
# ../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.

# following CUDA operations now fail as well
a = torch.randn(10, device=device)
# RuntimeError: CUDA error: device-side assert triggered

Once the assert is triggered, the CUDA context is corrupt and any following CUDA operation will re-raise an error.
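
Because CUDA kernels run asynchronously, the Python stack trace often points at an unrelated op (here, loss.backward()). Two standard ways to localize the real failure, sketched here reusing the model/input/target/loss_fn names from the training code above:

# Option 1: launch kernels synchronously so the trace points at the failing op.
# Set this in the shell before starting Python:
#   CUDA_LAUNCH_BLOCKING=1 python DNN.py

# Option 2: in a fresh run, execute one batch on the CPU, where the same bug
# raises a readable Python exception instead of a device-side assert
model = model.cpu()
prediction = model(input.cpu())
loss = loss_fn(prediction, target.cpu())    #IndexError: Target 1 is out of bounds.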

Got it to work. Thank you for the help, your example helped a lot.
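
For anyone hitting the same assert: the thread doesn’t spell out the final fix, but the likely culprit in the model above is nn.Linear(500, 1), which emits a single logit, so nn.CrossEntropyLoss sees n_classes = 1 and every target of 1 fails the `t >= 0 && t < n_classes` check. A sketch of the kind of change that resolves this for 2 classes (nn.CrossEntropyLoss applies log-softmax internally, so the explicit nn.Softmax should go as well):

        self.linear = nn.Linear(500, 2)     #One output logit per class; 2 classes here
        #nn.Softmax removed: nn.CrossEntropyLoss expects raw logits

and at the end of forward:

        logits = self.linear(x)
        return logits                       #Return raw logits, no softmax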