Error message received 'RuntimeError: CUDA error: device-side assert triggered'

Prasad · August 8, 2020, 11:01am

I am trying to implement pretrained densenet121 on a image classification task with 789 images and 3 labels and received an error. Below is my code.

# Import libraries
import pandas as pd # For importing dataset
import numpy as np # For matrix operation
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import cv2
import matplotlib.pyplot as plt
import torchvision
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms, models
import copy
import tqdm
from PIL import Image

%matplotlib inline

mix_dataset_dir = '/kaggle/input/complete-mix-dataset/Mix_images'
mix_dataset_files = os.listdir(mix_dataset_dir)

mix_dataset_files

Below are few images names I have shared for reference

['Teenagers (95).jpg',
 'Adults (254).jpg',
 'Teenagers (108).jpg',
 'Teenagers (175).jpg',
 'Teenagers (126).jpg',

Code continued

class MixDataset(Dataset):
    def __init__(self, file_list, dir, mode='train', transform = None):
        self.file_list = file_list
        self.dir = dir
        self.mode= mode
        self.transform = transform
            
    def __len__(self):
        return len(self.file_list)
    
    def __getitem__(self, idx):
        img = Image.open(os.path.join(self.dir, self.file_list[idx])).convert('RGB')
        if self.mode == 'train':
            if 'Adults' in self.file_list[idx]:
                self.label = 1
            elif 'Teenagers' in self.file_list[idx]:
                self.label = 2
            else:
                self.label = 3
        if self.transform:
            img = self.transform(img)
        if self.mode == 'train':
            img = img.numpy()
            return img.astype('float32'), self.label
        else:
            img = img.numpy()
            return img.astype('float32'), self.file_list[idx]

data_transform = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.ColorJitter(),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.Resize((128,128)),
    transforms.ToTensor()
])

mix = MixDataset(mix_dataset_files, mix_dataset_dir , transform = data_transform)

densenet_model = models.densenet121(pretrained = True)
densenet_model

for param in densenet_model.parameters():
    param.requires_grad = True
    
from collections import OrderedDict
classifier = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(1024, 512)),
    ('relu1', nn.ReLU()),
    ('fc2', nn.Linear(512, 256)),
    ('relu2', nn.ReLU()),
    ('fc3', nn.Linear(256, 3)),
    ('output', nn.LogSoftmax(dim = 1))
]))

densenet_model.classifier = classifier

criterion = nn.NLLLoss()

optimizer = optim.Adam(densenet_model.classifier.parameters(), lr = 0.003)

scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 200, 300], gamma=0.5)

train_loader = DataLoader(mix, batch_size=32, shuffle = True)
densenet_model.to('cuda')
epochs = 3
itr = 1
p_itr = 200
densenet_model.train()
total_loss = 0
loss_list = []
acc_list = []
for epoch in range(epochs):
    for samples, labels in train_loader:
        samples, labels = samples.to('cuda'), labels.to('cuda')
        optimizer.zero_grad()
        output = densenet_model(samples)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        scheduler.step()
        
        if itr%p_itr == 0:
            pred = torch.argmax(output, dim=1)
            correct = pred.eq(labels)
            acc = torch.mean(correct.float())
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
            loss_list.append(total_loss/p_itr)
            acc_list.append(acc)
            total_loss = 0
            
        itr += 1

plt.plot(loss_list, label='loss')
plt.plot(acc_list, label='accuracy')
plt.legend()
plt.title('training loss and accuracy')
plt.show()

Below is the complete error log which I am getting

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-9-0e23ed2d21f0> in <module>
     14         output = densenet_model(samples)
     15         loss = criterion(output, labels)
---> 16         loss.backward()
     17         optimizer.step()
     18         total_loss += loss.item()

/opt/conda/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    196                 products. Defaults to ``False``.
    197         """
--> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    199 
    200     def register_hook(self, hook):

/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     92         grad_tensors = list(grad_tensors)
     93 
---> 94     grad_tensors = _make_grads(tensors, grad_tensors)
     95     if retain_graph is None:
     96         retain_graph = create_graph

/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py in _make_grads(outputs, grads)
     34                 if out.numel() != 1:
     35                     raise RuntimeError("grad can be implicitly created only for scalar outputs")
---> 36                 new_grads.append(torch.ones_like(out, memory_format=torch.preserve_format))
     37             else:
     38                 new_grads.append(None)

RuntimeError: CUDA error: device-side assert triggered

I have also gone through this link https://towardsdatascience.com/cuda-error-device-side-assert-triggered-c6ae1c8fa4c3 which fix the same problem I am facing and incorporated the changes but this thing also does not help.

Please let me know where am I going wrong.

Also I am working on kaggle kernels.

user_123454321 · August 8, 2020, 10:47pm

Shouldn’t the labels be 0 indexed ? For 3 outputs in the model you need to have labels from 0-2. But you are returning 3 as well (not adults nor teenagers). Also, when you get this uninformative cuda error, try running on the cpu (with a small batch size for a few batches) as my experience is the cpu errors are much more informative. And when everything runs smoothly, you can move to gpu.

Prasad · August 9, 2020, 6:01am

I have incorporated both the changes you suggest. I have changed the label to 0,1,2 and I am currently running it on cpu, but now I am getting another error which I am not able to understand.

/opt/conda/lib/python3.7/site-packages/PIL/Image.py:961: UserWarning: Palette images with Transparency expressed in bytes should be converted to RGBA images
  "Palette images with Transparency expressed in bytes should be "

Could you please help.

user_123454321 · August 9, 2020, 7:40am

I think you have a palletted image with an alpha image but you are converting to RGB .convert('RGB') (first line inside getitem), and it suggests you to convert to RGBA (A for the alpha channel) to not lose information.

Prasad · August 9, 2020, 9:09am

If I converted RGB to RGBA then I will have 4 channels as input to the first layer, but I am using pretrained denset121 which have an input requirement of 3 channels. Could you please let me know how do I handle this problem of channel mismatch?
Thanks in Advance.

user_123454321 · August 9, 2020, 10:30am

In most cases, alpha channel is usually not needed for the model to work. So RGB is probably sufficient in your case as well. To be sure try to plot the image once you get from your the dataloader and verify that they have not lost lot of information (for the model to discriminate).