CUDA error: device-side assert triggered in multiclass segmentation task

I am working on an image segmentation task to segment lines on a football pitch. Here is my code to generate the training data, which includes the images and the ground-truth masks.

import os
import json

import cv2
import numpy as np
from PIL import Image
from torch.utils.data import Dataset

# LINE_IDX_MAP, TARGET_SIZE, img_tfm and mask_tfm are defined elsewhere in the script

class NewLoader(Dataset):
    def __init__(self,image_path, transform=None):
        self.transform = transform
        self.mean = np.array([0.485, 0.456, 0.406])
        self.std = np.array([0.229, 0.224, 0.225])
        cls_list = list(LINE_IDX_MAP.keys())
        frames = [f for f in os.listdir(image_path) if ".jpg" in f]
        self.data = []
        self.n_samples = 0
        for frame in frames:
            frame_index = frame.split(".")[0]
            annotation_file = os.path.join(image_path, f"{frame_index}.json")
            if not os.path.exists(annotation_file):
                continue
            with open(annotation_file, "r") as f:
                groundtruth_lines = json.load(f)
            img_path = os.path.join(image_path, frame)
            if groundtruth_lines:
                self.data.append({
                    "image_path": img_path,
                    "annotations": groundtruth_lines,
                })
    def __getitem__(self, idx):
        item = self.data[idx]

        img = cv2.imread(item["image_path"])
        img = cv2.resize(img, (TARGET_SIZE[1],TARGET_SIZE[0]), interpolation=cv2.INTER_LINEAR)
        mask = np.zeros(img.shape[:-1], dtype=np.uint8)
        img = Image.fromarray(img)
#         img = np.asarray(img) 
#         img = (img - self.mean) / self.std
#         img = img.transpose((2, 0, 1))
        for class_number, class_ in enumerate(LINE_IDX_MAP):
            if class_ in item["annotations"]:
                line = item["annotations"][class_]
                prev_point = line[0]
                for i in range(1, len(line)):
                    next_point = line[i]
                    cv2.line(mask,
                            (int(prev_point["x"] * mask.shape[1]), int(prev_point["y"] * mask.shape[0])),
                            (int(next_point["x"] * mask.shape[1]), int(next_point["y"] * mask.shape[0])),
                            class_number + 1,
                            2)
                    prev_point = next_point
        img = img_tfm(img)
        mask = mask_tfm(np.expand_dims(mask, axis=-1))
        return img, mask.squeeze()
    
    def __len__(self):
        return len(self.data)
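
For completeness, here is roughly how the dataset gets wired up. The transforms and the path below are placeholder stand-ins, not my exact img_tfm / mask_tfm:

import torch
from torch.utils.data import DataLoader
from torchvision import transforms

# Placeholder transforms -- stand-ins for the real img_tfm / mask_tfm
img_tfm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
mask_tfm = transforms.Lambda(lambda m: torch.from_numpy(m).permute(2, 0, 1).long())

dataset = NewLoader("data/train")                         # placeholder path
loader = DataLoader(dataset, batch_size=4, shuffle=True)
imgs, masks = next(iter(loader))                          # imgs: [4, 3, H, W], masks: [4, H, W]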

I am using a custom Dice loss combined with a cross-entropy loss, but a few seconds after training starts I get this error:

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

I read in another post that setting os.environ["CUDA_LAUNCH_BLOCKING"] = "1" in the script may help with identifying the error. So far it looks like the loss computation is the problem, but the traceback doesn't say exactly what.
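
The usual advice is to set the variable before anything touches CUDA, i.e. at the very top of the script before importing torch, or to export it in the shell before launching the training script:

import os
# Make kernel launches synchronous so the stack trace points at the
# operation that actually triggered the assert instead of a later call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # imported only after the variable is set

The shell equivalent is CUDA_LAUNCH_BLOCKING=1 python <training script>.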

Here is a snippet of my Dice loss as well:

import torch.nn as nn
import torch.nn.functional as F

class DiceBCELoss(nn.Module):
    def __init__(self, weight=None, size_average=True, include_entropy=True):
        super(DiceBCELoss, self).__init__()
        self.bce_loss = nn.CrossEntropyLoss()
        self.include_entropy = include_entropy

    def forward(self, inputs, targets, smooth=1):
        if self.include_entropy:
            BCE = self.bce_loss(inputs, targets)
        targets = F.one_hot(targets, num_classes=NUM_CLASSES).permute(0, 3, 1, 2).float()

        # flatten label and prediction tensors
        inputs = inputs.view(-1,NUM_CLASSES)
        targets = targets.contiguous().view(-1, NUM_CLASSES)
        
        # inputs = F.softmax(inputs, dim=1)

        intersection = (inputs * targets).sum()
        dice_loss = 1 - (2.*intersection + smooth)/(
            inputs.sum() + targets.sum() + smooth)
        if self.include_entropy:
            Dice_BCE = BCE + dice_loss
        else:
            Dice_BCE = dice_loss

        return Dice_BCE
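
As written, the criterion takes raw logits of shape [N, NUM_CLASSES, H, W] and integer class-index targets of shape [N, H, W], which is how nn.CrossEntropyLoss consumes them. A rough sketch of how I call it (model and loader are placeholder names here):

criterion = DiceBCELoss()

for imgs, masks in loader:
    imgs = imgs.to("cuda")
    masks = masks.to("cuda").long()   # CrossEntropyLoss expects long class indices
    logits = model(imgs)              # [N, NUM_CLASSES, H, W], no softmax applied
    loss = criterion(logits, masks)
    loss.backward()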

Often the device assert is triggered in nn.CrossEntropyLoss when invalid indices are passed in the target. Make sure the target contains class indices in the range [0, nb_classes - 1], in case the criterion is also failing in your code.

So my application has 27 classes including the background, but the background isn't really necessary at this point, so that leaves 26 classes representing the pitch lines. Taking a look at my targets, they look like this:

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.uint8)
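
That printout is truncated, so to check the full range of the target indices against the advice above I can run something like this on the CPU (loader is the DataLoader from earlier, NUM_CLASSES the constant used in the loss):

# Every target index must lie in [0, NUM_CLASSES - 1] for both
# nn.CrossEntropyLoss and F.one_hot; an out-of-range index on the CPU
# raises a readable error instead of a device-side assert.
for _, masks in loader:
    lo, hi = int(masks.min()), int(masks.max())
    assert 0 <= lo and hi <= NUM_CLASSES - 1, (
        f"target index out of range: min={lo}, max={hi}, NUM_CLASSES={NUM_CLASSES}")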

And when I run a single forward pass, the predicted mask has values like so:

tensor([[[17, 17, 17,  ..., 18, 17, 17],
         [22, 17, 17,  ..., 18, 17, 17],
         [22, 22, 18,  ..., 18, 18, 17],
         ...,
         [17, 17, 18,  ..., 18, 18, 18],
         [17, 17, 17,  ..., 18, 18, 18],
         [17, 17, 17,  ..., 18, 18, 18]]], device='cuda:0')

I still don't see where the problem is coming from.