Hello there. I’m new to PyTorch and I’m trying to utilize the ENet NN (from github) to classify underwater images from SUIM dataset and evaluate its performance. Through this post I tried to implement the class index mapping of the masks but when I start training the network, some specific masks cause a crash on cuda. I’ve managed to isolate some masks that cause this strange behaviour, as described below.
Follow the main snippets and code adapted from the previously mentioned post:
import torch.nn.functional as F
from collections import OrderedDict
from train import Train
from test import Test
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms.functional as TF
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as tvtransforms
import transforms as ext_transforms
import torch.optim.lr_scheduler as lr_scheduler
import os
import numpy as np
import matplotlib.pyplot as plt
import glob
from PIL import Image
from enet import ENet
from iou import IoU
import utils
from torchvision import models
class CustomDataset(Dataset):
def __init__(self, image_paths, target_paths, transform=None, train=True):
self.image_paths = image_paths
self.target_paths = target_paths
self.transform = transform
self.img = None
self.msk = None
self.mapping = {
(0, 0, 0) : 0, # 'Background'
(0, 0, 255) : 1, # 'Human Divers'
(0, 255, 0) : 2, # 'Aquatic Plants and Sea-Grass'
(0, 255, 255) : 3, # 'Wrecks and Ruins'
(255, 0, 0): 4, # 'Robots'
(255, 0, 255) : 5, # 'Reefs and Intertebrates'
(255, 255, 0) : 6, # 'Fishs and Vertebrates'
(255, 255, 255) : 7 # 'Sea-Floor and Rocks'
}
def mask_to_class_rgb(self, mask):
mask = np.array(mask)
mask = mask[...,:3]
mask = torch.from_numpy(mask)
mask = torch.squeeze(mask)
print('Unique values in rgb: ', torch.unique(mask))
class_mask = mask
class_mask = class_mask.permute(2, 0, 1).contiguous()
h, w = class_mask.shape[1], class_mask.shape[2]
mask_out = torch.empty(h, w, dtype=torch.long)
for i in self.mapping:
idx = (class_mask == torch.tensor(i, dtype=torch.uint8).unsqueeze(1).unsqueeze(2))
validx = (idx.sum(0) == 3)
mask_out[validx] = torch.tensor(self.mapping[i], dtype=torch.long)
print('Unique values mapped: ', torch.unique(mask_out))
return mask_out
def __getitem__(self, index):
img = Image.open(self.image_paths[index]).convert("RGB")
msk = Image.open(self.target_paths[index])
img_new = tvtransforms.Resize((std_size, std_size))(img)
mask_new = tvtransforms.Resize((std_size, std_size), Image.NEAREST)(msk)
if self.transform is not None:
img_new = self.transform(img_new)
img_new = img_new.float()
else:
img_new = img_new.float()
norm = tvtransforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
img_new = norm(img_new)
mask_new = self.mask_to_class_rgb(mask_new)
return img_new, mask_new
def __len__(self):
return len(self.image_paths)
This is the training snippet:
best_miou = 0
if 'start_epoch' in globals():
start_epoch = start_epoch
else:
start_epoch = 0
for epoch in range(start_epoch, num_epochs):
print(">>>> [Epoch: {0:d}] Training".format(epoch))
my_lr_scheduler.step()
epoch_loss, (iou, miou) = train.run_epoch(True) # Decides whether to print loss at each step
print(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".
format(epoch, epoch_loss, miou))
if (epoch + 1) % 10 == 0 or epoch + 1 == num_epochs:
print(">>>> [Epoch: {0:d}] Validation".format(epoch))
loss, (iou, miou) = val.run_epoch(True)
print(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".
format(epoch, loss, miou))
# Print per class IoU on last epoch or if best iou
if epoch + 1 == num_epochs or miou > best_miou:
for key, class_iou in zip(class_encoding.keys(), iou):
print("{0}: {1:.4f}".format(key, class_iou))
# Save the model if it's the best thus far
if miou > best_miou:
print("\nBest model thus far. Saving...\n")
best_miou = miou
utils.save_checkpoint_mod(net, optimizer, epoch + 1, best_miou, 'teste', 'save')
Here is the output:
>>>> [Epoch: 0] Training
Unique values in rgb: tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56,
59, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 72, 73, 74,
75, 76, 80, 81, 83, 84, 86, 87, 91, 93, 94, 95, 97, 119,
161, 162, 166, 167, 170, 171, 172, 175, 176, 177, 178, 179, 182, 183,
184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
198, 199, 200, 201, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
215, 216, 217, 218, 219, 221, 222, 223, 224, 225, 226, 227, 228, 229,
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243,
244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255],
dtype=torch.uint8)
Unique values mapped: tensor([ 0, 42915137138293276, 43204295812155417,
..., 66953201512118048, 67517152192928824,
67519561669712197])
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-59-80f00525c548> in <module>()
10 my_lr_scheduler.step()
11
---> 12 epoch_loss, (iou, miou) = train.run_epoch(True) # Decides whether to print loss at each step
13
14 print(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".
4 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
2218 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2219 elif dim == 4:
-> 2220 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2221 else:
2222 # dim == 3 or dim > 4
IndexError: Target 52516180046732819 is out of bounds.
I’m attaching the isolated mask that causes this behaviour. Other masks that are similar to this one cause equal outcome. Also, I’ve edited this mask and made it entirely black, which didn’t crash during training.
Any help would be greatly appreciated!