Loss is nan, stopping training when training Mask-RCNN multi-class segmentation

Wassim_Sliti · May 24, 2021, 8:12pm

number of train data: 346
number of test data: 69
Epoch: [0] [0/346] eta: 0:35:20 lr: 0.000019 loss: -312.6024 (-312.6024) loss_classifier: 1.5789 (1.5789) loss_box_reg: 0.1299 (0.1299) loss_mask: -314.3485 (-314.3485) loss_objectness: 0.0266 (0.0266) loss_rpn_box_reg: 0.0106 (0.0106) time: 6.1275 data: 0.1599 max mem: 0
Loss is nan, stopping training
{‘loss_classifier’: tensor (nan, grad_fn = ), ‘loss_box_reg’: tensor (nan, grad_fn = ), ‘loss_mask’: tensor (nan, grad_fn = ), ’ tensor (nan, grad_fn = ), ‘loss_rpn_box_reg’: tensor (nan, grad_fn = )}
An exception has occurred, use% tb to see the full traceback.

SystemExit : 1

And this is the dataset code

class maskrcnn_Dataset(torch.utils.data.Dataset):

def __init__(self, root, transforms=None):
    self.root = root
    self.transforms = transforms
    # load all image files, sorting them to
    # ensure that they are aligned
    self.imgs = list(sorted(os.listdir(os.path.join(root, "images"))))
    self.masks = list(sorted(os.listdir(os.path.join(root, "masks"))))
    #self.class_masks = list(sorted(os.listdir(os.path.join(root, "SegmentationClass"))))

def __getitem__(self, idx):
    # load images ad masks
    img_path = os.path.join(self.root, "images", self.imgs[idx])
    x=self.imgs[idx].split('.')
    mask_path = os.path.join(self.root, "masks", self.masks[idx])
    #class_mask_path = os.path.join(self.root, "SegmentationClass", self.class_masks[idx])
    
    #read and convert image to RGB
    img = cv2.imread(img_path)
    mask_for_all=[]
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # note that we haven't converted the mask to RGB,
    # because each color corresponds to a different instance
    # with 0 being background
    # mask = Image.open(mask_path)
    mask_folder=os.path.join(self.root,"masks")
    source_mask = os.path.join(mask_folder, x[0])
    #print(os.listdir(source_mask))
    boxes = []
    xx=trier(os.listdir(source_mask))
    #print(xx)
    for file_name in xx:
        mask = Image.open(os.path.join(source_mask,file_name))
        mask = np.array(mask)
        mask_for_all.append(mask)
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[1:]
        masks = mask == obj_ids[:, None, None]
        num_objs = len(obj_ids)
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

    
    
    
    num_objs=len(boxes)
    boxes = torch.as_tensor(boxes, dtype=torch.float32)
    # there is only one class
    if(self.root.find("train")!=-1):
        #print("bisgltjf")
        
        labels =class_ids_train[class_ids_train_names.index(self.imgs[idx])]
        #print(labels)
    else:
        labels =class_ids_val[class_ids_val_names.index(self.imgs[idx])]
        #print('l3assba')
        
        
        
    
    #labels = np.array([])
    #for i in range(masks.shape[0]):
     #   labels = np.append(labels, (masks[i] * class_mask).max())
    
    labels = torch.as_tensor(labels, dtype=torch.int64)
    #print(boxes,":",labels)
    masks = torch.as_tensor(mask_for_all, dtype=torch.uint8)
    #print(labels)
    #print(masks)
    #print(masks.shape)

    image_id = torch.tensor([idx])
    area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
    # suppose all instances are not crowd
    iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
    #print(img.shape)
    #print(self.imgs[idx])
    

    target = {}
    target["boxes"] = boxes
    #print(boxes)
    target["labels"] = labels
    #print(labels.shape)
    target["masks"] = masks
    #print(masks.shape)
    target["image_id"] = image_id
    #print(image_id.shape)
    target["area"] = area
    #print(area)
    target["iscrowd"] = iscrowd
    #print(iscrowd.shape)

    if self.transforms is not None:
        img, target = self.transforms(img, target)

    return img, target

def __len__(self):
    return len(self.imgs)

ptrblck · May 25, 2021, 4:59am

I’m not sure, how you are calculating the loss_mask, but do you expect it to be negative, as it’s a bit unusual? If not, I guess this might cause your model to diverge and create invalid outputs in the end.

Wassim_Sliti · May 25, 2021, 3:19pm

thank you for your replying, I do not expect it to be negative and I use these three helper functions
to train my model with this data set

from engine import train_one_epoch, evaluate
import utils

Wassim_Sliti · May 26, 2021, 9:53am

thank you for your replying, I do not expect it to be negative and I use these three helper functions
to train my model with this data set

from engine import train_one_epoch, evaluate
import utils