I get a sample batch of data from dataloader, I set batch size to 1. The image shape is
1 x 3 x 224 x 224, the label shape is 1 x 7 x 7 x 5. Now I tried to calculate the loss for one image, but i got the nan value, why? I also tried to train the network for whole batch, the loss is still nan. Thank you for reading.
face_data = FaceAnnoDataset(root_dir=path, img_dir ='image', anno_dir='label', txtfile='image.txt', transform=transforms.Compose([ transforms.ToPILImage(), transforms.Resize((224,224)), transforms.ToTensor(), transforms.Normalize([0.2341, 0.2388, 0.2622], [0.2210, 0.2150, 0.2543])]) ) train_loader = DataLoader(face_data, batch_size=1, shuffle=False, pin_memory=True, num_workers=2, collate_fn=collate_fn) # get one sample of batch, shape: 1 x 3 x 224 x 224 train_sample = next(iter(train_loader)) image = train_sample print(image.size()) anno = train_sample print(anno.size()) model = DetectionNet() # place model to GPU model.cuda() image = image.cuda() anno = anno.cuda() y_pred = model(image) y_pred = y_pred.permute(0,2,3,1) print(y_pred.size()) loss = loss_fn(y_pred, anno) print(loss) output: torch.Size([1, 3, 224, 224]) torch.Size([1, 7, 7, 5]) torch.Size([1, 7, 7, 5]) tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
def loss_fn(y_pred, y): loss = conf_regression_loss(y_pred, y) + bbox_regression_loss(y_pred, y) return loss def conf_regression_loss(y_pred, y, lamda=0.5): ''' y_pred: output of forward propagation, shape: batch x grid x grid x 5 y: ground truth, shape: batch x grid x grid x 5 lamda: parameter of loss function of loss_no_obj, as no_obj cells dominates, we need to decrease the loss, otherwise no_obj overpowers the loss return: confidence loss ''' # get the mask of actived grid cell, in which has an object according to ground # truth label mask = y[:,:,:,0] # shape: batch x grid x grid x 1 y_pred_c = y_pred[:,:,:,0] # if object exits in the cell loss_obj = torch.sum((mask * y_pred_c - mask)**2) # mask == y_c, here confidence is equavalent to mask # if object doesn't exit in the cell, we have to decrease # the loss as number of cells which doesn't contain an object is # much larger that cells do # get mask of no object where 1 indicates no object mask_no_obj = mask.clone() mask_no_obj[mask==0] = 1 mask_no_obj[mask==1] = 0 loss_no_obj = torch.sum((y_pred_c * mask_no_obj - mask)**2) loss_no_obj = loss_no_obj * lamda loss = loss_obj + loss_no_obj return loss def bbox_regression_loss(y_pred, y, lamda=5): mask = y[:,:,:,0] # loss of offset x, y loss_offset = torch.sum((mask * y_pred[:,:,:,1] - y[:,:,:,1])**2 +\ (mask * y_pred[:,:,:,2] - y[:,:,:,2])**2) # loss of width and height loss_w_h = torch.sum((mask * torch.sqrt(y_pred[:,:,:,3]) - torch.sqrt(y[:,:,:,3]))**2 + \ (mask * torch.sqrt(y_pred[:,:,:,3]) - torch.sqrt(y[:,:,:,3]))**2) loss = lamda * (loss_offset + loss_w_h) return loss