Loss becomes zero after a few dozen pictures

I am using Coco Dataset’s pictures and mask image that I crreated with the below script to do sematic segmentation.

Why is my cross entropy loss function returning zero after a few dozen pictures?

Screenshot from 2020-04-22 16-54-09


def make_palatte (classes):
	plt_dict = {0: ["backgorund", (0,0,0)]}

	all_colors_map = []
	palette = []

	for i in range(255):
		for ii in range(255):
			for iii in range(255):
				adding = [i, ii, iii]
				if adding ==[0,0,0] or adding == [255, 255, 255]:
					pass # ["backgorund", (0,0,0)] 
				all_colors_map.append(adding)
			

	distance = len(all_colors_map)/(len(classes)+10)  # buffer
	distance = math.floor(distance)
	for one_class in classes:
		#print(one_class)
		id = one_class["id"] #starts with 1 
		name = one_class["name"] # word
		color = all_colors_map[int(id)*distance]
		palette.extend(color)
		plt_dict[id] = [name, tuple(color)]

		# 1 already taken by background

	palette.extend([255,255,255])
	plt_dict[len(plt_dict)+2] =["ambiguous", (255,255,255)] 
	
	return plt_dict, palette

def mask_maker (palette_dict, img_id, height, width, palette, segmentation, export_dir):
	
	im = Image.new("P", (height, width), color=(0,0,0)) # 0 0 0 >> background
	im.putpalette(palette)
	d =  ImageDraw.Draw(im)
	if len(segmentation) == 0:
		im.save(export_dir)
		return

	for segment in segmentation: #a["segmentation"] = xy coordinates
		#print(segment["segmentation"])
		xy_tup_list = []
		category_id = int(segment["category_id"])
		if len(segment["segmentation"]) == 0:
			im.save(export_dir)
			return 
		for idx, point in enumerate(segment["segmentation"][0]):
			if idx % 2 == 0: #STARTS WITH 0
				x = point
			if idx % 2 !=0:
				y = point
				xy_tup_list.append((x, y))
				x = None
				y = None		
		d.polygon(xy_tup_list, fill=category_id)
		#d.polygon(xy, fill=category_id)
	im.save(export_dir)

I randomly picked 1000train pictures and 600 val from COCO 2014 dataset excluding ones that have iscrowd=1. I am trying to do semantic segmentation on those 90 coco classes+ background.

I am using Pyramid Scenen Parsing Network which I pretty much copied from here except for the dataloader. https://github.com/YutaroOgawa/pytorch_advanced/blob/master/3_semantic_segmentation/3-7_PSPNet_training.ipynb.
This original model did fine tuning with VOC pascal dataset. but i am trying to use COCO datset instead.

To create the mask, I basically drew polygon with PIL’s drawimage and assigned value by using “P” mode and original color pallet that has 90 colors and corresponding numbers.

below is my train function

def train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs):


    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("available:", device)
    net.to(device)

    torch.backends.cudnn.benchmark = True


    num_train_imgs = len(dataloaders_dict["train"].dataset)
    num_val_imgs = len(dataloaders_dict["val"].dataset)
    batch_size = dataloaders_dict["train"].batch_size


    iteration = 1
    logs = []

    # multiple minibatch
    batch_multiplier = 3

    for epoch in range(num_epochs):

        t_epoch_start = time.time()
        t_iter_start = time.time()
        epoch_train_loss = 0.0  
        epoch_val_loss = 0.0  

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  
                scheduler.step()  
                optimizer.zero_grad()
                print('(train)')

            else:
                if((epoch+1) % 5 == 0):
                    net.eval()   
                    print('-------------')
                    print('(val)')
                else:
                    continue

            count = 0  # multiple minibatch
            for imges, anno_class_imges in dataloaders_dict[phase]:
                
                if imges.size()[0] == 1:
                    continue


                imges = imges.to(device)
                anno_class_imges = torch.squeeze(anno_class_imges)
                anno_class_imges = anno_class_imges.to(device)

                
                # multiple minibatch
                if (phase == 'train') and (count == 0):
                    optimizer.step()
                    optimizer.zero_grad()
                    count = batch_multiplier

          
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(imges)
                    loss = criterion(
                        outputs, anno_class_imges.long()) / batch_multiplier
                    print("loss:  " loss)
                    
                    # 
                    if phase == 'train':
                        loss.backward()  #
                        count -= 1  # multiple minibatch

                        if (iteration % 10 == 0):  
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('iteration {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item()/batch_size*batch_multiplier, duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item() * batch_multiplier
                        iteration += 1


                    else:
                        epoch_val_loss += loss.item() * batch_multiplier

       
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss/num_train_imgs, epoch_val_loss/num_val_imgs))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()


        log_epoch = {'epoch': epoch+1, 'train_loss': epoch_train_loss /
                     num_train_imgs, 'val_loss': epoch_val_loss/num_val_imgs}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")


    torch.save(net.state_dict(), 'weights/pspnet50_' +
               str(epoch+1) + '.pth')
num_epochs = 30
torch.cuda.empty_cache()
train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs=num_epochs)

Could you explain your use case a bit, please?
How did you create the masks? Which values do they have and which shape?
What kind of model are you using and how many data samples are you using?

Thank you so much. I added some more information.

This is one of my mask images. What is kind of curious is that I have to do squeeze to get rid of one dimension before feeding it to GPU, which the original model did not have to do. Could this be related?

That sounds weird.
Which dimension are you squeezing and which shape does the target have before squeezing it?
The expected shape for a multi-class segmentation for the target is [batch_size, height, width] and the target tensor should contain the class indices in the range [0, nb_classes-1], if you are using nn.CrossEntropyLoss.

This is pre-squeeze size of my target data>> targets of size: : [4, 1, 300, 300] squeeze reduces this to [4,300,300]
also I noticed that the size of the target image does not match the roiginal image.

Also a side question. Is there any way to make masks out of xy coordinates? do people code tehir own script everytime?
COCO_train2014_000000581097

What min and max values does the target contain?
The shape looks OK after you squeeze it, if target is a LongTensor and contains the class indices in the range [0, nb_classes-1].

I would guess that you could find some good implementations in this discussion board or alternatively for numpy (which should be easily portable to PyTorch) :wink:

I dont know how to interpret these.
print("max: ", torch.max(anno_class_imges), "min: ", torch.min(anno_class_imges))

max: tensor(0.3294) min: tensor(0.)

this is my datatransoform


    def __init__(self, input_size):
        self.data_transform = transforms.Compose([
                    transforms.Resize((input_size, input_size)),
                    transforms.ToTensor()
                ])
        

Are you transforming the target image with the provided transformation or just the data tensor?
In the former case, you will get a normalized target, which is wrong for a segmentation use case.
You would have to map the colors of your target to class indices.
E.g. “blue” -> class0, “red” -> class1 etc.
Here is an example on how to create a color mapping.

the guy I copied from yes does normalize the data, but I did not, so I just commented out those lines… I just made PIL image into tensors. I did map 90 colors + background in the range of 0, 0, 0 to 255, 255, 255 and I did PIL’s putlpalette to each mask image I was creating.

Should my torch max and min values have integer values, instead of float? I thought I assigned numbers ranging from 0 to 91 in my color map. Also, you are using cmap in matplolib instead of PIL, is using PIL not reccomended?

Yes, the target should be a LongTensor. The min values should be 0, while the max value should be nb_classes-1.

No, use whatever works for you. I’m probably more familiar with the matplotlib color map than with PIL, but don’t have any recommendations.

thank you so much.You have been very kind. I’ll look into my code a little bit more.

I normalized the target and original image data and it somehow solved the problem which I don’t quiet understand. closing the issue.