Understanding how to output a segmented image after training a network for semantic segmentation

Hello! I am having a hard time understanding the partial results I got after trying to train my network for semantic segmentation. The flow of my work is:

  1. Taking RGB images and feeding them to my network
  2. Computing the loss function

I noticed I had a terrible loss, so I printed out the tensor values of one image that had gone through my network and its counterpart mask. I discovered that my loss was so big because:

  1. The tensor values for my image range from 0-255
  2. The tensor values of the counterpart mask range from 0-60 (0 for background and 1-60 for my 60 classes); a rough sketch of how I checked this is shown right after this list
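
For reference, this is roughly the kind of check I did (a simplified sketch; the exact slicing and the .float() cast are assumptions, and the variable names match my training code below):

import torch

sample_img = img_all[:1].view(-1, 3, 224, 224)
sample_mask = mask_all[:1].view(-1, 1, 224, 224)

print(sample_img.min(), sample_img.max())    # spans roughly 0-255 for me
print(sample_mask.min(), sample_mask.max())  # spans 0-60 (my class ids)

with torch.no_grad():
    out = model(sample_img.float())
print(out.shape, out.min(), out.max())       # what the network actually returns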

Right now I want to understand how semantic segmentation works. These are the questions I would like to raise:

  1. Why are my output image tensor values in the 0-255 range and not 0-60?
  2. How can I show predictions after training? (I would like my code to somehow show the original image together with its segmented output.)

This is my training code:
BATCH_SIZE = 10
EPOCHS = 1

def train(model):
    model.train()
    for epoch in range(EPOCHS):
        for i in tqdm(range(0, len(img_all), BATCH_SIZE)):
            # slice a mini-batch and reshape to [batch_size, channels, height, width]
            batch_img_all = img_all[i:i+BATCH_SIZE].view(-1, 3, 224, 224)
            batch_mask_all = mask_all[i:i+BATCH_SIZE].view(-1, 1, 224, 224)

            model.zero_grad()

            outputs = model(batch_img_all)
            loss = loss_function(outputs, batch_mask_all)
            loss.backward()
            optimizer.step()    # does the update

    # printed once after training; shows only the last epoch and last batch loss
    print(f"Epoch: {epoch}. Loss: {loss}")

    return batch_img_all, batch_mask_all

IMG, MSK = train(model)

I am raising these questions because I read here, in An overview of semantic image segmentation (jeremyjordan.me), that the "goal is to take either a RGB color image (height×width×3) or a grayscale image (height×width×1) and output a segmentation map where each pixel contains a class label represented as an integer (height×width×1)". Again, I did not get this kind of output: instead of labeling my images' pixels with class labels, I got values from 0-255. Can anyone help me find where I might have made a mistake? This is the network architecture that I use, by the way:

###################################################################################################
#ESNet: An Efficient Symmetric Network for Real-time Semantic Segmentation
#Paper-Link: https://arxiv.org/pdf/1906.09826.pdf
###################################################################################################

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torchsummary import summary

class DownsamplerBlock(nn.Module):
    def __init__(self, ninput, noutput):
        super().__init__()

        self.conv = nn.Conv2d(ninput, noutput-ninput, (3, 3), stride=2, padding=1, bias=True)
        self.pool = nn.MaxPool2d(2, stride=2)
        self.bn = nn.BatchNorm2d(noutput, eps=1e-3)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, input):
        x1 = self.pool(input)
        x2 = self.conv(input)

        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])

        output = torch.cat([x2, x1], 1)
        output = self.bn(output)
        output = self.relu(output)
        return output

class UpsamplerBlock(nn.Module):
    def __init__(self, ninput, noutput):
        super().__init__()

        self.conv = nn.ConvTranspose2d(ninput, noutput, 3, stride=2, padding=1, output_padding=1, bias=True)
        self.bn = nn.BatchNorm2d(noutput, eps=1e-3)

    def forward(self, input):

        output = self.conv(input)
        output = self.bn(output)

        return F.relu(output)
		
class FCU(nn.Module):
    def __init__(self, chann, kernel_size,dropprob, dilated): 
        """
        Factorized Convolution Unit
        """     
        super(FCU,self).__init__()

        padding = int((kernel_size-1)//2) * dilated

        self.conv3x1_1 = nn.Conv2d(chann, chann, (kernel_size,1), stride=1, padding=(int((kernel_size-1)//2)*1,0), bias=True)

        self.conv1x3_1 = nn.Conv2d(chann, chann, (1,kernel_size), stride=1, padding=(0,int((kernel_size-1)//2)*1), bias=True)

        self.bn1 = nn.BatchNorm2d(chann, eps=1e-03)

        self.conv3x1_2 = nn.Conv2d(chann, chann, (kernel_size,1), stride=1, padding=(padding,0), bias=True, dilation = (dilated,1))

        self.conv1x3_2 = nn.Conv2d(chann, chann, (1,kernel_size), stride=1, padding=(0,padding), bias=True, dilation = (1, dilated))

        self.bn2 = nn.BatchNorm2d(chann, eps=1e-03)
        
        self.relu = nn.ReLU(inplace = True)
        self.dropout = nn.Dropout2d(dropprob)
        
    def forward(self, input):
        residual = input
        output = self.conv3x1_1(input)
        output = self.relu(output)
        output = self.conv1x3_1(output)
        output = self.bn1(output)
        output = self.relu(output)

        output = self.conv3x1_2(output)
        output = self.relu(output)
        output = self.conv1x3_2(output)
        output = self.bn2(output)   

        if (self.dropout.p != 0):
            output = self.dropout(output)
        
        return F.relu(residual+output,inplace=True) 


class PFCU(nn.Module):
    def __init__(self,chann):
        """
        Parallel Factorized Convolution Unit
        """         
    
        super(PFCU,self).__init__()
        
        self.conv3x1_1 = nn.Conv2d(chann, chann, (3,1), stride=1, padding=(1,0), bias=True)

        self.conv1x3_1 = nn.Conv2d(chann, chann, (1,3), stride=1, padding=(0,1), bias=True)

        self.bn1 = nn.BatchNorm2d(chann, eps=1e-03)

        self.conv3x1_22 = nn.Conv2d(chann, chann, (3,1), stride=1, padding=(2,0), bias=True, dilation = (2,1))
        self.conv1x3_22 = nn.Conv2d(chann, chann, (1,3), stride=1, padding=(0,2), bias=True, dilation = (1,2))

        self.conv3x1_25 = nn.Conv2d(chann, chann, (3,1), stride=1, padding=(5,0), bias=True, dilation = (5,1))
        self.conv1x3_25 = nn.Conv2d(chann, chann, (1,3), stride=1, padding=(0,5), bias=True, dilation = (1,5))

        self.conv3x1_29 = nn.Conv2d(chann, chann, (3,1), stride=1, padding=(9,0), bias=True, dilation = (9,1))
        self.conv1x3_29 = nn.Conv2d(chann, chann, (1,3), stride=1, padding=(0,9), bias=True, dilation = (1,9))

        self.bn2 = nn.BatchNorm2d(chann, eps=1e-03)

        self.dropout = nn.Dropout2d(0.3)

    def forward(self, input):
        residual = input
        output = self.conv3x1_1(input)
        output = F.relu(output)
        output = self.conv1x3_1(output)
        output = self.bn1(output)
        output = F.relu(output)

        output2 = self.conv3x1_22(output)
        output2 = F.relu(output2)
        output2 = self.conv1x3_22(output2)
        output2 = self.bn2(output2)
        if (self.dropout.p != 0):
            output2 = self.dropout(output2)

        output5 = self.conv3x1_25(output)
        output5 = F.relu(output5)
        output5 = self.conv1x3_25(output5)
        output5 = self.bn2(output5)
        if (self.dropout.p != 0):
            output5 = self.dropout(output5)

        output9 = self.conv3x1_29(output)
        output9 = F.relu(output9)
        output9 = self.conv1x3_29(output9)
        output9 = self.bn2(output9)
        if (self.dropout.p != 0):
            output9 = self.dropout(output9)

        return F.relu(residual+output2+output5+output9,inplace=True)

		
class ESNet(nn.Module):
    def __init__(self, classes):
        super().__init__()
        #-----ESNET---------#
        self.initial_block = DownsamplerBlock(3,16)

        self.layers = nn.ModuleList()
        
        for x in range(0, 3):
           self.layers.append(FCU(16, 3, 0.03, 1))  
        
        self.layers.append(DownsamplerBlock(16,64))

        for x in range(0, 2):
           self.layers.append(FCU(64, 5, 0.03, 1))  

        self.layers.append(DownsamplerBlock(64,128))

        for x in range(0, 3):   
            self.layers.append(PFCU(chann=128)) 

        self.layers.append(UpsamplerBlock(128,64))
        self.layers.append(FCU(64, 5, 0, 1))
        self.layers.append(FCU(64, 5, 0, 1))

        self.layers.append(UpsamplerBlock(64,32))
        self.layers.append(FCU(32, 3, 0, 1))
        self.layers.append(FCU(32, 3, 0, 1))

        self.output_conv = nn.ConvTranspose2d( 32, classes, 2, stride=2, padding=0, output_padding=0, bias=True)
      

    def forward(self, input):
        output = self.initial_block(input)

        for layer in self.layers:
            output = layer(output)

        output = self.output_conv(output)

        return output


"""print layers and params of network"""
if __name__ == '__main__':
    model = ESNet(classes=60)
    summary(model,(3,224,224))
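
As an extra sanity check, I also tried something like the quick sketch below (if I am reading the architecture correctly, the output should have one channel per class):

with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)
    logits = model(dummy)
print(logits.shape)   # should be torch.Size([1, 60, 224, 224]) for classes=60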

I don’t see any code snippet that would explain why the output is in the range [0, 255], so if I’m not missing something, I assume your output contains class logits in the range [-Inf, Inf] with the shape [batch_size, nb_classes, height, width].
If that’s the case, you can get the predicted class for each pixel via:

pred = torch.argmax(output, dim=1)
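
If you also want to display the original image together with its segmented output (your second question), one possible sketch is below. The matplotlib plotting and the assumption that your images are still in the 0-255 range are mine; adjust the conversion to match your preprocessing:

import matplotlib.pyplot as plt
import torch

model.eval()
with torch.no_grad():
    output = model(batch_img_all)           # [batch_size, nb_classes, height, width]
pred = torch.argmax(output, dim=1)          # [batch_size, height, width], one class id per pixel

img = batch_img_all[0].permute(1, 2, 0).cpu().numpy()   # CHW -> HWC for matplotlib
mask = pred[0].cpu().numpy()

fig, axes = plt.subplots(1, 2)
axes[0].imshow(img.astype('uint8'))                      # assumes values are still 0-255
axes[0].set_title('input image')
axes[1].imshow(mask, cmap='tab20', vmin=0, vmax=60)      # map class ids to distinct colors
axes[1].set_title('predicted segmentation')
plt.show()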