After placing a residual block in the model, the loss value does not decrease

Hi everyone! :smiley:

Could you help me with the following issue, please?

My network architecture is a basic CNN, but I added a residual block to see how the model would perform. The loss value does not decrease and I don't know what the problem is.
For the residual block, I take the output of conv2 (128 channels) and use it as the input to the residual block. I then concatenate the output of the residual block with the output of conv8 (128 channels), resulting in a tensor with 256 channels.
Am I using the residual block correctly?

Here is the model architecture:

# Imports
import torch
import torch.nn as nn
from torchvision import models
import torch.nn.functional as F

class ScaleLayer(nn.Module):

    def __init__(self, init_value=1e-3):
        super().__init__()
        self.scale = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, input):
        return input * self.scale

def weights_init(model):
    if type(model) in [nn.Conv2d, nn.Linear]:
        nn.init.xavier_normal_(model.weight.data)
        nn.init.constant_(model.bias.data, 0.1)

class ResidualBlock(nn.Module):
    def __init__(self):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Sequential(
            # conv1
            nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 128),
        )

    def forward(self, x):
        conv1=self.conv1(x)
        return conv1 

class Color_model(nn.Module):
    def __init__(self):
        super(Color_model, self).__init__()
        self.residual_block = ResidualBlock().cuda()
        self.conv1 = nn.Sequential(
            # conv1
            nn.Conv2d(in_channels = 1, out_channels = 64, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 2, padding = 1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 64),
        )
        
        self.conv2 = nn.Sequential(
            # conv2
            nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 2, padding = 1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 128),
        )
        
        self.conv3 = nn.Sequential(
            # conv3
            nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 2, padding = 1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 256),
        )

        self.conv4 = nn.Sequential(
            # conv4
            nn.Conv2d(in_channels = 256, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 512),
        )

        self.conv5 = nn.Sequential(
            # conv5
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 2, dilation = 2),
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 2, dilation = 2),
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 2, dilation = 2),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 512),
        )

        self.conv6 = nn.Sequential(
            # conv6
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 2, dilation = 2),
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 2, dilation = 2),
            nn.ReLU(),
            nn.Conv2d(in_channels = 512, out_channels = 512, kernel_size = 3, stride = 1, padding = 2, dilation = 2),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 512),
        )

        self.conv7 = nn.Sequential(
            # conv7
            nn.Conv2d(in_channels = 512, out_channels = 256, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 256, out_channels = 256, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
            nn.ReLU(),
            nn.BatchNorm2d(num_features = 256),
        )

        self.conv8 = nn.Sequential(
            # conv8
            nn.ConvTranspose2d(in_channels = 256, out_channels = 128, kernel_size = 4, stride = 2, padding = 1, dilation = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
            nn.ReLU(),
            nn.Conv2d(in_channels = 128, out_channels = 128, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
            nn.ReLU(),
        )

        # self.conv9 = nn.Sequential(
        #     # conv9
        #     nn.ConvTranspose2d(in_channels = 128, out_channels = 64, kernel_size = 4, stride = 2, padding = 1, dilation = 1),
        #     nn.ReLU(),
        #     nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
        #     nn.ReLU(),
        #     nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
        #     nn.ReLU(),
        # )

        # self.conv10 = nn.Sequential(
        #     # conv10
        #     nn.ConvTranspose2d(in_channels = 64, out_channels = 64, kernel_size = 4, stride = 2, padding = 1, dilation = 1),
        #     nn.ReLU(),
        #     nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
        #     nn.ReLU(),
        #     nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1, dilation = 1),
        #     nn.ReLU(),
        # )

        self.conv8_313 = nn.Sequential(
            # conv8_313
            nn.Conv2d(in_channels = 256, out_channels = 313, kernel_size = 1, stride = 1, dilation = 1),
            nn.ReLU()
        )

        #self.conv8_313 = nn.Conv2d(in_channels = 128, out_channels = 313, kernel_size = 1, stride = 1, dilation = 1)
        
        self.apply(weights_init)

    def forward(self, gray_image):        

        conv1=self.conv1(gray_image)
        conv2=self.conv2(conv1)
        conv2_residual = self.residual_block(conv2)
        conv3=self.conv3(conv2)
        conv4=self.conv4(conv3)
        conv5=self.conv5(conv4)
        conv6=self.conv6(conv5)
        conv7=self.conv7(conv6)
        conv8=torch.cat((self.conv8(conv7),conv2_residual), 1)
        # conv9=self.conv9(conv8)
        # conv10=self.conv10(conv9)
        features = self.conv8_313(conv8)
        
        return features

        # conv1 torch.Size([1, 64, 112, 112])
        # conv2 torch.Size([1, 128, 56, 56])
        # conv3 torch.Size([1, 256, 28, 28])
        # conv4 torch.Size([1, 512, 28, 28])
        # conv5 torch.Size([1, 512, 28, 28])
        # conv6 torch.Size([1, 512, 28, 28])
        # conv7 torch.Size([1, 256, 28, 28])
        # conv8 torch.Size([1, 128, 56, 56])
        # features torch.Size([1, 313, 56, 56])
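
For reference, a quick dummy forward pass along these lines should reproduce the shapes above (just a sketch; the 224×224 input size is an assumption, and a CUDA device is needed because ResidualBlock is moved to the GPU in __init__):

# Sanity check of the tensor shapes (assumes a CUDA device and a
# hypothetical 1x1x224x224 grayscale input)
model = Color_model().cuda()
dummy = torch.randn(1, 1, 224, 224, device='cuda')
with torch.no_grad():
    features = model(dummy)
print(features.shape)  # expected: torch.Size([1, 313, 56, 56])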

And here are the loss values for the first 30 epochs. I cancelled the training when I saw that the loss was not decreasing.

 Epoch [1/180], Epoch Loss: 129.40, Learning Rate: [0.1]
 Epoch [2/180], Epoch Loss: 2.9342, Learning Rate: [0.1]
 Epoch [3/180], Epoch Loss: 2.9005, Learning Rate: [0.1]
 Epoch [4/180], Epoch Loss: 2.8906, Learning Rate: [0.1]
 Epoch [5/180], Epoch Loss: 2.8772, Learning Rate: [0.1]
 Epoch [6/180], Epoch Loss: 2.8853, Learning Rate: [0.1]
 Epoch [7/180], Epoch Loss: 2.8984, Learning Rate: [0.1]
 Epoch [8/180], Epoch Loss: 2.8762, Learning Rate: [0.1]
 Epoch [9/180], Epoch Loss: 2.8748, Learning Rate: [0.1]
 Epoch [10/180], Epoch Loss: 2.8687, Learning Rate: [0.1]
 Epoch [11/180], Epoch Loss: 2.8682, Learning Rate: [0.1]
 Epoch [12/180], Epoch Loss: 2.8680, Learning Rate: [0.1]
 Epoch [13/180], Epoch Loss: 2.8691, Learning Rate: [0.1]
 Epoch [14/180], Epoch Loss: 2.8468, Learning Rate: [0.1]
 Epoch [15/180], Epoch Loss: 2.8512, Learning Rate: [0.1]
 Epoch [16/180], Epoch Loss: 2.8595, Learning Rate: [0.1]
 Epoch [17/180], Epoch Loss: 2.8543, Learning Rate: [0.1]
 Epoch [18/180], Epoch Loss: 2.8372, Learning Rate: [0.1]
 Epoch [19/180], Epoch Loss: 2.8519, Learning Rate: [0.1]
 Epoch [20/180], Epoch Loss: 2.8344, Learning Rate: [0.1]
 Epoch [21/180], Epoch Loss: 21.456, Learning Rate: [0.1]
 Epoch [22/180], Epoch Loss: 5.7463, Learning Rate: [0.1]
 Epoch [23/180], Epoch Loss: 5.7368, Learning Rate: [0.1]
 Epoch [24/180], Epoch Loss: 5.7448, Learning Rate: [0.1]
 Epoch [25/180], Epoch Loss: 5.7260, Learning Rate: [0.1]
 Epoch [26/180], Epoch Loss: 5.7462, Learning Rate: [0.1]
 Epoch [27/180], Epoch Loss: 5.7462, Learning Rate: [0.1]
 Epoch [28/180], Epoch Loss: 5.7462, Learning Rate: [0.1]
 Epoch [29/180], Epoch Loss: 5.7462, Learning Rate: [0.1]
 Epoch [30/180], Epoch Loss: 5.7462, Learning Rate: [0.1]

Best regards,

Matheus Santos.

Your model seems to learn initially, and based on the output I would recommend lowering the learning rate and playing around with other hyperparameters.
The current setup might get stuck at a high loss due to the large learning rate.
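
Something along these lines might be worth trying (just a sketch; the optimizer type, momentum, and schedule values are assumptions, not taken from your training script):

model = Color_model().cuda()
# Lower the initial learning rate (0.1 -> 0.01) and decay it during training;
# the optimizer choice and the values here are only illustrative.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

for epoch in range(180):
    # ... run your usual training loop for one epoch here ...
    scheduler.step()
    print(epoch, scheduler.get_last_lr())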

Hey @ptrblck! Thanks for replying! :smiley:

I see, I will train using a smaller learning rate.

But am I using the residual block correctly?
I am asking because before I added the residual block, the loss value was decreasing during training with the same learning rate of 0.1.

The ResidualBlock module doesn’t use a residual connection, so I was a bit confused at the beginning. :stuck_out_tongue:

The usage of conv2_residual in your model looks alright, and the concatenation might be seen as a residual connection.
Note, however, that the "classic" residual connection uses an addition, as seen in the resnet code.
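
For comparison, a residual block with an additive skip connection could look like this (a minimal sketch for your 128-channel case, not the torchvision implementation):

import torch.nn as nn
import torch.nn.functional as F

class AdditiveResidualBlock(nn.Module):
    def __init__(self, channels=128):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(channels),
        )

    def forward(self, x):
        # the residual connection: add the block input to the conv output
        return F.relu(self.conv(x) + x)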

1 Like

Hey @ptrblck! :smiley:

What I mean by Residual Block is illustrated in the image below:
The Residual Block takes the tensor from conv2 as input, applies some convolutions, and its output will be added to the tensor from conv8 (I will change my implementation to use a sum ehehehehe).
In my understanding, the Residual Block plus the sum operation would be the "residual connection"… I didn't understand why the sum performed inside the block, as in the code you linked, is the "residual connection". Could you explain?

About the loss values not decreasing: do you think it is just a matter of adjusting the learning rate, or could this Residual Block be interfering with that?

Obs: the tensor dimensions in the code are a little bit different from the image.
[image: architecture diagram]

The naming just seems to be different.
The linked code snippet names the combination of the skip connection and the "conv" path a BasicBlock or Bottleneck, while you call just the conv operations ResidualBlock.

That's hard to tell. Could you try to overfit a small dataset and check if your current architecture is able to do so? If not, there might be other bugs in the code that we are missing at the moment.
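
E.g. something like this (a rough sketch; the input size, target format, optimizer, and loss are assumptions about your setup):

# Try to overfit a single small, fixed batch; the loss should approach zero
# if the architecture is able to fit it.
model = Color_model().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(4, 1, 224, 224, device='cuda')          # hypothetical inputs
y = torch.randint(0, 313, (4, 56, 56), device='cuda')   # per-pixel class targets

for step in range(500):
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        print(step, loss.item())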

1 Like

Hey @ptrblck!
Now I see what I am doing in the code ahahahaha :laughing: :sweat_smile: I understood your point.
In the code snippet, BasicBlock and Bottleneck are blocks of convolution layers that have a skip connection inside, and these blocks are stacked in a way that generates a ResNet, right?
For example:
input → BasicBlock → Bottleneck → BasicBlock → Bottleneck → BasicBlock → output
Something like this, where the skip connections are the ones inside each block.
Am I right?

I see. I will run more tests here and try to understand what is happening.

Yes, your explanation is generally correct.
Note that either BasicBlock or Bottleneck will be used in a resnet (depending on the depth of the model, i.e. resnet18, resnet50, etc.), not both. :wink:
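
You can check this quickly in torchvision, e.g.:

from torchvision import models
from torchvision.models.resnet import BasicBlock, Bottleneck

print(isinstance(models.resnet18().layer1[0], BasicBlock))   # True
print(isinstance(models.resnet50().layer1[0], Bottleneck))   # True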

1 Like

Yes!! I got a little confused :sweat_smile: :laughing:
I understand! Makes sense hahahaha

Hey @ptrblck!
Could you help me here? I have a couple of questions… :laughing: :laughing:

  1. How can I see the gradients of my network to check whether vanishing gradients are happening?

  2. How can I check whether the skip connection I added to my model is working properly with respect to the gradients? I mean, the goal of the skip connection is to prevent vanishing gradients in the early layers, so how can I check whether the gradients in the early layers are still being updated strongly?

Best regards,

Matheus Santos.

You can inspect gradients e.g. by using register_hook on all parameters and then printing or storing the values, or alternatively by checking the .grad attribute of all parameters after the backward call:

loss.backward()
for name, param in model.named_parameters():
    print(name, param.grad)

To see if the gradient is vanishing you could print its norm or absolute value (or any other reduction which makes sense) and compare the magnitude with and without the skip connections.
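
For example, a small check along these lines (a sketch; grad_norms is just an illustrative name):

loss.backward()
# Collect the gradient norm of every parameter (illustrative helper, not a fixed API)
grad_norms = {name: param.grad.norm().item()
              for name, param in model.named_parameters()
              if param.grad is not None}
for name, norm in grad_norms.items():
    print(f'{name}: {norm:.3e}')
# Run this once with and once without the skip connection and compare the
# magnitudes in the early layers (e.g. conv1 / conv2).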

1 Like

Hi,

Just to add to this conversation: skip connections are actually there to solve the degradation problem in deeper nets, as mentioned in the ResNet paper. Vanishing gradients can be controlled by other means, e.g. normalization.

1 Like