Add Residual connection

My model looks like this:

import torch
import torch.nn as nn
import torchvision

class hybrid_cnn(nn.Module):
    def __init__(self, **kwargs):
        super(hybrid_cnn, self).__init__()
        resnet = torchvision.models.resnet50(pretrained=True)
        # drop the final avgpool and fc layers of the pretrained ResNet-50
        self.base = nn.Sequential(*list(resnet.children())[:-2])

    def forward(self, x):
        x = self.base(x)
        clf_outputs = {}  # the classifier outputs of the two blocks should be collected here

I want to add a residual connection from the ResNet base to conv2 in both blocks. I have not been able to find implementations online, and other questions regarding ResNets have not been answered.

1 Like

You could have a look at the implementation in torchvision.models.
There you will find the BasicBlock and Bottleneck layers which apply the residual connection.

1 Like

I checked it, but I still have doubts about the implementation. Could you please provide a small snippet?

Sure, here is a simple one:

import torch
import torch.nn as nn
import torch.nn.functional as F

class block1(nn.Module):
    def __init__(self, ni):
        super(block1, self).__init__()
        self.conv1 = nn.Conv2d(ni, ni, 1)        # 1x1 conv, keeps the spatial size
        self.conv2 = nn.Conv2d(ni, ni, 3, 1, 1)  # 3x3 conv with padding, keeps the spatial size
        self.classifier = nn.Linear(ni*24*24, 751)

    def forward(self, x):
        residual = x                  # save the input
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))

        out += residual               # residual connection: add the input back

        out = out.view(out.size(0), -1)
        return self.classifier(out)

block = block1(16)
x = torch.randn(1, 16, 24, 24)
output = block(x)

Depending on your number of in and out channels, you might need to add a downsample layer as in the examples.
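If conv2 changed the number of channels, the addition above would fail, so the shortcut has to be projected as well. Here is a minimal sketch of that idea, loosely following the torchvision blocks (the class name and channel numbers are just made up for illustration):

class block_downsample(nn.Module):
    def __init__(self, ni, no):
        super(block_downsample, self).__init__()
        self.conv1 = nn.Conv2d(ni, no, 1)
        self.conv2 = nn.Conv2d(no, no, 3, 1, 1)
        # 1x1 conv so the shortcut matches the new channel count
        self.downsample = nn.Conv2d(ni, no, 1)

    def forward(self, x):
        residual = self.downsample(x)   # project the input to no channels
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        out += residual                 # shapes match now
        return out

block = block_downsample(16, 32)
x = torch.randn(1, 16, 24, 24)
output = block(x)   # shape [1, 32, 24, 24]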

18 Likes

Thanks for the great snippet! I just have one question: with additional blocks, is the residual added like this:

def forward(self,x):
        #First block
        residual = x
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        
        out += residual
        
        #Second block
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        
        out += residual

        out = out.view(out.size(0),-1)
        return self.classifier(out)

or like this:

def forward(self,x):
        #First block
        residual1 = x
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        
        out += residual1
        residual2 = out

        #Second block
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        
        out += residual2

        out = out.view(out.size(0),-1)
        return self.classifier(out)

Clarification is much appreciated. I don’t seem to be able to decode the torchvision model :see_no_evil:

I’m not sure how the second block works, as you are using the same layers with the same input.
Basically

out = F.relu(self.conv1(x))
out = F.relu(self.conv2(out))

is repeated, so you could remove one of these calls.

However, in your first example you are throwing away the out of the first block, since the second block starts again from x, so effectively only one block is used.
In your second example you are calculating out = out + (out + x). So again the second block is unnecessary, since you’ve already calculated the output of the block.
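A quick numeric check of that last point (a single conv stands in here for the shared conv1/conv2 + relu stack):

import torch
import torch.nn as nn

f = nn.Conv2d(16, 16, 3, padding=1)   # stand-in for the shared layers
x = torch.randn(1, 16, 24, 24)

out = f(x) + (f(x) + x)               # what the second example computes
print(torch.allclose(out, 2 * f(x) + x))  # True: the second block just adds the same block output again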

Could you explain your use case a bit?

1 Like

Sorry, I just copy-pasted the example code.

What I want to know is how the blocks are connected.
Let’s say I make a class containing two blocks, each with three convolutional layers:

class Two_Blocks(nn.Module):
    def __init__(self):
        super(Two_Blocks, self).__init__()
        self.block1 = nn.Sequential(
            nn.Conv1d(
                in_channels=128,              
                out_channels=128,             
                kernel_size=5,              
                stride=1,                   
                padding=2,                  
            ),
            nn.BatchNorm1d(128),                       
            nn.ReLU(),
            nn.Conv1d(
                in_channels=128,              
                out_channels=128,             
                kernel_size=5,              
                stride=1,                   
                padding=2,                  
            ),
            nn.Conv1d(
                in_channels=128,              
                out_channels=128,             
                kernel_size=5,              
                stride=1,                   
                padding=2,                  
            ),
            nn.BatchNorm1d(128),                       
            nn.ReLU(),
        )
        self.block2 = nn.Sequential(         
            nn.Conv1d(
                in_channels=128,              
                out_channels=128,             
                kernel_size=3,              
                stride=1,                   
                padding=1,                  
            ),
            nn.BatchNorm1d(128),                       
            nn.ReLU(),
            nn.Conv1d(
                in_channels=128,              
                out_channels=128,             
                kernel_size=3,              
                stride=1,                   
                padding=1,                  
            ),
            nn.Conv1d(
                in_channels=128,              
                out_channels=128,             
                kernel_size=3,              
                stride=1,                   
                padding=1,                  
            ),
            nn.BatchNorm1d(128),                       
            nn.ReLU(),
        )

        self.Global_Avg_Pool = nn.AvgPool1d(kernel_size=2)
        
        self.end = nn.Sequential(
                nn.Dropout(p=0.3),
                nn.Linear(128*1000, 3),
        )
        
        
    def forward(self, x):
        
        residual1 = x    #Save input as residual
        x = self.block1(x)
        
        x += residual1 #add input to output of block1
        residual2 = x  #save output of block1 as residual
        
        x = self.block2(x)
        
        x += residual2 #add output of block1 to output of block2
        
        x = self.Global_Avg_Pool(x) #Global average pooling instead of fully connected.
        x = x.view(-1, 128*1000)
        x = F.softmax(self.end(x),dim=1)
        
        return x

Does this make it clearer?

The other way would be to do the following instead:

def forward(self, x):
        
        residual = x   #Save input as residual
        x = self.block1(x)
        
        x += residual #add input to output of block1
       
        x = self.block2(x)
        
        #The same input is added for block 2 as for block 1:
        x += residual #add input to output of block2 
        
        x = self.Global_Avg_Pool(x) #Global average pooling instead of fully connected.
        x = x.view(-1, 128*1000)
        x = F.softmax(self.end(x),dim=1)
        
        return x

In my head the first example is the correct one, but I just wanted to be sure that the residual saved in block one is not the same one used for all blocks throughout the model. Sorry if I overcomplicated my question.

2 Likes

The first example looks like the “common” ResNet architecture, i.e. you add the residual from before the block to its output.
I wouldn’t say it’s the only right approach, though, as the second one also looks interesting. There you add the same residual to both block outputs, which looks a bit like Densely Connected Convolutional Networks.

Anyway, if you want to stick to a ResNet-like architecture, the first approach looks good.
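For reference, a dense connection would concatenate the earlier feature maps instead of adding them. A rough sketch with made-up channel numbers:

import torch
import torch.nn as nn

# dense-style connectivity: earlier features are concatenated, not added
block1 = nn.Conv1d(128, 32, 3, padding=1)
block2 = nn.Conv1d(128 + 32, 32, 3, padding=1)   # input channels grow with each concat

x = torch.randn(1, 128, 1000)
out1 = block1(x)
out2 = block2(torch.cat([x, out1], dim=1))       # concat instead of +=
print(out2.shape)   # torch.Size([1, 32, 1000])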

6 Likes

Thanks alot for clarifying! :smiley:

3 Likes

residual = x
Doesn’t this statement do a copy by reference? So the residual variable will point to the same memory location as out. Please correct me if I’m wrong.

It won’t, as seen in this simple test:

class block1(nn.Module):
    def __init__(self, ni):
        super(block1, self).__init__()
        self.conv1 = nn.Conv2d(ni, ni, 1)

    def forward(self,x):
        residual = x
        out = F.relu(self.conv1(x))
        print((residual - out).abs().max())
        return out

m = block1(1)
x = torch.randn(1, 1, 224, 224)
out = m(x)
# tensor(4.7888, grad_fn=<MaxBackward1>)

since neither self.conv1 nor F.relu manipulates the input inplace.
In case you are unsure if any layer will manipulate the input inplace, you should clone() it:

# wrong since relu will manipulate x inplace and thus also residual
class block1(nn.Module):
    def __init__(self):
        super(block1, self).__init__()

    def forward(self,x):
        residual = x
        out = F.relu(x, inplace=True)
        print((residual - out).abs().max())
        return out

m = block1()
x = torch.randn(1, 1, 224, 224)
out = m(x)
# tensor(0.)

# right
class block1(nn.Module):
    def __init__(self):
        super(block1, self).__init__()

    def forward(self,x):
        residual = x
        out = F.relu(x.clone(), inplace=True)
        print((residual - out).abs().max())
        return out

m = block1()
x = torch.randn(1, 1, 224, 224)
out = m(x)
# tensor(3.7760)

1 Like

Thanks! Now I understand it very well.