Custom Residual CNN Test Accuracy Stagnating

I’m trying to recreate some of the findings from the paper “Deep Residual Learning for Image Recognition”, but I am finding for higher depths, my Residual version of the model is stagnating at around 25% test error, compared to around 17% for the plain counterpart, even though the training error rate is much better (~5% vs 13%).

I’m also seeing an issue where my Residual CNN is unstable at the learning rate from the paper (0.1 - it’s test and train error increase to 90%, or accuracy 1/n_classes where I’m using the CIFAR-10 dataset). This issue is not seen for the plain counterpart.

In order to resolve this I decreased the learning rate to 0.01, although I then have to decrease the learning rate to 10% at around epoch 1000 otherwise it is still unstable.

I’m wondering if my issues are general issues seen using residual CNNs, or whether there is something wrong with my implementation, where the architecure is taken directly from the paper (the CIFAR-10 examlpe). The model class is below which allows setting the model to either residual or plain, and the depth - note this issue is only with the n is 7 or 9 variants.

The other hyperparamters I’m using are taken directly from the paper (weight decay = 0.0001, momentum = 0.9) and I’m using the same data augmentation from the paper (random cropping, padding and random horizontal flip).

Are there any obvious issues with my model implementation?

class CNNModel(nn.Module):
def init(self, n_size_factor, res_net=False):

    super().__init__()

    self.res_net = res_net
    
    self.layers = nn.ModuleDict()
    
    # "The first layer is 3x3 convolutions."
    self.layers['conv_initial'] = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding='same')
    # "We adopt batch normalization (BN) [16] right after each convolution and before activation" - Activation is set in forward call.
    self.layers['conv_initial_bn'] = nn.BatchNorm2d(16)
      

    # "Then we use a stack of 6n layers with 3x3 convolutions on the feature maps of sizes {32; 16; 8} respectively,
    # with 2n layers for each feature map size. The numbers of filters are {16; 32; 64} respectively."

    # 2n x 16 filters
    for i in range(n_size_factor):
        self.layers['conv_layer_16_'+str(i+1)+'_input'] = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding='same')
        self.layers['conv_layer_16_bn_'+str(i+1)+'_input'] = nn.BatchNorm2d(16)
        self.layers['conv_layer_16_bn_'+str(i+1)+'_input']
        
        self.layers['conv_layer_16_'+str(i+1)+'_output'] = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding='same')
        self.layers['conv_layer_16_bn_'+str(i+1)+'_output'] = nn.BatchNorm2d(16)
    
    # 2n x 32 filter layers
    for i in range(n_size_factor):
        if i == 0: 
            # "The subsampling is performed by convolutions with a stride of 2."
            self.layers['conv_layer_32_subsampled_'+str(i+1)+'_input'] = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1)
            
        else:
             self.layers['conv_layer_32_'+str(i+1)+'_input'] = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding='same')
        
        self.layers['conv_layer_32_bn_'+str(i+1)+'_input'] = nn.BatchNorm2d(32)
        
        
        self.layers['conv_layer_32_'+str(i+1)+'_output'] = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding='same')
        self.layers['conv_layer_32_bn_'+str(i+1)+'_output'] = nn.BatchNorm2d(32)
        
    # 2n x 64 filter layers
    for i in range(n_size_factor):
        if i == 0: 
            # "The subsampling is performed by convolutions with a stride of 2."
            self.layers['conv_layer_64_subsampled_'+str(i+1)+'_input'] = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        
        else:
            self.layers['conv_layer_64_'+str(i+1)+'_input'] = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding='same')
        
        self.layers['conv_layer_64_bn_'+str(i+1)+'_input'] = nn.BatchNorm2d(64)
        
        
        self.layers['conv_layer_64_'+str(i+1)+'_output'] = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding='same')
        self.layers['conv_layer_64_bn_'+str(i+1)+'_output'] = nn.BatchNorm2d(64)   
        
    # "The network ends with a global average pooling, a 10-way fully-connected layer, and softmax.""
    self.layers['global_average_pool'] = nn.AvgPool2d(kernel_size=8)
    
    self.layers['flatten'] = nn.Flatten()
    self.layers['fuly_connected'] = nn.Linear(64, 10)
    
    self.layers['soft_max'] = nn.Softmax(1)

    
    # Apply weight initialisation
    self.apply(self.init_weights)


def init_weights(self, m):
    # apply He weight initialisation to linear and convolutional layers
    if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
        torch.nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
        m.bias.data.fill_(0.01)
        

def shortcut_input(self, x, layer):
    self.x_shortcut = x.clone().detach()
    #resize for subsampled layers
    if 'subsampled' in layer:
        self.x_shortcut = self.shortcut_dimension_increase(self.x_shortcut)

        
def shortcut_dimension_increase(self, x):
    x = nn.functional.max_pool2d(x, kernel_size=1, stride=2)
    x = torch.cat([x, torch.zeros(x.shape).to(device=device)], axis=1)
    return x


def forward(self, x):
        
    ## apply layers
    for layer in self.layers:

        if self.res_net:
            # For shortcut input layers create the shortcut connection
            if isinstance(self.layers[layer], nn.Conv2d) and 'input' in layer:
                self.shortcut_input(x, layer)
                x = self.layers[layer](x)
            # For shortcut output layers add on the shortcut connection
            elif isinstance(self.layers[layer], nn.Conv2d) and 'output' in layer:
                x = self.layers[layer](x) + self.x_shortcut
            else:
                x = self.layers[layer](x) 
        else:
            x = self.layers[layer](x)
            
        # "We adopt batch normalization (BN) [16] right after each convolution and before activation"
        if isinstance(self.layers[layer], nn.BatchNorm2d) or isinstance(self.layers[layer], nn.Linear):
            #need to check relu is correct
            x = torch.relu(x)
    
    return x