Modifying the last 2 layers of ResNet50

I have a modified ResNet: it changes the last AvgPool2d layer and removes the last Linear layer. Note that when AvgPool2d's stride parameter is not set, it defaults to kernel_size.

import math

import torch.nn as nn


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)  # official is `nn.AvgPool2d(7, stride=1)`
        # self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # x = self.fc(x)

        return x
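As a quick check of the stride behavior mentioned above (a minimal sketch, using the 7x7 feature map that ResNet-50 produces for 224x224 inputs):

import torch
import torch.nn as nn

# When stride is not given, AvgPool2d defaults it to kernel_size.
pool_default = nn.AvgPool2d(7)             # stride == 7
pool_official = nn.AvgPool2d(7, stride=1)  # stride == 1
print(pool_default.stride, pool_official.stride)  # 7 1

# On a 7x7 feature map both variants see exactly one pooling window,
# so their outputs are identical.
x = torch.randn(1, 2048, 7, 7)
print(torch.equal(pool_default(x), pool_official(x)))  # True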

Then I found a simpler way to reconstruct the ResNet: just remove the last 2 layers and add a new AvgPool2d with stride left as None.

import torch.nn as nn
import torchvision.models as models


class ResNet50MoveLinear(nn.Module):
    def __init__(self):
        """Load the pretrained ResNet-50 and replace top fc layer."""
        super(ResNet50MoveLinear, self).__init__()
        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-2]  # remove the last two layers (avgpool and fc).
        self.resnet = nn.Sequential(*modules)
        self.avgpool = nn.AvgPool2d(7)
        # self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        # self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract feature vectors from input images."""
        # with torch.no_grad():
        #     features = self.resnet(images)
        # features = features.reshape(features.size(0), -1)
        # features = self.bn(self.linear(features))

        features = self.resnet(images)  # need gradient
        features = self.avgpool(features)
        features = features.view(features.size(0), -1)  # reshape
        return features
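As a sanity check (a sketch; 224x224 is just the standard ImageNet input size), the module should return 2048-dimensional feature vectors:

import torch

model = ResNet50MoveLinear()
images = torch.randn(2, 3, 224, 224)
features = model(images)
print(features.shape)  # torch.Size([2, 2048])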

I thought these 2 methods would be the same and give similar results, but the second method gets worse results (accuracy: 0.918 -> 0.895), and I can't find the difference.

I would greatly appreciate some suggestions, thanks!


Was this difference reproducible for different runs and seeds?
At first glance both models look identical.

Very happy to meet again, @ptrblck !

I have already set the seed at the top of the application:

# setting seed
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

and got different results. Maybe I should also add random.seed and numpy.random.seed?

thanks!

Maybe, but let's first focus a bit on your code.
In the second example you are using a pretrained ResNet.
Are you loading the same state_dict for the first approach?
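E.g. something like this (just a sketch, assuming your custom ResNet keeps the torchvision layer names):

custom = ResNet(Bottleneck, [3, 4, 6, 3])
pretrained = models.resnet50(pretrained=True)
# strict=False skips the fc.weight/fc.bias entries that the
# modified model no longer has.
custom.load_state_dict(pretrained.state_dict(), strict=False)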

hi, @ptrblck

I found the problem, but I can’t find the solution.

The modified ResNet is here:

class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)
        # self.fc = nn.Linear(512 * block.expansion, num_classes)
        # self.fc2 = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        # x = self.fc(x)

        return x

When I print the instance of the ResNet model, I found that the ResNet structure is defined in the __init__ function, not in the forward function. So commenting out self.fc or self.fc2 produces two different structures and different running results, yet the network still runs fine (it looks like this does not affect the forward process, since the layer is not called in forward, but it does affect the backward process).

If I only comment out self.fc2 and keep self.fc in __init__, I get the same results as the code below:

class ResNet50MoveLinear(nn.Module):
    def __init__(self):
        """Load the pretrained ResNet-50 and replace top fc layer."""
        super(ResNet50MoveLinear, self).__init__()
        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-2]  # remove the last two layers (avgpool and fc).
        self.resnet = nn.Sequential(*modules)
        self.avgpool = nn.AvgPool2d(7)
        # self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        # self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract feature vectors from input images."""
        # with torch.no_grad():
        #     features = self.resnet(images)
        # features = features.reshape(features.size(0), -1)
        # features = self.bn(self.linear(features))

        features = self.resnet(images)  # need gradient
        features = self.avgpool(features)
        features = features.view(features.size(0), -1)  # reshape
        return features

When I print the instance of the ResNet50MoveLinear model, it does not contain the last self.fc layer, but it still seems to work, and it affects the backward process.

I tried to delete the last self.fc in a few different ways:

resnet = models.resnet50(pretrained=True)
# method 1
modules = list(resnet.children())[:-1]
# method 2
resnet.__delattr__('fc')
# method 3
resnet.fc = None

They all run fine, but the last self.fc still seems to take part in the backward process.
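Another way that might work (a sketch, assuming a PyTorch version that has nn.Identity) is to replace fc instead of deleting it, so the original forward still runs unchanged:

# method 4: nn.Identity passes its input through unchanged,
# so the classifier is effectively removed.
resnet.fc = nn.Identity()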

thanks!

In Python, [:-2] practically removes the last two layers in ResNet, including self.layer4 and self.fc, causing your modified ResNet50MoveLinear network to lack 'layer4', which degraded performance.

thanks, @Sunshine352

My comment was not correct: in my code I remove the last 2 layers, which are the fc and the AvgPool2d layer.

And my question is: using nn.Sequential(*list(resnet.children())[:-1]) seems not to change the definition of the network structure; it only changes the forward process, yet it still affects the backward process.
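A quick way to check the forward side of this claim (a sketch; eval() keeps the batchnorm statistics fixed):

import torch
import torch.nn as nn
import torchvision.models as models

resnet = models.resnet50(pretrained=True).eval()
backbone = nn.Sequential(*list(resnet.children())[:-1]).eval()

x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    feats = backbone(x).view(1, -1)  # (1, 2048)
    # Applying the original fc by hand should reproduce resnet(x).
    logits = feats @ resnet.fc.weight.t() + resnet.fc.bias
    print(torch.allclose(logits, resnet(x), atol=1e-5))  # True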

@ptrblck @Sunshine352

I sorted out the problem, and I hope this makes my problem clearer.

Network reconstruction problem

problem

In PyTorch, the network structure is defined in the __init__ function. When nn.Sequential(*list(resnet.children())[:-1]) is used to reconstruct the net, it only affects the forward process of the original network structure, but does not change the backward process of the original structure (even though print(model) shows that the model structure has already changed).
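The registration side of this is easy to see in isolation (a minimal sketch): any module assigned in __init__ is registered on the model, whether or not forward ever calls it, so it appears in print(model) and in model.parameters():

import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.used = nn.Linear(4, 4)
        self.unused = nn.Linear(4, 4)  # never called in forward

    def forward(self, x):
        return self.used(x)

net = Net()
# Both layers are registered, even though only one is used:
print(sum(p.numel() for p in net.parameters()))  # 40 == 2 * (4*4 + 4)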

test

Basic code is from yunjey/pytorch-tutorial; it is good work!

I ran 4 tests:

0: run ResNet, default.
1: run ResNet, add a new self.fc2 in __init__, but do not call it in forward.
2: run ResNet2 wrapping ResNet, remove the last fc via children()[:-1], and add a new fc in ResNet2.
3: run ResNet2 wrapping ResNet, comment out the last fc in ResNet, and add a new fc in ResNet2.

The test results are below:

0: run ResNet, default.
   self.fc exists in __init__.
   Epoch [1/80], Step [100/500] Loss: 1.7493
   Epoch [1/80], Step [200/500] Loss: 1.4796
1: run ResNet, add a new self.fc2 in __init__, but do not call it in forward.
   self.fc exists in __init__.
   add a new self.fc2 in __init__.
   Epoch [1/80], Step [100/500] Loss: 1.6957
   Epoch [1/80], Step [200/500] Loss: 1.5956
2: run ResNet2 wrapping ResNet, remove the last fc via children()[:-1], and add a new fc in ResNet2.
   self.fc exists in __init__.
   Epoch [1/80], Step [100/500] Loss: 1.6155
   Epoch [1/80], Step [200/500] Loss: 1.5825
3: run ResNet2 wrapping ResNet, comment out the last fc in ResNet, and add a new fc in ResNet2.
   comment self.fc and self.fc2 in __init__.
   Epoch [1/80], Step [100/500] Loss: 1.7493
   Epoch [1/80], Step [200/500] Loss: 1.4796

analysis

0 vs 1

Defining self.fc2 in __init__ but not calling it in forward changes the result (it seems to affect the backward process).
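One possible mechanism for this (my guess, a minimal sketch): creating an extra layer in __init__ draws random numbers for its initialization, which shifts every later random draw under a fixed seed, so batch shuffling and any later initialization differ even though the layer itself is never used:

import torch
import torch.nn as nn

torch.manual_seed(0)
a1 = nn.Linear(64, 10)

torch.manual_seed(0)
extra = nn.Linear(64, 10)  # consumes RNG state, like self.fc2
a2 = nn.Linear(64, 10)

# Same seed, but a2 was created after an extra layer,
# so its weights differ from a1's:
print(torch.equal(a1.weight, a2.weight))  # False

This would also explain why tests 0 and 3 print identical losses: both consume the random number generator in exactly the same order.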

0 vs 2

Using nn.Sequential(*list(resnet.children())[:-1]) to remove the last self.fc in ResNet and adding a new self.fc in ResNet2 gives different results (the self.fc defined in ResNet still seems to matter).

0 vs 3

Commenting out self.fc in ResNet and adding a self.fc in ResNet2 gives the same results.

end

I think the guess is confirmed.

test code

# ---------------------------------------------------------------------------- #
# An implementation of https://arxiv.org/pdf/1512.03385.pdf                    #
# See section 4.2 for the model architecture on CIFAR-10                       #
# Some part of the code was referenced from below                              #
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py   #
# ---------------------------------------------------------------------------- #
import random

import numpy
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# setting seed
random.seed(0)
numpy.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

"""
TEST_ID for net reconstruct test.
0: run ResNet, default. 
1: run ResNet, add a new `self.fc2` in `__init__`, but not call in `forward`.
2: run ResNet2, remove latest fc in ResNet2, and add a new fc in ResNet2.
3: run ResNet2, remove latest fc in ResNet, and add a new fc in ResNet2, comment `self.fc` in class ResNet function `__init__`.
"""
TEST_ID = 3

# Device configuration
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Hyper-parameters
num_epochs = 80
learning_rate = 0.001

# Image preprocessing modules
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor()])

# CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data/',
                                             train=True,
                                             transform=transform,
                                             download=True)

test_dataset = torchvision.datasets.CIFAR10(root='./data/',
                                            train=False,
                                            transform=transforms.ToTensor())

# Data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=100,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=100,
                                          shuffle=False)


# 3x3 convolution
def conv3x3(in_channels, out_channels, stride=1):
    return nn.Conv2d(in_channels, out_channels, kernel_size=3,
                     stride=stride, padding=1, bias=False)


# Residual block
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


# ResNet
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 16
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)
        self.avg_pool = nn.AvgPool2d(8)

        if TEST_ID in (0, 1, 2):
            print("`self.fc` exists in `__init__`.")
            self.fc = nn.Linear(64, num_classes)
        if TEST_ID == 1:
            print("add a new `self.fc2` in `__init__`.")
            self.fc2 = nn.Linear(64, num_classes)  # defined, but never called in forward
        if TEST_ID == 3:
            print("comment `self.fc` and `self.fc2` in `__init__`.")

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for i in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        if TEST_ID in (0, 1, 2):
            out = self.fc(out)
        return out


# ResNet2: wraps ResNet and re-adds the fc layer
class ResNet2(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(ResNet2, self).__init__()
        resnet = ResNet(block, layers)

        if TEST_ID == 2:
            # drop the last child (the fc layer) from ResNet
            self.resnet = nn.Sequential(*list(resnet.children())[:-1])
        elif TEST_ID == 3:
            # fc is already commented out in ResNet's __init__
            self.resnet = nn.Sequential(*list(resnet.children()))

        self.fc = nn.Linear(64, num_classes)  # add a new fc

    def forward(self, x):
        out = self.resnet(x)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out


if TEST_ID in (0, 1):
    model = ResNet(ResidualBlock, [2, 2, 2]).to(device)
elif TEST_ID in (2, 3):
    model = ResNet2(ResidualBlock, [2, 2, 2]).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# For updating learning rate
def update_lr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


# Train the model
total_step = len(train_loader)
curr_lr = learning_rate
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        if i == 201: exit(1)  # stop after 200 steps; enough for this comparison
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}"
                  .format(epoch + 1, num_epochs, i + 1, total_step,
                          loss.item()))

    # Decay learning rate
    if (epoch + 1) % 20 == 0:
        curr_lr /= 3
        update_lr(optimizer, curr_lr)

# Test the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the model on the test images: {} %'.format(
        100 * correct / total))

# Save the model checkpoint
torch.save(model.state_dict(), 'resnet.ckpt')