Loading pretrained weights into new model

hello again,

Can you please help me with how to use pretrained weights for a different model? I have built a simple FCN model for segmentation with 5 blocks of conv, ReLU and pooling. I want to use the pretrained weights of VGG/ResNet. I loaded the ResNet pretrained weights as shown below. Is this the right way? I got better results, but I am not sure how the pretrained weights get added to my new model.

model = fcn()
# With strict=False, keys that do not match by name are skipped instead of
# raising; inspect the value returned by load_state_dict to see which keys
# were missing or unexpected.
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False)

Is it based on layer names, or does it automatically match the weights to layers with corresponding filter sizes? Please give me some input, since I am new to using pretrained weights.

Thank you

I would not recommend to use strict=False without a proper verification that this workaround loads indeed the proper parameters.
If you’ve changed the model architecture, try to load the parameters for each layer manually e.g. with a custom mapping which would load the parameters of the pretrained layer to the new one.

1 Like

@ptrblck okay, which is the good way?

  1. split the layers of the pretrained model and concatenate them with the layers of my custom model (according to the layer output sizes which are required for fcn), then train the custom model.
  2. use the pretrained weights as features (remove the final layers which are not required and add custom classifier layers) and then train.
    For example, in the second method I used vgg features,

class fcn(nn.Module):
    """Segmentation model: ``vgg16.features`` followed by a convolutional head.

    ``vgg16`` is expected to exist at module level (presumably a torchvision
    VGG16 -- confirm). The head ends in a 32-channel transposed convolution.
    """

    def __init__(self):
        super(fcn, self).__init__()
        # The feature extractor is the same module object as vgg16.features
        # (shared, not copied).
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 32, 1),
            nn.ConvTranspose2d(32, 32, 224, stride=32),
        )

    def forward(self, x):
        """Run the feature extractor, then the head, and return the result."""
        x = self.features(x)
        x = self.classifier(x)
        return x

Also, in both cases, what is the effect of freezing the features? Should I train them or not?
# Freeze the feature extractor: these parameters will not receive gradients.
for param in vgg16.features.parameters():
    param.requires_grad = False

Thank you

  1. I don’t know how you would like to concatenate the layers and how this would be related to loading the pretrained weights, so could you explain this idea a bit more, please?
  2. Yes, this approach looks valid.

It depends on your use case and you would have to run experiments to determine if freezing the layers would work for you. Often the feature extractor is frozen, if the new dataset is “similar” to the pretrained one (ImageNet), but I wouldn’t claim that’s a general rule.

1 Like

For example, this is my custom fcn model (only fcn 32) with 5 stages. As we have 5 pool layers, I want to add these pool outputs and the pool outputs of the vgg pretrained model, and then train this model. Is it possible?

class fcn(nn.Module):
    """Five-stage VGG-like fully convolutional network with one upsampling layer.

    Each stage is conv-relu-conv-relu followed by a 2x2 max-pool, so the five
    stages reduce the spatial size by a factor of 32. Three convolutional
    "fc" layers produce 32 channels, and ``up`` (kernel 32, stride 32)
    upsamples by a factor of 32.
    """

    # Order in which the sub-modules are applied in ``forward``.
    _LAYER_ORDER = (
        "conv1_1", "relu1_1", "conv1_2", "relu1_2", "pool1",
        "conv2_1", "relu2_1", "conv2_2", "relu2_2", "pool2",
        "conv3_1", "relu3_1", "conv3_2", "relu3_2", "pool3",
        "conv4_1", "relu4_1", "conv4_2", "relu4_2", "pool4",
        "conv5_1", "relu5_1", "conv5_2", "relu5_2", "pool5",
        "fc1", "relu6", "drop6",
        "fc2", "relu7", "drop7",
        "fc3", "up",
    )

    def __init__(self):
        super(fcn, self).__init__()
        # Stage 1 -> /2
        self.conv1_1 = nn.Conv2d(3, 64, 3, 1, 1)
        self.relu1_1 = nn.ReLU()
        self.conv1_2 = nn.Conv2d(64, 64, 3, 1, 1)
        self.relu1_2 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2, 2)

        # Stage 2 -> /4
        self.conv2_1 = nn.Conv2d(64, 128, 3, 1, 1)
        self.relu2_1 = nn.ReLU()
        self.conv2_2 = nn.Conv2d(128, 128, 3, 1, 1)
        self.relu2_2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2, 2)

        # Stage 3 -> /8
        self.conv3_1 = nn.Conv2d(128, 256, 3, 1, 1)
        self.relu3_1 = nn.ReLU()
        self.conv3_2 = nn.Conv2d(256, 256, 3, 1, 1)
        self.relu3_2 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(2, 2)

        # Stage 4 -> /16
        self.conv4_1 = nn.Conv2d(256, 512, 3, 1, 1)
        self.relu4_1 = nn.ReLU()
        self.conv4_2 = nn.Conv2d(512, 512, 3, 1, 1)
        self.relu4_2 = nn.ReLU()
        self.pool4 = nn.MaxPool2d(2, 2)

        # Stage 5 -> /32
        self.conv5_1 = nn.Conv2d(512, 1024, 3, 1, 1)
        self.relu5_1 = nn.ReLU()
        self.conv5_2 = nn.Conv2d(1024, 1024, 3, 1, 1)
        self.relu5_2 = nn.ReLU()
        self.pool5 = nn.MaxPool2d(2, 2)

        # Convolutional replacements for the fully connected layers.
        self.fc1 = nn.Conv2d(1024, 4096, 3, 1, 1)
        self.relu6 = nn.ReLU()
        self.drop6 = nn.Dropout2d()

        self.fc2 = nn.Conv2d(4096, 4096, 1)
        self.relu7 = nn.ReLU()
        self.drop7 = nn.Dropout2d()

        # 32 output channels, then upsample by 32.
        self.fc3 = nn.Conv2d(4096, 32, 1)
        self.up = nn.ConvTranspose2d(32, 32, kernel_size=32, stride=32)

    def forward(self, input):
        """Apply every layer in ``_LAYER_ORDER`` to ``input`` and return the result."""
        x = input
        for layer_name in self._LAYER_ORDER:
            x = getattr(self, layer_name)(x)
        return x

I don’t know exactly which layers are new, but since the model architecture was changed, you could create a mapping between the old and new layer names and load each layer separately.
E.g something like this should work:

class MyModel(nn.Module):
    """Minimal source model: a single 10 -> 10 linear layer registered as ``fc1``."""

    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(in_features=10, out_features=10)

class MyNewModel(nn.Module):
    """Target model: the same ``fc1`` layer plus an extra 1 -> 1 ``new_layer``."""

    def __init__(self):
        super(MyNewModel, self).__init__()
        self.fc1 = nn.Linear(in_features=10, out_features=10)
        self.new_layer = nn.Linear(in_features=1, out_features=1)

model = MyModel()
new_model = MyNewModel()

# this fails: the source state_dict has no entries for `new_layer`.
# The error is caught and printed so the rest of the example still runs.
try:
    new_model.load_state_dict(model.state_dict())
except RuntimeError as err:
    print(err)
# > Error(s) in loading state_dict for MyNewModel:
#    Missing key(s) in state_dict: "new_layer.weight", "new_layer.bias".

# this works, but could be dangerous, if you are not careful
print(new_model.load_state_dict(model.state_dict(), strict=False))
# > _IncompatibleKeys(missing_keys=['new_layer.weight', 'new_layer.bias'], unexpected_keys=[])

# check
print((new_model.fc1.weight == model.fc1.weight).all())
# > tensor(True)

# create mapping: (layer name in new_model, layer name in model)
mapping = [('fc1', 'fc1')]
for new_name, old_name in mapping:
    print('loading {} to {}'.format(old_name, new_name))
    getattr(new_model, new_name).load_state_dict(getattr(model, old_name).state_dict())
1 Like

Thank you for your help. I want to express my understanding so far since I am a beginner; please tell me if I am wrong.

  1. I cannot use pretrained weights on a new image dimension directly, since they were trained on 224 images.
  2. But we can use them if we extract the layers and add them into our new model, like this:

class fcn(nn.Module):
    """Segmentation model: ``vgg16.features`` followed by a convolutional head.

    ``vgg16`` is expected to exist at module level (presumably a torchvision
    VGG16 -- confirm). The head ends in a 32-channel transposed convolution.
    """

    def __init__(self):
        super(fcn, self).__init__()
        # The feature extractor is the same module object as vgg16.features
        # (shared, not copied).
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 32, 1),
            nn.ConvTranspose2d(32, 32, 224, stride=32),
        )

    def forward(self, x):
        """Run the feature extractor, then the head, and return the result."""
        x = self.features(x)
        x = self.classifier(x)
        return x

Why doesn't it give an error? Am I just using it as features?
thank you

I don’t think these points are true, as you should be able to fine-tune the pretrained model using other input shapes as long as they are large enough (otherwise an intermediate activation could be reduced to an empty tensor and you would see an error).

1 Like

thank you. I understand, we should check the output size so that they dont vanish

@ptrblck hello, I have an issue with implementing fcn 32/16/8. I am using vgg16 pretrained weights and adding them to my fcn model. For some reason my fcn 16 and 8 variations give worse results than fcn 32. Are these two commands the same?

combined = pool4_scored + pool5_upscored
combined = torch.cat([pool4_scored, pool5_upscored])

thank you

No, these operations are not doing the same, as adding tensors via:

combined = pool4_scored + pool5_upscored

will perform the addition in each element while torch.cat will concatenate both tensors into a new (larger) tensor:

a = torch.ones(3)
b = torch.ones(3) * 2

# Element-wise addition: the result has the same shape as the inputs.
combined = a + b
print(combined)
# > tensor([3., 3., 3.])

# Concatenation: the result holds all elements of both tensors.
combined = torch.cat((a, b))
print(combined)
# > tensor([1., 1., 1., 2., 2., 2.])
1 Like

Thank you. Can you please tell me which operation I should use for the fcn architecture? I have used addition. I upscaled the vgg pool5 layers by 32 for fcn32 and it gives good results. But when I add the pool4 layers and (pool5 upscaled by 2) to get the fcn16 output, it performs badly. According to the fcn architecture, fcn16 and fcn8 are supposed to work better.
this is my code,
import torch
import torch.nn as nn
import torchvision.models as models
from pytorch_model_summary import summary

vgg16 = models.vgg16(pretrained=True)
# Freeze the feature extractor: these parameters will not receive gradients.
for param in vgg16.features.parameters():
    param.requires_grad = False
# Parameter counts noted for each setting of requires_grad:
# False: Total params: 185,771,904 Trainable params: 171,057,216 Non-trainable params: 14,714,688
# True:  Total params: 185,771,904 Trainable params: 185,771,904 Non-trainable params: 0

class fcn(nn.Module):
    """FCN-32 variant: ``vgg16.features`` followed by a convolutional head.

    The head ends in a 32-channel transposed convolution (kernel 224,
    stride 32).
    """

    def __init__(self):
        super(fcn, self).__init__()
        # The feature extractor is the same module object as the module-level
        # vgg16.features (shared, not copied).
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 32, 1),
            nn.ConvTranspose2d(32, 32, 224, stride=32),
        )

    def forward(self, x):
        """Run the feature extractor (/32), then the head, and return the result."""
        x = self.features(x)  # /32
        x = self.classifier(x)
        return x

class fcn16(nn.Module):
    """FCN-16 variant: adds scored pool4 features to upsampled pool5 scores.

    ``vgg16.features`` is split by slicing: everything except the last 7
    layers yields pool4, and the last 7 layers turn pool4 into pool5.
    """

    def __init__(self):
        super(fcn16, self).__init__()
        # Same module object as the module-level vgg16.features (shared, not copied).
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 32, 1),
        )
        self.score_pool4 = nn.Conv2d(512, 32, 1)
        self.upscore2 = nn.ConvTranspose2d(32, 32, 14, stride=2, bias=False)
        self.upscore16 = nn.ConvTranspose2d(32, 32, 16, stride=16, bias=False)

    def forward(self, x):
        """Return the fused pool4/pool5 scores upsampled by 16."""
        # Slicing an nn.Sequential gives a sub-sequence that can be called directly.
        pool4 = self.features[:-7](x)  # 512 features /16
        pool5 = self.features[-7:](pool4)  # 512 features /16/2=/32
        # NOTE(review): the 7x7 conv in `classifier` has no padding. Assuming a
        # 224x224 input, pool5 is 7x7, the classifier output is 1x1, and
        # upscore2 (kernel 14, stride 2) expands that single score to 14x14.
        # The size matches pool4, but this is not a true 2x upsampling of a
        # /32 score map -- TODO confirm against the intended FCN-16 design.
        pool5_upscored = self.upscore2(self.classifier(pool5))  # 32 class features
        pool4_scored = self.score_pool4(pool4)  # 32 features /16
        # Element-wise sum; torch.cat would stack the tensors instead.
        combined = pool4_scored + pool5_upscored
        res = self.upscore16(combined)  # /1
        return res

class fcn8(nn.Module):
    """FCN-8 variant: fuses pool5, pool4 and pool3 scores before upsampling.

    ``vgg16.features`` is split by slicing: all but the last 14 layers yield
    pool3, the next 7 layers yield pool4, and the last 7 layers yield pool5.
    """

    def __init__(self):
        super(fcn8, self).__init__()
        # Same module object as the module-level vgg16.features (shared, not copied).
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 32, 1),
        )
        self.score_pool4 = nn.Conv2d(512, 32, 1)
        self.score_pool3 = nn.Conv2d(256, 32, 1)
        self.upscore2 = nn.ConvTranspose2d(32, 32, 14, stride=2, bias=False)
        self.upscore3 = nn.ConvTranspose2d(32, 32, 2, stride=2, bias=False)
        self.upscore8 = nn.ConvTranspose2d(32, 32, 8, stride=8, bias=False)

    def forward(self, x):
        """Return the fused pool3/pool4/pool5 scores upsampled by 8."""
        # Slicing an nn.Sequential gives a sub-sequence that can be called directly.
        pool3 = self.features[:-14](x)  # 256 features /8
        pool4 = self.features[-14:-7](pool3)  # 512 features /8/2=/16
        pool5 = self.features[-7:](pool4)  # 512 features /16/2=/32
        # NOTE(review): the 7x7 conv in `classifier` has no padding. Assuming a
        # 224x224 input, pool5 is 7x7, the classifier output is 1x1, and
        # upscore2 (kernel 14, stride 2) expands that single score to 14x14.
        # The size matches pool4, but this is not a true 2x upsampling of a
        # /32 score map -- TODO confirm against the intended FCN-8 design.
        pool5_upscored = self.upscore2(self.classifier(pool5))  # 32 class features
        pool4_scored = self.score_pool4(pool4)  # 32 class features /16
        pool3_scored = self.score_pool3(pool3)  # 32 class features /8
        combined = pool4_scored + pool5_upscored  # /16
        combined_upscored = self.upscore3(combined)  # 32 class features, stride 2: /16*2=/8
        combined2 = pool3_scored + combined_upscored
        res = self.upscore8(combined2)  # /1
        return res

Thank you

I tried this way of taking weights from the pretrained model, but it gives slightly worse accuracy than the original model.

def forward(self, input):
    """Apply all layers in order, seeding conv1_1..conv4_2 from ``vgg16`` once.

    Parameters must not be rebound inside ``forward``: rebinding on every
    call replaces the module's registered parameters with those of
    ``vgg16``, so an optimizer built from this model would hold parameters
    that are never used. The pretrained values are instead copied in place,
    on the first call only, so the layers keep their own trainable
    parameters. Ideally this copy is done in ``__init__``; it is kept here
    so the method stays a drop-in replacement.
    """
    # (layer name on this model, index of the source conv in vgg16.features)
    pretrained_sources = (
        ("conv1_1", 0), ("conv1_2", 2),
        ("conv2_1", 5), ("conv2_2", 7),
        ("conv3_1", 10), ("conv3_2", 12),
        ("conv4_1", 17), ("conv4_2", 19),
    )
    if not getattr(self, "_vgg16_weights_copied", False):
        for layer_name, vgg_index in pretrained_sources:
            getattr(self, layer_name).load_state_dict(
                vgg16.features[vgg_index].state_dict()
            )
        self._vgg16_weights_copied = True

    layer_order = (
        "conv1_1", "relu1_1", "conv1_2", "relu1_2", "pool1",
        "conv2_1", "relu2_1", "conv2_2", "relu2_2", "pool2",
        "conv3_1", "relu3_1", "conv3_2", "relu3_2", "pool3",
        "conv4_1", "relu4_1", "conv4_2", "relu4_2", "pool4",
        "conv5_1", "relu5_1", "conv5_2", "relu5_2", "pool5",
        "fc1", "relu6", "drop6",
        "fc2", "relu7", "drop7",
        "fc3", "up",
    )
    x = input
    for layer_name in layer_order:
        x = getattr(self, layer_name)(x)
    return x

I’m not sure if your current use case changed, but you should not reassign parameters in the forward method, as they won’t be trained (in case you are fine-tuning the model).
Did you try to use the previously posted approach?

The previous method works well for fcn 32, but badly for fcn16 and fcn8.

Could you explain a bit what “works bad” would mean in this context?
Is the parameter loading not working? If so, could you post a code snippet showing the errors?

This is my code. I get lower accuracy for fcn16 and fcn8; fcn32 gives good accuracy.

class fcn8(nn.Module):
    """FCN-8 variant: fuses pool5, pool4 and pool3 scores before upsampling.

    ``vgg16.features`` is split by slicing: all but the last 14 layers yield
    pool3, the next 7 layers yield pool4, and the last 7 layers yield pool5.
    """

    def __init__(self):
        super(fcn8, self).__init__()
        # Same module object as the module-level vgg16.features (shared, not copied).
        self.features = vgg16.features
        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(4096, 32, 1),
        )
        self.score_pool4 = nn.Conv2d(512, 32, 1)
        self.score_pool3 = nn.Conv2d(256, 32, 1)
        self.upscore2 = nn.ConvTranspose2d(32, 32, 14, stride=2, bias=False)
        self.upscore3 = nn.ConvTranspose2d(32, 32, 2, stride=2, bias=False)
        self.upscore8 = nn.ConvTranspose2d(32, 32, 8, stride=8, bias=False)

    def forward(self, x):
        """Return the fused pool3/pool4/pool5 scores upsampled by 8."""
        # Slicing an nn.Sequential gives a sub-sequence that can be called directly.
        pool3 = self.features[:-14](x)  # 256 features /8
        pool4 = self.features[-14:-7](pool3)  # 512 features /8/2=/16
        pool5 = self.features[-7:](pool4)  # 512 features /16/2=/32
        # NOTE(review): the 7x7 conv in `classifier` has no padding. Assuming a
        # 224x224 input, pool5 is 7x7, the classifier output is 1x1, and
        # upscore2 (kernel 14, stride 2) expands that single score to 14x14.
        # The size matches pool4, but this is not a true 2x upsampling of a
        # /32 score map -- TODO confirm against the intended FCN-8 design.
        pool5_upscored = self.upscore2(self.classifier(pool5))  # 32 class features
        pool4_scored = self.score_pool4(pool4)  # 32 class features /16
        pool3_scored = self.score_pool3(pool3)  # 32 class features /8
        combined = pool4_scored + pool5_upscored  # /16
        combined_upscored = self.upscore3(combined)  # 32 class features, stride 2: /16*2=/8
        combined2 = pool3_scored + combined_upscored
        res = self.upscore8(combined2)  # /1
        return res

Thanks for the update. I can’t comment on the accuracy, as I don’t have any specific expectations which pretrained model should or should not work on which custom dataset.

I tried replacing the Conv2d layers with layers taken from vgg like this (copying the layers before each pool by matching the output dimensions of my model), but the accuracy is almost the same, with no significant improvement. Please help me: am I missing something? Is this the correct way?

class fcn32(nn.Module):
    """FCN-32 variant whose first four stages reuse conv layers from ``vgg16``.

    Stages 1-4 take their two convolutions from ``vgg16.features``; stage 5
    and the convolutional "fc" layers are newly initialised. Every stage ends
    in a 2x2 max-pool, and ``up`` (kernel 32, stride 32) upsamples by 32.
    """

    # Order in which the sub-modules are applied in ``forward``.
    _LAYER_ORDER = (
        "conv1_1", "relu1_1", "conv1_2", "relu1_2", "pool1",
        "conv2_1", "relu2_1", "conv2_2", "relu2_2", "pool2",
        "conv3_1", "relu3_1", "conv3_2", "relu3_2", "pool3",
        "conv4_1", "relu4_1", "conv4_2", "relu4_2", "pool4",
        "conv5_1", "relu5_1", "conv5_2", "relu5_2", "pool5",
        "fc1", "relu6", "drop6",
        "fc2", "relu7", "drop7",
        "fc3", "up",
    )

    def __init__(self):
        super(fcn32, self).__init__()
        # The reused layers are the same module objects as in the module-level
        # `vgg16` (shared, not copied). Indices assume the torchvision VGG16
        # layout without batch norm -- TODO confirm: 0/2 = conv1_1/conv1_2,
        # 5/7 = conv2_1/conv2_2, 10/12/14 = conv3_1..3, 17/19/21 = conv4_1..3,
        # 24/26/28 = conv5_1..3.

        # Stage 1 -> /2
        self.conv1_1 = vgg16.features[0]
        self.relu1_1 = nn.ReLU()
        self.conv1_2 = vgg16.features[2]
        self.relu1_2 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2, 2)

        # Stage 2 -> /4
        self.conv2_1 = vgg16.features[5]
        self.relu2_1 = nn.ReLU()
        self.conv2_2 = vgg16.features[7]
        self.relu2_2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2, 2)

        # Stage 3 -> /8
        self.conv3_1 = vgg16.features[10]
        self.relu3_1 = nn.ReLU()
        # Index 12 is the conv that directly follows conv3_1 in VGG16, so it
        # was trained on the activations this layer receives here (index 14
        # expects conv3_2 outputs instead).
        self.conv3_2 = vgg16.features[12]
        self.relu3_2 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(2, 2)

        # Stage 4 -> /16
        self.conv4_1 = vgg16.features[17]
        self.relu4_1 = nn.ReLU()
        # Index 19 is the conv that directly follows conv4_1 in VGG16; index 28
        # belongs to stage 5 and was trained on different activations.
        self.conv4_2 = vgg16.features[19]
        self.relu4_2 = nn.ReLU()
        self.pool4 = nn.MaxPool2d(2, 2)

        # Stage 5 -> /32 (newly initialised, 1024 channels)
        self.conv5_1 = nn.Conv2d(512, 1024, 3, 1, 1)
        self.relu5_1 = nn.ReLU()
        self.conv5_2 = nn.Conv2d(1024, 1024, 3, 1, 1)
        self.relu5_2 = nn.ReLU()
        self.pool5 = nn.MaxPool2d(2, 2)

        # Convolutional replacements for the fully connected layers.
        self.fc1 = nn.Conv2d(1024, 4096, 3, 1, 1)
        self.relu6 = nn.ReLU()
        self.drop6 = nn.Dropout2d()

        self.fc2 = nn.Conv2d(4096, 4096, 1)
        self.relu7 = nn.ReLU()
        self.drop7 = nn.Dropout2d()

        # 32 output channels, then upsample by 32.
        self.fc3 = nn.Conv2d(4096, 32, 1)
        self.up = nn.ConvTranspose2d(32, 32, kernel_size=32, stride=32)

    def forward(self, input):
        """Apply every layer in ``_LAYER_ORDER`` to ``input`` and return the result."""
        x = input
        for layer_name in self._LAYER_ORDER:
            x = getattr(self, layer_name)(x)
        return x

I still cannot comment on the model accuracy and what the expectations are.
Could you describe a bit more why your model architecture using the VGG16 features should increase the accuracy, i.e. are you trying to reimplement a knowingly good model based on previous work?