How do I pass pre-trained model weights to a new model layer by layer for initialization?

I have a pre-trained UNet model and a second, untrained model with the same architecture. I am trying to initialize the new UNet's weights from the pre-trained UNet. For some reasons, I need to assign the weights layer by layer.

Here is my code:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', device)

unet_model1 = unet.UNet(in_channels=3, out_channels=3, init_features=32)
unet_model2 = unet.UNet(in_channels=3, out_channels=3, init_features=32)

unet_model1 = unet_model1.to(device)  # pre-trained UNet
unet_model2 = unet_model2.to(device)  # new UNet to be initialized

checkpoint = torch.load('best_model.pth')

unet_model1.load_state_dict(checkpoint['model'])
list_of_weights = [x for x in unet_model1.parameters()]

with torch.no_grad():
    unet_model2.encoder1[0].weight.copy_(list_of_weights[0])
    unet_model2.encoder1[1].weight.copy_(list_of_weights[1])
    unet_model2.encoder1[1].bias.copy_(list_of_weights[2])
    unet_model2.encoder1[3].weight.copy_(list_of_weights[3])
    unet_model2.encoder1[4].weight.copy_(list_of_weights[4])
    unet_model2.encoder1[4].bias.copy_(list_of_weights[5])

    unet_model2.encoder2[0].weight.copy_(list_of_weights[6])
    unet_model2.encoder2[1].weight.copy_(list_of_weights[7])
    unet_model2.encoder2[1].bias.copy_(list_of_weights[8])
    unet_model2.encoder2[3].weight.copy_(list_of_weights[9])
    unet_model2.encoder2[4].weight.copy_(list_of_weights[10])
    unet_model2.encoder2[4].bias.copy_(list_of_weights[11])

    unet_model2.encoder3[0].weight.copy_(list_of_weights[12])
    unet_model2.encoder3[1].weight.copy_(list_of_weights[13])
    unet_model2.encoder3[1].bias.copy_(list_of_weights[14])
    unet_model2.encoder3[3].weight.copy_(list_of_weights[15])
    unet_model2.encoder3[4].weight.copy_(list_of_weights[16])
    unet_model2.encoder3[4].bias.copy_(list_of_weights[17])

    unet_model2.encoder4[0].weight.copy_(list_of_weights[18])
    unet_model2.encoder4[1].weight.copy_(list_of_weights[19])
    unet_model2.encoder4[1].bias.copy_(list_of_weights[20])
    unet_model2.encoder4[3].weight.copy_(list_of_weights[21])
    unet_model2.encoder4[4].weight.copy_(list_of_weights[22])
    unet_model2.encoder4[4].bias.copy_(list_of_weights[23])

    unet_model2.bottleneck[0].weight.copy_(list_of_weights[24])
    unet_model2.bottleneck[1].weight.copy_(list_of_weights[25])
    unet_model2.bottleneck[1].bias.copy_(list_of_weights[26])
    unet_model2.bottleneck[3].weight.copy_(list_of_weights[27])
    unet_model2.bottleneck[4].weight.copy_(list_of_weights[28])
    unet_model2.bottleneck[4].bias.copy_(list_of_weights[29])

    unet_model2.upconv4.weight.copy_(list_of_weights[30])
    unet_model2.upconv4.bias.copy_(list_of_weights[31])

    unet_model2.decoder4[0].weight.copy_(list_of_weights[32])
    unet_model2.decoder4[1].weight.copy_(list_of_weights[33])
    unet_model2.decoder4[1].bias.copy_(list_of_weights[34])
    unet_model2.decoder4[3].weight.copy_(list_of_weights[35])
    unet_model2.decoder4[4].weight.copy_(list_of_weights[36])
    unet_model2.decoder4[4].bias.copy_(list_of_weights[37])

    unet_model2.upconv3.weight.copy_(list_of_weights[38])
    unet_model2.upconv3.bias.copy_(list_of_weights[39])

    unet_model2.decoder3[0].weight.copy_(list_of_weights[40])
    unet_model2.decoder3[1].weight.copy_(list_of_weights[41])
    unet_model2.decoder3[1].bias.copy_(list_of_weights[42])
    unet_model2.decoder3[3].weight.copy_(list_of_weights[43])
    unet_model2.decoder3[4].weight.copy_(list_of_weights[44])
    unet_model2.decoder3[4].bias.copy_(list_of_weights[45])

    unet_model2.upconv2.weight.copy_(list_of_weights[46])
    unet_model2.upconv2.bias.copy_(list_of_weights[47])

    unet_model2.decoder2[0].weight.copy_(list_of_weights[48])
    unet_model2.decoder2[1].weight.copy_(list_of_weights[49])
    unet_model2.decoder2[1].bias.copy_(list_of_weights[50])
    unet_model2.decoder2[3].weight.copy_(list_of_weights[51])
    unet_model2.decoder2[4].weight.copy_(list_of_weights[52])
    unet_model2.decoder2[4].bias.copy_(list_of_weights[53])

    unet_model2.upconv1.weight.copy_(list_of_weights[54])
    unet_model2.upconv1.bias.copy_(list_of_weights[55])

    unet_model2.decoder1[0].weight.copy_(list_of_weights[56])
    unet_model2.decoder1[1].weight.copy_(list_of_weights[57])
    unet_model2.decoder1[1].bias.copy_(list_of_weights[58])
    unet_model2.decoder1[3].weight.copy_(list_of_weights[59])
    unet_model2.decoder1[4].weight.copy_(list_of_weights[60])
    unet_model2.decoder1[4].bias.copy_(list_of_weights[61])

    unet_model2.conv.weight.copy_(list_of_weights[62])
    unet_model2.conv.bias.copy_(list_of_weights[63])
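(For reference, the same layer-by-layer copy can be written as a loop; a minimal sketch, assuming both models have identical architectures so their parameters line up one-to-one:)

with torch.no_grad():
    # parameters() yields tensors in registration order, which is the
    # same for both models since the architectures are identical
    for p1, p2 in zip(unet_model1.parameters(), unet_model2.parameters()):
        p2.copy_(p1)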

After doing all of the above, I test the weights for each layer with:

layer1 = [x for x in unet_model1.named_parameters()]
layer2 = [x for x in unet_model2.named_parameters()]

for i in range(len(layer2)):
    weight1 = layer1[i][1]
    weight2 = layer2[i][1]
    print(weight2 == weight1)
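(Note that weight2 == weight1 prints an element-wise boolean tensor per layer; one boolean per layer is easier to scan. A small sketch:)

for (name1, weight1), (name2, weight2) in zip(layer1, layer2):
    # torch.equal returns a single bool: True only if shapes and all elements match
    print(name1, torch.equal(weight1, weight2))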

After this, the two models have the same weights (weights and biases) for each layer.

But my current issue is that when I use these two models to make predictions on the same image, they give different results.

The test code is below:
unet_model1.eval()
unet_model2.eval()

running_corrects1 = 0
running_corrects2 = 0

idx = 0
with torch.no_grad():
    for sample in dataloaders:
        idx = idx + 1

        test_img = sample['A'].to(device)
        labels = sample['L'].to(device)

        outputs1 = unet_model1(test_img)
        outputs2 = unet_model2(test_img)

        preds1 = torch.round(outputs1)
        preds2 = torch.round(outputs2)

        # per-batch correct counts (reassigned each iteration, not accumulated)
        running_corrects1 = torch.sum(preds1 == labels.data)
        running_corrects2 = torch.sum(preds2 == labels.data)

        print('model test:')
        print(running_corrects1)
        print(running_corrects2)

        if running_corrects1 != running_corrects2:
            print('problem')
Could you help me figure out what's wrong here? Thank you so much.

Could you give more information about the visible error when using both approaches?
You might be running into small, expected errors caused by limited numerical precision, e.g. due to a different order of operations, as shown in this small example:

x = torch.randn(100, 100, 100)
s1 = x.sum()
s2 = x.sum(0).sum(0).sum(0)
s1 - s2
# tensor(0.0001)

If you want to use deterministic algorithms only, you could use torch.use_deterministic_algorithms.
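For example, a minimal sketch (assuming PyTorch >= 1.8; some deterministic CUDA ops additionally require the CUBLAS_WORKSPACE_CONFIG environment variable to be set before CUDA initializes):

import os
import torch

# required by some deterministic CUDA implementations (e.g. cuBLAS)
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

# opt in to deterministic kernels; ops without a deterministic
# implementation will raise a RuntimeError instead of running silently
torch.use_deterministic_algorithms(True)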

Hi ptrblck,

Thanks for your response.
If I run the above code, it doesn't give any visible error.
The issue is that, given one input image, the pre-trained model makes a reasonable prediction, but the new model predicts everything as pure white. So I guess it should not be limited numerical precision.

Yes, the numerical difference should be in the expected relative range depending on the dtype and should not change the prediction. If your new model is creating entirely different outputs, you could e.g. add print statements to the forward method, comparing the intermediate activations between both models to isolate where the difference is coming from.
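A sketch of one way to do this without editing the forward method, using forward hooks (the names unet_model1, unet_model2, and test_img are taken from the code above):

acts1, acts2 = {}, {}

def make_hook(store, name):
    # record each submodule's output under its qualified name
    def hook(module, inputs, output):
        if isinstance(output, torch.Tensor):
            store[name] = output.detach()
    return hook

for name, module in unet_model1.named_modules():
    module.register_forward_hook(make_hook(acts1, name))
for name, module in unet_model2.named_modules():
    module.register_forward_hook(make_hook(acts2, name))

with torch.no_grad():
    unet_model1(test_img)
    unet_model2(test_img)

# walk the activations in execution order and report the first divergence
for name in acts1:
    if name in acts2 and not torch.allclose(acts1[name], acts2[name]):
        print('first mismatch at:', name)
        break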

Hi ptrblck,

Here is my current solution for this issue. It is not the perfect way for my purpose, but after testing, the two models now make the same prediction.

First, I define a new UNet class:
class UNet_test(nn.Module):
    def __init__(self, in_channels=3, out_channels=1, init_features=32):
        super(UNet_test, self).__init__()

        # wrap the pre-trained UNet and load its checkpoint directly
        self.unet = unet.UNet(in_channels=3, out_channels=3, init_features=32)
        self.checkpoint = torch.load('best_model.pth')
        self.unet.load_state_dict(self.checkpoint['model'])

        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x1):
        x = self.forward_single(x1)
        return x

    def forward_single(self, x):
        # encoder path, reusing the pre-trained submodules
        enc1 = self.unet.encoder1(x)
        enc2 = self.unet.encoder2(self.pool1(enc1))
        enc3 = self.unet.encoder3(self.pool2(enc2))
        enc4 = self.unet.encoder4(self.pool3(enc3))
        bottleneck = self.unet.bottleneck(self.pool4(enc4))

        # decoder path with skip connections
        dec4 = self.unet.upconv4(bottleneck)
        dec4 = torch.cat((dec4, enc4), dim=1)
        dec4 = self.unet.decoder4(dec4)
        dec3 = self.unet.upconv3(dec4)
        dec3 = torch.cat((dec3, enc3), dim=1)
        dec3 = self.unet.decoder3(dec3)
        dec2 = self.unet.upconv2(dec3)
        dec2 = torch.cat((dec2, enc2), dim=1)
        dec2 = self.unet.decoder2(dec2)
        dec1 = self.unet.upconv1(dec2)
        dec1 = torch.cat((dec1, enc1), dim=1)
        dec1 = self.unet.decoder1(dec1)
        return torch.sigmoid(self.unet.conv(dec1))

After that, I define the two models like this:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', device)

unet_model1 = unet.UNet(in_channels=3, out_channels=3, init_features=32)
unet_model2 = unet_weight.UNet_test(in_channels=3, out_channels=3, init_features=32)

unet_model1 = unet_model1.to(device)
unet_model2 = unet_model2.to(device)

checkpoint = torch.load('best_model.pth')
unet_model1.load_state_dict(checkpoint['model'])

Now the two models make the same prediction. :smile:

Good to hear you’ve solved the issue! Was the difference in the model outputs caused by the manual parameter loading approach?

I guess so. But when I check the manual parameter loading approach, the weights of the two models are actually the same. It is weird.
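(One thing that may explain this, as an assumption not verified in this thread: model.parameters() only returns learnable parameters, so the manual copy never transfers buffers such as the BatchNorm running_mean and running_var, which change eval-mode outputs even when every weight and bias matches. Copying the full state dict avoids this; a minimal sketch:)

# state_dict() includes buffers (e.g. BatchNorm running stats)
# in addition to parameters, so everything is transferred
unet_model2.load_state_dict(unet_model1.state_dict())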