I took your code snippet and modified it a little. I am trying to freeze the first three layers for the initial 5 epochs and then train the complete model for the remaining epochs. But the model is not behaving as expected: as the output below shows, the trainable-parameter count stays at 73 and the fc2 weights keep updating in every epoch, including the ones where that layer should be frozen. Do I need to reinitialize the optimizer after 5 epochs, or am I missing something else?
Note: I have looked into the add_param_group option (sketch of my understanding below), but do you think it would be feasible for very big models, e.g. a pretrained ResNet152 used as the encoder in an encoder-decoder model?
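For reference, this is roughly how I understand add_param_group would be used in the encoder-decoder case. It is just a sketch: the two nn.Linear stand-ins and the learning rates are made up, and in my real setup the encoder would be the pretrained ResNet152 and the decoder my own network.

import torch.optim as optim
from torch import nn

# stand-ins just to make the sketch runnable; in my real case the encoder
# would be a pretrained ResNet152 and the decoder my own network
encoder = nn.Linear(10, 3)
decoder = nn.Linear(3, 1)

# freeze the encoder and optimize only the decoder at first
for p in encoder.parameters():
    p.requires_grad = False
optimizer = optim.Adam(decoder.parameters(), lr=0.1)

# ...after 5 epochs, unfreeze the encoder and register its parameters as a new group
for p in encoder.parameters():
    p.requires_grad = True
optimizer.add_param_group({'params': encoder.parameters(), 'lr': 0.1})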
Thanks in advance.
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
# toy feed-forward net
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(10, 3)
        self.fc2 = nn.Linear(3, 3)
        self.fc3 = nn.Linear(3, 3)
        self.fc4 = nn.Linear(3, 3)
        self.fc5 = nn.Linear(3, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        x = self.fc5(x)
        return x
net = Net()
# print the pre-trained fc2 weight
print('fc2 pretrained weight')
print(net.fc2.weight)
# define new random data
random_input = Variable(torch.randn(10,))
random_target = Variable(torch.randn(1,))
# loss
criterion = nn.MSELoss()
# NOTE: the pytorch optimizer explicitly accepts only parameters that require grad
# see https://github.com/pytorch/pytorch/issues/679
optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.1)
# this raises ValueError: optimizing a parameter that doesn't require gradients
#optimizer = optim.Adam(net.parameters(), lr=0.1)
for epoch in range(1, 10):
    net.zero_grad()
    count = 0
    if epoch < 5:
        # freeze backbone layers
        for param in net.children():
            count += 1
            if count < 4:  # freezing first 3 layers
                param.requires_grad = False
    else:
        for param in net.children():
            param.requires_grad = True
    # for param in net.children():
    #     print(param, param.requires_grad)
    print('trainable parameters', count_parameters(net))
    output = net(random_input)
    loss = criterion(output, random_target)
    loss.backward()
    optimizer.step()
    print('fc2 weight at epoch:', epoch)
    print(net.fc2.weight)
Output:
fc2 pretrained weight
Parameter containing:
tensor([[ 0.5127, 0.1465, -0.5701],
[-0.3253, -0.1051, -0.3173],
[-0.0262, 0.2804, -0.0923]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 1
Parameter containing:
tensor([[ 0.4127, 0.2465, -0.6701],
[-0.4253, -0.0051, -0.4173],
[-0.1262, 0.3804, -0.1923]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 2
Parameter containing:
tensor([[ 0.3130, 0.3466, -0.7702],
[-0.3617, -0.0704, -0.3515],
[-0.0610, 0.3138, -0.1251]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 3
Parameter containing:
tensor([[ 0.2345, 0.4255, -0.8493],
[-0.3122, -0.1212, -0.3002],
[-0.0103, 0.2619, -0.0729]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 4
Parameter containing:
tensor([[ 0.1776, 0.4843, -0.9030],
[-0.2739, -0.1612, -0.2610],
[ 0.0299, 0.2205, -0.0319]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 5
Parameter containing:
tensor([[ 0.1263, 0.5314, -0.9565],
[-0.2419, -0.1954, -0.2290],
[ 0.0637, 0.1853, 0.0025]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 6
Parameter containing:
tensor([[ 0.0908, 0.5378, -1.0103],
[-0.2141, -0.2259, -0.2017],
[ 0.0928, 0.1551, 0.0321]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 7
Parameter containing:
tensor([[ 0.0888, 0.5036, -1.0591],
[-0.1893, -0.2535, -0.1778],
[ 0.1180, 0.1293, 0.0580]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 8
Parameter containing:
tensor([[ 0.1142, 0.4492, -1.1039],
[-0.1671, -0.2785, -0.1568],
[ 0.1398, 0.1071, 0.0808]], requires_grad=True)
trainable parameters 73
fc2 weight at epoch: 9
Parameter containing:
tensor([[ 0.1440, 0.3941, -1.1453],
[-0.1473, -0.3008, -0.1382],
[ 0.1591, 0.0876, 0.1011]], requires_grad=True)
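In case it helps to see what I mean by reinitializing: if rebuilding the optimizer after unfreezing is indeed the way to go, I assume it would look roughly like this inside the loop (sketch only, unfreezing via net.parameters() and reusing the same Adam settings as above).

    if epoch == 5:
        # unfreeze everything and rebuild the optimizer over all trainable parameters
        for param in net.parameters():
            param.requires_grad = True
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.1)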