Were these parameters trained before, and are you using an optimizer with internal states, e.g. Adam?
If so, note that the optimizer's running internal states (its exponential moving averages of past gradients) can still update the frozen parameters, as this code snippet demonstrates:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Setup: a small encoder/decoder model
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = nn.Linear(64, 10)
        self.dec = nn.Linear(10, 64)

    def forward(self, x):
        x = F.relu(self.enc(x))
        x = self.dec(x)
        return x

x = torch.randn(1, 64)
y = x.clone()
model = MyModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1.)
# Dummy updates to build up Adam's running averages
for idx in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Iter {}, loss {}'.format(idx, loss.item()))
# Keep the gradients as (zero) tensors rather than None; with grad=None
# (the set_to_none=True default in recent PyTorch versions) Adam would
# simply skip the frozen parameters in step().
optimizer.zero_grad(set_to_none=False)
# Freeze encoder
for param in model.enc.parameters():
    param.requires_grad_(False)
# Store reference parameter
enc_weight0 = model.enc.weight.clone()
# Update for more iterations; the frozen encoder weights still change,
# because Adam applies its running averages even though their gradients stay zero
for idx in range(10):
    optimizer.zero_grad(set_to_none=False)
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Iter {}, loss {}'.format(idx, loss.item()))
# Nonzero: the frozen encoder weights were updated anyway
print('max abs diff in enc.weight {}'.format(
    (enc_weight0 - model.enc.weight).abs().max()))
# Zero: no gradients were computed for them after freezing
print('sum abs grad in enc.weight {}'.format(
    model.enc.weight.grad.abs().sum()))
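
If this side effect is unwanted, one way around it (a minimal sketch, assuming the same model, data, and freezing as above) is to give the optimizer only the parameters that should stay trainable, e.g. by re-creating it after freezing the encoder; note that re-creating it also discards Adam's running averages for the decoder:

# Recreate the optimizer with only the still-trainable (decoder) parameters,
# so Adam holds no running state for the frozen encoder weights.
optimizer = optim.Adam(
    [p for p in model.parameters() if p.requires_grad], lr=1.)

enc_weight1 = model.enc.weight.clone()
for idx in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

# Should now print 0: the frozen encoder weights no longer change
print('max abs diff in enc.weight {}'.format(
    (enc_weight1 - model.enc.weight).abs().max()))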