Why is it that when I set requires_grad = False on all my params, the weights in the network still update?

Were these parameters trained before, and are you using an optimizer with internal states, e.g. Adam?
If so, note that the optimizer's running internal states (such as Adam's momentum estimates) might still update the frozen parameters, as seen in this code snippet:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Setup
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.enc = nn.Linear(64, 10)
        self.dec = nn.Linear(10, 64)
        
    def forward(self, x):
        x = F.relu(self.enc(x))
        x = self.dec(x)

        return x


x = torch.randn(1, 64)
y = x.clone()
model = MyModel()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1.)

# dummy updates
for idx in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Iter {}, loss {}'.format(idx, loss.item()))

# Zero the grads rather than setting them to None (the default on
# newer PyTorch versions), so the frozen params keep a zero .grad
optimizer.zero_grad(set_to_none=False)
# Freeze encoder
for param in model.enc.parameters():
    param.requires_grad_(False)

# Store reference parameter
enc_weight0 = model.enc.weight.clone()

# Update for more iterations
for idx in range(10):
    optimizer.zero_grad(set_to_none=False)
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Iter {}, loss {}'.format(idx, loss.item()))
    # the frozen weight keeps moving even though its gradient is zero,
    # driven by Adam's non-zero running averages
    print('max abs diff in enc.weight {}'.format(
        (enc_weight0 - model.enc.weight).abs().max()))
    print('sum abs grad in enc.weight {}'.format(
        model.enc.weight.grad.abs().sum()))
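
If you want the freezing to actually stick, keep the frozen parameters out of the optimizer. Here is a minimal sketch continuing from the code above (enc_weight1 is just a fresh reference tensor introduced here, and rebuilding the optimizer resets Adam's running stats, which is the point):

# Rebuild the optimizer over only the parameters that still require
# gradients, so Adam holds no running stats for the frozen encoder
optimizer = optim.Adam(
    [p for p in model.parameters() if p.requires_grad], lr=1.)

enc_weight1 = model.enc.weight.clone()  # fresh reference after freezing
for idx in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

# the encoder weights no longer move
print((enc_weight1 - model.enc.weight).abs().max())  # tensor(0.)

Alternatively, on newer PyTorch versions, zeroing the gradients with optimizer.zero_grad(set_to_none=True) (the current default) also stops the updates, since step() skips parameters whose .grad is None.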