Hello there, I’m quite new to PyTorch, so sorry if this is a simple mistake.
I’m trying to implement transfer learning on a multi-label language classifier. To do that effectively I want to freeze part of the model and compute the gradients in stages. I’m following the tutorial at
Transfer Learning for Computer Vision Tutorial — PyTorch Tutorials 2.1.1+cu121 documentation
but I get the error
you can only change requires_grad flags of leaf variables. If you want to use a computed variable in a subgraph that doesn’t require differentiation use var_no_grad = var.detach().
and I don’t understand how I’m supposed to use the detached tensors here.
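For reference, this is the freezing pattern I’m trying to reproduce from the tutorial (a minimal sketch with the ResNet-18 used there; my real model is a transformer-based multi-label classifier, so the names below are only placeholders):

import torch.nn as nn
import torchvision

# Pattern from the tutorial: freeze the pretrained backbone...
model_conv = torchvision.models.resnet18(weights='IMAGENET1K_V1')
for param in model_conv.parameters():   # parameters() yields leaf tensors
    param.requires_grad = False

# ...then replace the head; the new layer's parameters require grad by default.
model_conv.fc = nn.Linear(model_conv.fc.in_features, 2)

My understanding is that setting requires_grad on these parameters works because they are leaves, but in my own code below something is apparently not a leaf.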
Here is my loss function, optimizer and training loop:

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.AdamW(params=model.parameters(),
                              lr=LEARNING_RATE, eps=1e-8,
                              weight_decay=0.1)

def trainV2(epoch):
    model.train()
    for step, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        loss.backward()
        optimizer.step()
Then I set up the scheduler and make sure every parameter is trainable:

total_steps = TRAIN_BATCH_SIZE * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

parameters = [param for param in model.parameters()]
for param in parameters:
    param.requires_grad = True
The error is raised in this function, when I try to set requires_grad = False:

def trainSchedule(optimizer, scheduler):
    model.train()
    parameters = [param for param in model.parameters()]

    for param in parameters[-1]:
        param.requires_grad = False   # <-- this line raises the RuntimeError

    lr = 1e-1
    # self.learner.fit_one_cycle(1, lr, moms=(0.8, 0.7), wd=0.1)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                    epochs=1,
                                                    steps_per_epoch=len(training_loader),
                                                    max_lr=lr,
                                                    base_momentum=0.7,
                                                    max_momentum=0.8)

Calling trainSchedule(optimizer, scheduler) produces:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [18], in <cell line: 1>()
----> 1 trainSchedule(optimizer, scheduler)
Input In [17], in trainSchedule(optimizer, scheduler)
16 parameters = [param for param in model.parameters()]
18 for param in parameters[-1]:
---> 19 param.requires_grad = False
20 lr = 1e-1
21 #self.learner.fit_one_cycle(1, lr, moms = (0.8, 0.7), wd = 0.1)
RuntimeError: you can only change requires_grad flags of leaf variables. If you want to use a computed variable in a subgraph that doesn't require differentiation use var_no_grad = var.detach().