Trying to use the SAM optimizer for Random Sampling Image Classification

I am trying to use the SAM optimizer. When I call backward() a second time in train_epoch() (the # second forward-backward pass), it raises the error below; otherwise the code runs fine.

Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [512, 100]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
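As the hint in the message suggests, anomaly detection can be switched on before the training loop to get a stack trace of the forward operation that produced the failing gradient. A minimal way to enable it (debugging aid only, it slows training down):

import torch

# Makes the failing backward raise with a stack trace pointing at the
# forward op that created the problematic tensor.
torch.autograd.set_detect_anomaly(True)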


def test(models, dataloaders, mode='val'):
    assert mode == 'val' or mode == 'test'
    models.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for (inputs, labels) in dataloaders[mode]:
            with torch.cuda.device(CUDA_VISIBLE_DEVICES):
                inputs = inputs.cuda()
                labels = labels.cuda()
            scores, _, _  = models(inputs)
            
            _, preds = torch.max(scores.data, 1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()
    
    return 100 * correct / total


iters = 0
def train_epoch(models, criterion, optimizers, dataloaders):

    models.train()
    
    global iters
    for data in tqdm(dataloaders['train'], leave=False, total=len(dataloaders['train'])):
        with torch.cuda.device(CUDA_VISIBLE_DEVICES):
            inputs = data[0].cuda()
            labels = data[1].cuda()
        iters += 1
        optimizers.zero_grad()  
        #pdb.set_trace()      
        scores, _, features = models(inputs) 
        
        target_loss = criterion(scores, labels)
        m_backbone_loss = torch.sum(target_loss) / target_loss.size(0)        
        loss  = m_backbone_loss
         # -----------------SAM Optimizer -------------------
        # first forward-backward pass
        criterion(models(inputs)[0], labels)
        loss.backward(retain_graph=True)
        optimizers.first_step(zero_grad=True)
        
        # second forward-backward pass
        criterion(models(inputs)[0], labels)
        #loss.backward(retain_graph=True)
        optimizers.second_step(zero_grad=True)

        #loss.backward()
        #optimizers.step()            
    #return loss

def train(models, criterion, optimizers, schedulers, dataloaders, num_epochs, epoch_loss):
    print('>> Train a Model.')
    best_acc = 0.
    
    for epoch in range(num_epochs):

        best_loss = torch.tensor([0.5]).cuda()
        loss = train_epoch(models, criterion, optimizers, dataloaders)
        schedulers.step()
        
        if False and epoch % 20  == 7:
            acc = test(models, dataloaders, mode='test')
            # acc = test(models, dataloaders, mc, 'test')
            if best_acc < acc:
                best_acc = acc
                print('Val Acc: {:.3f} \t Best Acc: {:.3f}'.format(acc, best_acc))
    print('>> Finished.')

In the “second forward-backward pass” you are still calling backward() on the loss that was computed from criterion(scores, labels) in the initial forward pass, so autograd tries to reuse the forward activations stored by that pass.
However, between the first loss.backward() call and the second one, optimizers.first_step() has already updated (some of) the parameters, which makes those stored forward activations stale. Computing gradients from stale activations against the already-updated parameters is invalid, and that is why the error is raised.
This post describes the issue in more detail.
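To avoid the stale-activation problem, the usual pattern is to run a fresh forward pass for each of SAM's two steps and call backward() on the loss produced by that pass, so no retain_graph=True is needed. Below is a minimal sketch of how the inner training loop could look, assuming the first_step()/second_step() interface from your code and that criterion returns per-sample losses (so .mean() reproduces m_backbone_loss):

for data in tqdm(dataloaders['train'], leave=False, total=len(dataloaders['train'])):
    with torch.cuda.device(CUDA_VISIBLE_DEVICES):
        inputs = data[0].cuda()
        labels = data[1].cuda()

    # first forward-backward pass: compute the loss on the current weights,
    # backprop, and let SAM perturb the parameters
    scores, _, features = models(inputs)
    loss = criterion(scores, labels).mean()
    loss.backward()
    optimizers.first_step(zero_grad=True)

    # second forward-backward pass: a *new* forward pass with the perturbed
    # weights, so the saved activations match the current parameters
    scores, _, _ = models(inputs)
    criterion(scores, labels).mean().backward()
    optimizers.second_step(zero_grad=True)

Each backward() call here uses the graph built by its own forward pass, which is exactly what is missing in the posted code.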