I am trying to use SAM optimizer when I use the backward function twice in train_epoch() # second forward-backward pass, it gives me an error otherwise it works fine.
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [512, 100]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
def test(models, dataloaders, mode='val'):
assert mode == 'val' or mode == 'test'
models.eval()
total = 0
correct = 0
with torch.no_grad():
for (inputs, labels) in dataloaders[mode]:
with torch.cuda.device(CUDA_VISIBLE_DEVICES):
inputs = inputs.cuda()
labels = labels.cuda()
scores, _, _ = models(inputs)
_, preds = torch.max(scores.data, 1)
total += labels.size(0)
correct += (preds == labels).sum().item()
return 100 * correct / total
iters = 0
def train_epoch(models, criterion, optimizers, dataloaders):
models.train()
global iters
for data in tqdm(dataloaders['train'], leave=False, total=len(dataloaders['train'])):
with torch.cuda.device(CUDA_VISIBLE_DEVICES):
inputs = data[0].cuda()
labels = data[1].cuda()
iters += 1
optimizers.zero_grad()
#pdb.set_trace()
scores, _, features = models(inputs)
target_loss = criterion(scores, labels)
m_backbone_loss = torch.sum(target_loss) / target_loss.size(0)
loss = m_backbone_loss
# -----------------SAM Optimizer -------------------
# first forward-backward pass
criterion(models(inputs)[0], labels)
loss.backward(retain_graph=True)
optimizers.first_step(zero_grad=True)
# second forward-backward pass
criterion(models(inputs)[0], labels)
#loss.backward(retain_graph=True)
optimizers.second_step(zero_grad=True)
#loss.backward()
#optimizers.step()
#return loss
def train(models, criterion, optimizers, schedulers, dataloaders, num_epochs, epoch_loss):
print('>> Train a Model.')
best_acc = 0.
for epoch in range(num_epochs):
best_loss = torch.tensor([0.5]).cuda()
loss = train_epoch(models, criterion, optimizers, dataloaders)
schedulers.step()
if False and epoch % 20 == 7:
acc = test(models, dataloaders, mode='test')
# acc = test(models, dataloaders, mc, 'test')
if best_acc < acc:
best_acc = acc
print('Val Acc: {:.3f} \t Best Acc: {:.3f}'.format(acc, best_acc))
print('>> Finished.')