Hi, I’m trying to use DistributedDataParallel (DDP) for adversarial training.
The code worked perfectly fine before, but I'm running into an error after converting it to DDP.
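For context, the conversion to DDP follows the usual pattern, roughly like this (simplified sketch; the actual rank handling and model arguments are omitted, and local_rank here just stands for whatever rank the launcher passes in):

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    dist.init_process_group(backend="nccl")      # one process per GPU
    local_rank = args.local_rank                 # however the launcher provides it
    torch.cuda.set_device(local_rank)

    model = WideResNet().to(local_rank)          # the wideresnet.py model from the traceback
    model = DDP(model, device_ids=[local_rank])  # wrapped for distributed training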
The error message:
[W python_anomaly_mode.cpp:104] Warning: Error detected in CudnnBatchNormBackward0. Traceback of forward call that caused the error:
  File "AT.py", line 253, in <module>
    main()
  File "AT.py", line 235, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "AT.py", line 117, in train
    logit_nat = model(data)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hao/xh_Adversarial_Performance_Eval/model/wideresnet.py", line 87, in forward
    out = self.relu(self.bn1(out))
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 179, in forward
    self.eps,
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/functional.py", line 2283, in batch_norm
    input, weight, bias, running_mean, running_var, training, momentum, eps, torch.backends.cudnn.enabled
 (function _print_stack)
Traceback (most recent call last):
  File "AT.py", line 253, in <module>
    main()
  File "AT.py", line 235, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "AT.py", line 124, in train
    loss.backward()
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/_tensor.py", line 307, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/autograd/__init__.py", line 156, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [640]] is at version 13; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
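(The forward-call trace at the top is there because I turned on anomaly detection while debugging, i.e. something like the following at the top of AT.py:)

    # enabled only for debugging, to get the forward trace shown above
    torch.autograd.set_detect_anomaly(True)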
My training code:
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    beta = 1
    loss_nat_sum = 0
    loss_adv_sum = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # forward pass on clean data
        logit_nat = model(data)                   # AT.py line 117 in the traceback
        loss_nat = F.cross_entropy(logit_nat, target)
        # generate adversarial examples with PGD, then a second forward pass
        x_adv = pgd_resnet(model, data, target, args.step_size, args.epsilon, args.num_steps)
        logit_adv = model(x_adv)
        loss_adv = F.cross_entropy(logit_adv, target)
        # combined loss; this backward() is where the error is raised
        loss = loss_nat + beta * loss_adv
        loss.backward()                           # AT.py line 124 in the traceback
        optimizer.step()
        loss_nat_sum += loss_nat.item()
        loss_adv_sum += loss_adv.item()
    loss_nat_final = loss_nat_sum / len(train_loader.dataset)
    loss_adv_final = loss_adv_sum / len(train_loader.dataset)
    print('Train Epoch: {} \tNatural Loss: {:.6f}, Adv Loss: {:.6f}'.format(epoch, loss_nat_final, loss_adv_final), file=log_file, flush=True)
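In case it matters, pgd_resnet is a fairly standard L-infinity PGD attack. Roughly (a simplified sketch; the real function has the same structure but a few extra options, and torch / torch.nn.functional as F are imported at the top of AT.py):

    def pgd_resnet(model, data, target, step_size, epsilon, num_steps):
        # start from the clean input and take num_steps signed-gradient steps,
        # projecting back into the epsilon-ball around the clean input each time
        x_adv = data.detach().clone()
        for _ in range(num_steps):
            x_adv.requires_grad_()
            with torch.enable_grad():
                loss = F.cross_entropy(model(x_adv), target)
            grad = torch.autograd.grad(loss, [x_adv])[0]
            x_adv = x_adv.detach() + step_size * torch.sign(grad.detach())
            x_adv = torch.min(torch.max(x_adv, data - epsilon), data + epsilon)
            x_adv = torch.clamp(x_adv, 0.0, 1.0)
        return x_adv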
I noticed the error only occurs when I add the two losses together and call backward on the sum. If I call loss_nat.backward() or loss_adv.backward() alone, it works fine.
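To be concrete, with everything else in the loop unchanged:

    # each of these works when used by itself:
    loss_nat.backward()
    # loss_adv.backward()

    # but this fails:
    loss = loss_nat + beta * loss_adv
    loss.backward()   # RuntimeError: ... modified by an inplace operation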
Thanks for any advice.