DDP gradient inplace error

Hi, I’m trying to use DistributedDataParallel for adversarial training.
My code worked perfectly fine before, but I am facing an error after converting it to DDP.

The error message:

[W python_anomaly_mode.cpp:104] Warning: Error detected in CudnnBatchNormBackward0. Traceback of forward call that caused the error:
  File "AT.py", line 253, in <module>
  File "AT.py", line 235, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "AT.py", line 117, in train
    logit_nat = model(data)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hao/xh_Adversarial_Performance_Eval/model/wideresnet.py", line 87, in forward
    out = self.relu(self.bn1(out))
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 179, in forward
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/nn/functional.py", line 2283, in batch_norm
    input, weight, bias, running_mean, running_var, training, momentum, eps, torch.backends.cudnn.enabled
 (function _print_stack)
Traceback (most recent call last):
  File "AT.py", line 253, in <module>
  File "AT.py", line 235, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "AT.py", line 124, in train
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/_tensor.py", line 307, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/hao/anaconda3/envs/test/lib/python3.7/site-packages/torch/autograd/__init__.py", line 156, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [640]] is at version 13; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

My training code:

def train(args, model, device, train_loader, optimizer, epoch):
    """Run one epoch of adversarial training.

    Combines the natural-example loss and the PGD adversarial-example loss
    (weighted by ``beta``) and takes one optimizer step per batch.

    Args:
        args: namespace with ``step_size``, ``epsilon``, ``num_steps`` for PGD.
        model: the (possibly DDP-wrapped) network being trained.
        device: device the batches are moved to.
        train_loader: training DataLoader.
        optimizer: optimizer over ``model.parameters()``.
        epoch: current epoch number (for logging only).
    """
    beta = 1  # weight of the adversarial loss term relative to the natural loss
    loss_nat_sum = 0
    loss_adv_sum = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        logit_nat = model(data)
        loss_nat = F.cross_entropy(logit_nat, target)
        # Craft adversarial examples from the current model state.
        x_adv = pgd_resnet(model, data, target, args.step_size, args.epsilon, args.num_steps)
        logit_adv = model(x_adv)
        loss_adv = F.cross_entropy(logit_adv, target)
        loss = loss_nat + beta * loss_adv
        # These three lines were missing from the posted snippet (the original
        # traceback shows a backward() call inside train): without them the
        # optimizer is never stepped and nothing is learned.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_nat_sum += loss_nat.item()
        loss_adv_sum += loss_adv.item()
    # F.cross_entropy returns a per-batch mean, so average over the number of
    # batches, not the dataset size (the original divided by len(dataset)).
    loss_nat_final = loss_nat_sum / len(train_loader)
    loss_adv_final = loss_adv_sum / len(train_loader)
    # NOTE(review): assumes log_file is a module-level open file handle — confirm.
    print('Train Epoch: {} \tNatural Loss: {:.6f}, Adv Loss: {:.6f}'.format(epoch, loss_nat_final, loss_adv_final), file=log_file, flush=True)

I noticed the error only occurs when I add two losses together. If I do loss_nat.backward() or loss_adv.backward() alone, it works fine.

Thanks for any advice.

I cannot see any obvious issues in your code so could you post a minimal, executable code snippet to reproduce the error, please?

Hi, sorry for the very late response. I was chasing a conference deadline.

I managed to create an executable script that reproduces the same error on my side, using the command torchrun --nproc_per_node=2 test.py

import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
from torchvision import transforms
import torch.optim as optim
import os
# Standard CIFAR-10 augmentation. Both Compose calls were truncated in the
# pasted snippet (unclosed brackets); reconstructed here. No Normalize step,
# which is consistent with pgd_attack clamping pixels into [0, 1].
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
])

# torchrun sets RANK (one process per GPU); pin this process to its GPU.
rank = os.environ["RANK"]
device = torch.device(f'cuda:{int(rank)}')

# NOTE(review): DistributedSampler and DDP require an initialized process
# group; the posted snippet omitted this call (presumably lost in the paste).
torch.distributed.init_process_group(backend='nccl')

use_cuda = torch.cuda.is_available()
kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}

trainset = torchvision.datasets.CIFAR10(root='../data', train=True, download=True, transform=transform_train)
train_sampler = torch.utils.data.distributed.DistributedSampler(trainset, shuffle=True)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=128, sampler=train_sampler, **kwargs)
testset = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform_test)
test_sampler = torch.utils.data.distributed.DistributedSampler(testset, shuffle=False)
test_loader = torch.utils.data.DataLoader(testset, batch_size=128, sampler=test_sampler, **kwargs)

class test_model(nn.Module):
    """Minimal CIFAR-10 classifier: conv -> batchnorm -> relu -> linear.

    Small on purpose — just enough structure (a BatchNorm layer) to
    reproduce the DDP inplace-modification error.
    """

    def __init__(self):
        super(test_model, self).__init__()
        # Attribute names are kept as-is so state_dict keys stay stable.
        self.layer1 = nn.Conv2d(3, 8, kernel_size=3, stride=2, padding=1)
        self.norm = nn.BatchNorm2d(8)
        self.layer2 = nn.Linear(8 * 16 * 16, 10)

    def forward(self, x):
        # 32x32 input is halved by the stride-2 conv, giving 8x16x16 features.
        features = self.layer1(x)
        activated = F.relu(self.norm(features))
        flat = activated.reshape(-1, 8 * 16 * 16)
        return self.layer2(flat)

def train(model, opt, data_loader):
    """Run one epoch of adversarial training on the repro model.

    Args:
        model: DDP-wrapped network.
        opt: optimizer over ``model.parameters()``.
        data_loader: training DataLoader (distributed sampler).
    """
    loss_sum = 0.0
    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = data.to(device, non_blocking=True), target.to(device, non_blocking=True)
        x_adv = pgd_attack(model, data, target, 1./255., 8./255., 10, device)
        loss_nat = nn.CrossEntropyLoss()(model(data), target)
        loss_adv = nn.CrossEntropyLoss()(model(x_adv), target)
        loss = loss_nat + loss_adv
        # The original accumulated losses but never stepped the optimizer.
        opt.zero_grad()
        loss.backward()
        opt.step()
        # .item() detaches the value; accumulating the tensor itself would
        # keep every batch's autograd graph alive and leak memory.
        loss_sum += loss.item()
    print('train loss:{}'.format(loss_sum/len(data_loader.dataset)))

def pgd_attack(model, x, y, step_size=0.003, epsilon=0.031, perturb_steps=10, device=None):
    """Generate an L-infinity PGD adversarial example for a batch.

    The posted signature was truncated mid-line; the parameters are restored
    to match the call site ``pgd_attack(model, data, target, step, eps, n, device)``.

    Args:
        model: classifier producing logits.
        x: input batch, values in [0, 1].
        y: integer class targets.
        step_size: per-iteration L-inf step.
        epsilon: L-inf perturbation budget.
        perturb_steps: number of PGD iterations.
        device: device for the random start; defaults to ``x.device``.

    Returns:
        Perturbed batch clamped to [0, 1] and within ``epsilon`` of ``x``.
    """
    if device is None:
        device = x.device
    # reduction='sum' replaces the deprecated size_average=False.
    criterion_ce = nn.CrossEntropyLoss(reduction='sum')
    # Random start in a small ball around the clean input.
    x_adv = x.detach() + 0.001 * torch.randn(x.shape, device=device)
    for _ in range(perturb_steps):
        # x_adv must require grad so autograd.grad can differentiate w.r.t. it;
        # the detach above (and below) makes it a fresh leaf each iteration.
        x_adv.requires_grad_()
        with torch.enable_grad():
            loss_kl = criterion_ce(model(x_adv), y)
        grad = torch.autograd.grad(loss_kl, [x_adv])[0]
        x_adv = x_adv.detach() + step_size * torch.sign(grad.detach())
        # Project back into the epsilon ball, then into the valid pixel range.
        x_adv = torch.min(torch.max(x_adv, x - epsilon), x + epsilon)
        x_adv = torch.clamp(x_adv, 0.0, 1.0)
    return x_adv

def main():
    """Build the DDP model and run the training loop for the repro.

    NOTE(review): assumes torch.distributed.init_process_group(...) has
    already been called (torchrun launch) — DDP construction requires it.
    """
    model = test_model().to(device)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[int(rank)], find_unused_parameters=False)
    opt = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    print('training mode')
    for epoch in range(100):
        print('epoch: ', epoch)
        train(model, opt, train_loader)
    # NOTE(review): every rank writes the same file; typically guarded to rank 0.
    torch.save(model.state_dict(), 'model.pt')


if __name__ == '__main__':
    # The pasted snippet left this guard empty (a SyntaxError); restored.
    main()
I also found that the error doesn't occur when the model contains only linear layers.

Hope this information helps.

@ptrblck Sorry, it seems the error will occur when containing BatchNorm layer instead of Conv layer.

Thanks for sharing the code and the ping!
I can reproduce the issue and am currently unsure what exactly is causing it.
@kwen2501 do you know if the multiple forward passes might be causing the issue?
If I remove the second forward and just replace it with a constant the code seems to work:

        loss_nat = nn.CrossEntropyLoss()(model(data), target)
        loss_adv = 0.#nn.CrossEntropyLoss()(model(x_adv), target)

Hi, fortunately, I managed to find the solution. Replacing all the BatchNorm layers with SyncBatchNorm seems to solve the problem. Not sure why though.

That’s good to hear, but I still don’t fully understand why the inplace error is raised when plain batchnorm layers are used.