Understanding 'Buckets with more than one variable cannot include variables that expect a sparse gradient'

Hi everyone! I was using DDP to train a DCGAN, and it worked fine (with the same networks as in the DCGAN Tutorial — PyTorch Tutorials 1.9.0+cu102 documentation).
However, when I removed part of the code (the update step for the generator), I got this error: Buckets with more than one variable cannot include variables that expect a sparse gradient. To track down the cause, I commented out the code inside with torch.no_grad() and the error disappeared, which confuses me. Is the cause perhaps my use of torch.no_grad()? But then how did it work before I removed the generator update step?

Can anyone help me understand what is going on here? How could simply removing one update step lead to an error?

The error I got:

**** line 63, in subprocess_fn
real_pred = discriminator(real_data)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 692, in forward
    if self.reducer._rebuild_buckets():
RuntimeError: Buckets with more than one variable cannot include variables that expect a sparse gradient.

I compared the code with the error and the code without the error in the following two blocks.

Code with error:

import os

import torch
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam

import torchvision.utils as vutil
from models.dcgan import Generator, Discriminator
from utils.parser import train_base


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def subprocess_fn(rank, args):
    setup(rank, args.num_gpus)
    print(f'running on rank {rank}')
    generator = Generator().to(rank)
    discriminator = Discriminator().to(rank)

    if args.distributed:
        generator = DDP(generator, device_ids=[rank], broadcast_buffers=False)
        discriminator = DDP(discriminator, device_ids=[rank], broadcast_buffers=False)
    d_optim = Adam(discriminator.parameters(), lr=2e-4)
    g_optim = Adam(generator.parameters(), lr=2e-4)

    discriminator.train()
    generator.train()
    if rank == 0:
        fixed_z = torch.randn(64, 100, 1, 1).to(rank)

    pbar = range(args.iter)
    for e in pbar:
        real_data = torch.randn((args.batchsize, 3, 64, 64)).to(rank)
        real_pred = discriminator(real_data)

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)

        d_loss = d_logistic_loss(real_pred, fake_pred)

        d_optim.zero_grad()
        d_loss.backward()
        d_optim.step()

        if rank == 0 and e % 100 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                imgs = generator(fixed_z)
                vutil.save_image(imgs, f'{str(e).zfill(5)}.png', normalize=True)

    cleanup()
    print(f'Process {rank} exits...')


if __name__ == '__main__':
    parser = train_base()
    args = parser.parse_args()
    args.distributed = args.num_gpus > 1

    if args.distributed:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')

Code without error:

import os

import torch
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam
from torch.utils.data import DataLoader
import torchvision.utils as vutil

from models.dcgan import Generator, Discriminator
from utils.parser import train_base


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def subprocess_fn(rank, args):
    setup(rank, args.num_gpus)
    generator = Generator().to(rank)
    discriminator = Discriminator().to(rank)

    if args.distributed:
        generator = DDP(generator, device_ids=[rank], broadcast_buffers=False)
        discriminator = DDP(discriminator, device_ids=[rank], broadcast_buffers=False)
    d_optim = Adam(discriminator.parameters(), lr=2e-4)
    g_optim = Adam(generator.parameters(), lr=2e-4)

    discriminator.train()
    generator.train()
    if rank == 0:
        fixed_z = torch.randn(64, 100, 1, 1).to(rank)

    pbar = range(args.iter)
    for e in pbar:
        real_data = torch.randn((args.batchsize, 3, 64, 64)).to(rank)
        real_pred = discriminator(real_data)

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)

        d_loss = d_logistic_loss(real_pred, fake_pred)

        d_optim.zero_grad()
        d_loss.backward()
        d_optim.step()

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)
        g_loss = g_nonsaturating_loss(fake_pred)

        g_optim.zero_grad()
        g_loss.backward()
        g_optim.step()

        if rank == 0 and e % 100 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                imgs = generator(fixed_z)
                vutil.save_image(imgs, f'{str(e).zfill(5)}.png', normalize=True)

    cleanup()
    print(f'Process {rank} exits...')


if __name__ == '__main__':
    parser = train_base()

    args = parser.parse_args()
    args.distributed = args.num_gpus > 1

    if args.distributed:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')


Are you using any sparse tensors in your dataset? According to your stack trace, the error is getting hit in the forward pass, not in the code under torch.no_grad(). I tried running your example code that does not work (with slight modifications to argparse and the loss fn), but it is working for me. Do you have another, smaller example that demonstrates the issue?
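
Something like the following is what I have in mind for that check (just a sketch; the model/batch names are placeholders, not from your script):

import torch
import torch.nn as nn

def find_sparse_suspects(model, batch):
    # Sparse input tensors would show up here.
    tensors = batch if isinstance(batch, (list, tuple)) else [batch]
    for i, t in enumerate(tensors):
        if torch.is_tensor(t) and t.is_sparse:
            print(f'input {i} is a sparse tensor with shape {tuple(t.shape)}')
    # Modules that typically produce sparse gradients, e.g. nn.Embedding(sparse=True).
    for name, m in model.named_modules():
        if isinstance(m, nn.Embedding) and m.sparse:
            print(f'{name} is an nn.Embedding with sparse=True')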

import os

import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam

import torchvision.utils as vutil
import argparse


# ====== https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html ======

# Root directory for dataset
dataroot = "data/celeba"

# Number of workers for dataloader
workers = 2

# Batch size during training
batch_size = 128

# Spatial size of training images. All images will be resized to this
#   size using a transformer.
image_size = 64

# Number of channels in the training images. For color images this is 3
nc = 3

# Size of z latent vector (i.e. size of generator input)
nz = 100

# Size of feature maps in generator
ngf = 64

# Size of feature maps in discriminator
ndf = 64

# Number of training epochs
num_epochs = 5

# Learning rate for optimizers
lr = 0.0002

# Beta1 hyperparam for Adam optimizers
beta1 = 0.5

# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1

class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d( ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d( ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        return self.main(input)

class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input)

# =======================================


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def subprocess_fn(rank, args):
    setup(rank, args.num_gpus)
    print(f'running on rank {rank}')
    generator = Generator(args.num_gpus).to(rank)
    discriminator = Discriminator(args.num_gpus).to(rank)

    if args.distributed:
        generator = DDP(generator, device_ids=[rank], broadcast_buffers=False)
        discriminator = DDP(discriminator, device_ids=[rank], broadcast_buffers=False)
    d_optim = Adam(discriminator.parameters(), lr=2e-4)
    g_optim = Adam(generator.parameters(), lr=2e-4)

    discriminator.train()
    generator.train()
    if rank == 0:
        fixed_z = torch.randn(64, 100, 1, 1).to(rank)

    pbar = range(args.iter)
    for e in pbar:
        real_data = torch.randn((args.batchsize, 3, 64, 64)).to(rank)
        real_pred = discriminator(real_data)

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)

        d_loss = real_pred - fake_pred

        d_optim.zero_grad()
        d_loss.backward()
        d_optim.step()

        if rank == 0 and e % 100 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                imgs = generator(fixed_z)
                vutil.save_image(imgs, f'{str(e).zfill(5)}.png', normalize=True)

    cleanup()
    print(f'Process {rank} exits...')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.num_gpus = 2
    args.distributed = args.num_gpus > 1

    if args.distributed:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')

Hi Howard, thank you for replying. I ran your code and got the same error (I moved the save_image line and changed your loss to real_pred.mean() - fake_pred.mean() to make the loss a scalar). Here is the error I got. BTW, I don't think this issue is related to my dataset, because I don't use any dataset here; the 'data' is just random numbers generated by torch.randn().

running on rank 1
running on rank 0
Epoch D loss:-0.025493651628494263;
Traceback (most recent call last):
  File "debug2.py", line 178, in <module>
    mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/azav/Documents/LargeGANs/debug2.py", line 147, in subprocess_fn
    real_pred = discriminator(real_data)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 692, in forward
    if self.reducer._rebuild_buckets():
RuntimeError: Buckets with more than one variable cannot include variables that expect a sparse gradient.

The code I ran is:

import os

import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam

import torchvision.utils as vutil
import argparse


# Number of channels in the training images. For color images this is 3
nc = 3

# Size of z latent vector (i.e. size of generator input)
nz = 100

# Size of feature maps in generator
ngf = 64

# Size of feature maps in discriminator
ndf = 64

# Number of training epochs
num_epochs = 5

# Learning rate for optimizers
lr = 0.0002

# Beta1 hyperparam for Adam optimizers
beta1 = 0.5

# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1


class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d( ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d( ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        return self.main(input)


class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input)

# =======================================


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def subprocess_fn(rank, args):
    setup(rank, args.num_gpus)
    print(f'running on rank {rank}')
    generator = Generator(args.num_gpus).to(rank)
    discriminator = Discriminator(args.num_gpus).to(rank)

    if args.distributed:
        generator = DDP(generator, device_ids=[rank], broadcast_buffers=False)
        discriminator = DDP(discriminator, device_ids=[rank], broadcast_buffers=False)
    d_optim = Adam(discriminator.parameters(), lr=2e-4)
    g_optim = Adam(generator.parameters(), lr=2e-4)

    discriminator.train()
    generator.train()
    if rank == 0:
        fixed_z = torch.randn(64, 100, 1, 1).to(rank)

    pbar = range(args.iter)
    for e in pbar:
        real_data = torch.randn((args.batchsize, 3, 64, 64)).to(rank)
        real_pred = discriminator(real_data)

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)

        d_loss = real_pred.mean() - fake_pred.mean()

        d_optim.zero_grad()
        d_loss.backward()
        d_optim.step()

        if rank == 0 and e % 2 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                imgs = generator(fixed_z)

    cleanup()
    print(f'Process {rank} exits...')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_gpus', type=int, help='Number of GPUs', default=2)
    parser.add_argument('--batchsize', type=int, help='Batch size for each GPU', default=2)
    parser.add_argument('--iter', type=int, help='Total training iterations', default=10)
    args = parser.parse_args()
    args.distributed = args.num_gpus > 1

    if args.distributed:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')

I think it's related to the code block inside torch.no_grad(), even though the stack trace says the error is hit in the forward pass. The environment I'm using is PyTorch 1.8.1+cu101.
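
One thing I am considering to test that hypothesis (just a sketch, on the assumption that DDP's forward-pass bookkeeping is what matters): call the wrapped module directly for the sampling step, so that DistributedDataParallel.forward, and hence reducer._rebuild_buckets(), never runs under torch.no_grad():

        if rank == 0 and e % 100 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                # generator.module is the underlying (non-DDP) Generator when
                # args.distributed is True; calling it bypasses DDP.forward.
                g = generator.module if args.distributed else generator
                imgs = g(fixed_z)
                vutil.save_image(imgs, f'{str(e).zfill(5)}.png', normalize=True)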

Follow-up information on the previous code by Howard:
If I change the save_image line to vutil.save_image(imgs, '%e.png' % e, normalize=True), the error changes to

running on rank 0
running on rank 1
Epoch D loss:0.07447701692581177;
Traceback (most recent call last):
  File "debug2.py", line 178, in <module>
    mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/azav/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/azav/Documents/LargeGANs/debug2.py", line 147, in subprocess_fn
    real_pred = discriminator(real_data)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/azav/.local/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 692, in forward
    if self.reducer._rebuild_buckets():
RuntimeError: Out of range variable index specified.

Hey @Hongkai_Zheng

I tried the code below, but couldn’t reproduce the error either. Which version of PyTorch are you using? I am on 1.10.0a0+git9730d91.

BTW, I also checked that there are no sparse gradients after the backward pass.
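
(Roughly the kind of check I mean — a sketch using the variable names from the script below, placed right after d_loss.backward():)

        for model_name, model in [('generator', generator), ('discriminator', discriminator)]:
            for name, p in model.named_parameters():
                if p.grad is not None and p.grad.is_sparse:
                    print(f'{model_name}.{name} received a sparse gradient')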

import os

import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam

import torchvision.utils as vutil
import argparse


# Number of channels in the training images. For color images this is 3
nc = 3

# Size of z latent vector (i.e. size of generator input)
nz = 100

# Size of feature maps in generator
ngf = 64

# Size of feature maps in discriminator
ndf = 64

# Number of training epochs
num_epochs = 5

# Learning rate for optimizers
lr = 0.0002

# Beta1 hyperparam for Adam optimizers
beta1 = 0.5

# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1


class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d( ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d( ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        return self.main(input)


class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input)

# =======================================


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def subprocess_fn(rank, args):
    setup(rank, args.num_gpus)
    print(f'running on rank {rank}')
    generator = Generator(args.num_gpus).to(rank)
    discriminator = Discriminator(args.num_gpus).to(rank)

    if args.distributed:
        generator = DDP(generator, device_ids=[rank], broadcast_buffers=False)
        discriminator = DDP(discriminator, device_ids=[rank], broadcast_buffers=False)
    d_optim = Adam(discriminator.parameters(), lr=2e-4)
    g_optim = Adam(generator.parameters(), lr=2e-4)

    discriminator.train()
    generator.train()
    if rank == 0:
        fixed_z = torch.randn(64, 100, 1, 1).to(rank)

    pbar = range(args.iter)
    for e in pbar:
        real_data = torch.randn((args.batchsize, 3, 64, 64)).to(rank)
        real_pred = discriminator(real_data)

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)

        d_loss = real_pred.mean() - fake_pred.mean()

        d_optim.zero_grad()
        d_loss.backward()
        d_optim.step()

        if rank == 0 and e % 2 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                imgs = generator(fixed_z)

    cleanup()
    print(f'Process {rank} exits...')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_gpus', type=int, help='Number of GPUs', default=2)
    parser.add_argument('--batchsize', type=int, help='Batch size for each GPU', default=2)
    parser.add_argument('--iter', type=int, help='Total training iterations', default=10)
    args = parser.parse_args()
    args.distributed = args.num_gpus > 1

    if args.distributed:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')

Hi @mrshenli, thank you for replying. I'm using PyTorch 1.8.1 with CUDA 10.1. Can you also try the following code in your environment? It is the version for which I got RuntimeError: Buckets with more than one variable cannot include variables that expect a sparse gradient. Is this an issue related to the PyTorch or CUDA version?
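
(To compare environments exactly, a small snippet that can be run on both sides — just a sketch; the values in the comments are from my machine:)

import torch

print('torch:', torch.__version__)               # 1.8.1+cu101 here
print('cuda build:', torch.version.cuda)         # CUDA toolkit the wheel was built with
print('cudnn:', torch.backends.cudnn.version())
print('nccl:', torch.cuda.nccl.version())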

import os

import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam

import torchvision.utils as vutil
import argparse


# ====== https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html ======

# Root directory for dataset
dataroot = "data/celeba"

# Number of workers for dataloader
workers = 2

# Batch size during training
batch_size = 128

# Spatial size of training images. All images will be resized to this
#   size using a transformer.
image_size = 64

# Number of channels in the training images. For color images this is 3
nc = 3

# Size of z latent vector (i.e. size of generator input)
nz = 100

# Size of feature maps in generator
ngf = 64

# Size of feature maps in discriminator
ndf = 64

# Number of training epochs
num_epochs = 5

# Learning rate for optimizers
lr = 0.0002

# Beta1 hyperparam for Adam optimizers
beta1 = 0.5

# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1


class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d( ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d( ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        return self.main(input)


class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # input is (nc) x 64 x 64
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 32 x 32
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 16 x 16
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 8 x 8
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*8) x 4 x 4
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input)

# =======================================


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '7777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def subprocess_fn(rank, args):
    setup(rank, args.num_gpus)
    print(f'running on rank {rank}')
    generator = Generator(args.num_gpus).to(rank)
    discriminator = Discriminator(args.num_gpus).to(rank)

    if args.distributed:
        generator = DDP(generator, device_ids=[rank], broadcast_buffers=False)
        discriminator = DDP(discriminator, device_ids=[rank], broadcast_buffers=False)
    d_optim = Adam(discriminator.parameters(), lr=2e-4)
    g_optim = Adam(generator.parameters(), lr=2e-4)

    discriminator.train()
    generator.train()
    if rank == 0:
        fixed_z = torch.randn(64, 100, 1, 1).to(rank)

    pbar = range(args.iter)
    for e in pbar:
        real_data = torch.randn((args.batchsize, 3, 64, 64)).to(rank)
        real_pred = discriminator(real_data)

        latent = torch.randn((args.batchsize, 100, 1, 1)).to(rank)
        fake_data = generator(latent)
        fake_pred = discriminator(fake_data)

        d_loss = real_pred.mean() - fake_pred.mean()

        d_optim.zero_grad()
        d_loss.backward()
        d_optim.step()

        if rank == 0 and e % 2 == 0:
            print(f'Epoch D loss:{d_loss.item()};')
            with torch.no_grad():
                imgs = generator(fixed_z)
                vutil.save_image(imgs, f'{e}.png', normalize=True)

    cleanup()
    print(f'Process {rank} exits...')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_gpus', type=int, help='Number of GPUs', default=2)
    parser.add_argument('--batchsize', type=int, help='Batch size for each GPU', default=2)
    parser.add_argument('--iter', type=int, help='Total training iterations', default=10)
    args = parser.parse_args()
    args.distributed = args.num_gpus > 1

    if args.distributed:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')