How to use DataParallel in backward?

I am new here… I want to use 8 GPUs with DataParallel in both the forward pass (which succeeds) and the backward pass (which fails), and I don't know why. If I only use one GPU for the loss, i.e. criterion = nn.CrossEntropyLoss().cuda(), it works. However, I want to use all 8 GPUs. Here is my code.

from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': False} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x, target):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        output = F.log_softmax(x)
        return output

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

model = Net()
if args.cuda:
    model = torch.nn.DataParallel(model, device_ids=[0,1,2,3,4,5,6,7]).cuda()
    criterion = torch.nn.DataParallel(nn.CrossEntropyLoss(), device_ids=[0,1,2,3,4,5,6,7]).cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = torch.autograd.Variable(data), torch.autograd.Variable(target)
        optimizer.zero_grad()
        output = model(data, target)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            correct = 0
            pred = output.data.max(1)[1]  # get the index of the max log-probability
            correct += pred.eq(target.data).sum()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0],
                correct, len(target),
                100. * correct / len(target)))

def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += F.nll_loss(output, target).data[0]
        pred = output.data.max(1)[1]  # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()

    test_loss /= len(test_loader)  # loss function already averages over batch size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

for epoch in range(1, args.epochs + 1):
    train(epoch)
    test(epoch)

Your code has screwed-up formatting.

You can look at our examples (dcgan or imagenet) for correct usage of DataParallel.

Thanks for your help. I have run the examples, but it is too hard for me to understand the key step for DataParallel in the backward pass. Could you show me in a simple example like MNIST? Here is my code. It runs, but it only applies DataParallel in the forward pass.

from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable


parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda 

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 7, 'pin_memory': True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))


        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        output = F.log_softmax(x)
        return output

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

model = Net()
if args.cuda:
    model=torch.nn.DataParallel(model, device_ids=[0,1,2,3,4,5,6,7]).cuda()
   
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = torch.autograd.Variable(data), torch.autograd.Variable(target)
        optimizer.zero_grad()
        output = model(data)
        

        loss = F.nll_loss(output, target)

        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            correct = 0


            pred = output.data.max(1)[1]  # get the index of the max log-probability
            correct += pred.eq(target.data).sum()

            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0],
                correct, len(target),
                100. * correct / len(target)))

def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)

        test_loss += F.nll_loss(output, target).data[0]
        pred = output.data.max(1)[1] # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()

    test_loss = test_loss
    test_loss /= len(test_loader) # loss function already averages over batch size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    test(epoch)

First:

  • Don't double post. You've posted the same huge code block in the other thread; it's not helpful.

Second:

  • Why do you think DataParallel doesn't work in backward? Of course it works in backward too.

Oh, thanks!! Please forgive me. In fact, it is my first time posting a topic in a coding forum, and I won't double post again.
I printed the loss and found it is a scalar. I'm curious how DataParallel works in the backward pass.
Your docs say:
Data Parallelism is when we split the mini-batch of samples into multiple smaller mini-batches and run the computation for each of the smaller mini-batches in parallel.
So I think the loss should be a tensor of shape (8 × 1), with each smaller mini-batch corresponding to its own loss. Why is there only one scalar?

If you look at the examples, DataParallel is not applied to the entire network + loss. It is only applied to part of the network.

before adding DataParallel:
network = features (conv layers) -> classifier (linear layers)
error = loss_function(network(input), target)
error.backward()

After adding DataParallel:
network = DataParallel(features (conv layers)) -> classifier (linear layers)
error = loss_function(network(input), target)
error.backward()
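
In code, that pattern looks roughly like the following. This is only a minimal sketch in the spirit of the dcgan/imagenet examples, not code from this thread; the ParallelNet name and the layer sizes are illustrative.

import torch
import torch.nn as nn

class ParallelNet(nn.Module):
    def __init__(self, num_gpus=8):
        super(ParallelNet, self).__init__()
        features = nn.Sequential(
            nn.Conv2d(1, 10, kernel_size=5), nn.MaxPool2d(2), nn.ReLU(),
            nn.Conv2d(10, 20, kernel_size=5), nn.MaxPool2d(2), nn.ReLU(),
        )
        # replicate only the conv layers; the input batch is scattered along
        # dim 0 and the outputs are gathered back on GPU 0
        self.features = nn.DataParallel(features, device_ids=list(range(num_gpus)))
        self.classifier = nn.Sequential(
            nn.Linear(320, 50), nn.ReLU(), nn.Linear(50, 10))

    def forward(self, x):
        x = self.features(x)          # forward runs on all GPUs
        x = x.view(x.size(0), -1)
        return self.classifier(x)     # classifier runs on GPU 0

model = ParallelNet().cuda()
criterion = nn.CrossEntropyLoss().cuda()   # plain loss, not wrapped in DataParallel
# error = criterion(model(input), target); error.backward()
# autograd replays the scatter/gather in reverse, so the conv backward also
# runs on all GPUs even though the loss itself is a single scalar on GPU 0.

The point is that the loss never needs its own DataParallel wrapper: the backward of the wrapped sub-module is already distributed.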


So, how does DataParallel work in the backward pass when I only wrap my network (without the loss) in DataParallel?

https://discuss.pytorch.org/t/is-the-loss-function-paralleled-when-using-dataparallel/3346/2?u=bigxiuixu
By the way, I am following this discussion. I have also tried computing the loss as part of the model's forward function; here is the code.

def forward(self, x,target):
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
    x = x.view(-1, self.num_flat_features(x))
    x = F.relu(self.fc1(x))
    x = F.dropout(x, training=self.training)
    x = self.fc2(x)
    output = F.log_softmax(x)
    return F.nll_loss(output, target),output

and wrapped the network + loss in DataParallel:
model=torch.nn.DataParallel(model, device_ids=[0,1,2,3,4,5,6,7]).cuda()

But it fails with:

Traceback (most recent call last):
  File "main.py", line 135, in <module>
    train(epoch)
  File "main.py", line 98, in train
    loss.backward()
  File "/home/lab/anaconda2/lib/python2.7/site-packages/torch/autograd/variable.py", line 143, in backward
 'backward should be called only on a scalar (i.e. 1-element tensor) '
RuntimeError: backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable
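
If I understand the error, each GPU replica returns its own 1-element loss and DataParallel gathers them into a tensor with one entry per GPU, so backward() is no longer called on a scalar. Just as a guess on my side, reducing the gathered losses to a scalar in the training loop would look like this (assuming the forward above that returns F.nll_loss(output, target), output):

loss_per_gpu, output = model(data, target)   # DataParallel scatters data and target along dim 0
loss = loss_per_gpu.mean()                   # gathered losses (one per GPU) -> single scalar
loss.backward()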

The dcgan example code is:

if opt.cuda:
    netD.cuda()
    netG.cuda()
    criterion.cuda()
    input, label = input.cuda(), label.cuda()
    noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

So, does the criterion use just one GPU? If it uses just one GPU, what about the backward pass? Does that run on one GPU too?