I'm new here. I want to use 8 GPUs with `DataParallel` in both the forward pass (which succeeds) and the backward pass (which fails), and I don't know why. If I use only one GPU for the backward pass, i.e. the criterion line is `criterion = nn.CrossEntropyLoss().cuda()`, it works. However, I want to use all 8 GPUs.
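For comparison, this is the variant that trains without error for me; only the criterion line differs from the full listing below (the model is still replicated across all 8 GPUs, but the loss is computed on a single device):

```python
# Working variant: DataParallel for the model only, plain loss on one GPU
model = torch.nn.DataParallel(Net(), device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()
criterion = nn.CrossEntropyLoss().cuda()
```

Here is my full code with the `DataParallel` criterion that fails: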
```python
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': False} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x, target=None):
        # target is unused; it is optional so that test() can still call model(data)
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        output = F.log_softmax(x)
        return output

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


model = Net()
if args.cuda:
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()
    criterion = torch.nn.DataParallel(nn.CrossEntropyLoss(), device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data, target)
        loss = criterion(output, target)
        loss.backward()  # this is the call that fails
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            pred = output.data.max(1)[1]  # get the index of the max log-probability
            correct = pred.eq(target.data).sum()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}/{} ({:.0f}%)'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data[0],
                correct, len(target), 100. * correct / len(target)))


def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += F.nll_loss(output, target).data[0]
        pred = output.data.max(1)[1]  # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()

    test_loss /= len(test_loader)  # loss function already averages over batch size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


for epoch in range(1, args.epochs + 1):
    train(epoch)
    test(epoch)
```
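My current guess (just an assumption on my part, not verified): `DataParallel` gathers the outputs of its replicas along dimension 0, so wrapping `nn.CrossEntropyLoss` in it makes `criterion(output, target)` return one loss value per GPU instead of a single scalar, and calling `.backward()` on a non-scalar fails without an explicit gradient argument. If that is the cause, reducing the gathered losses to a scalar first might work:

```python
loss = criterion(output, target)
# criterion is wrapped in DataParallel, so `loss` may hold one value per GPU;
# average them into a single scalar before backpropagating (assumption, untested)
loss = loss.mean()
loss.backward()
```

Is this the right way to use multiple GPUs for the loss, or is there a better approach?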