Hi everyone, I’m using nn.DataParallel to do multi GPU training. But it’s strange that I didn’t get any speedup on forward nor backward via DataParallel. Is there anything I miss when using DataParallel?
My code for profiling forward is as follows
import torch
import torch.nn as nn
import time
DIM = 128
class Generator(nn.Module):
def __init__(self):
super(Generator, self).__init__()
self.preprocess = nn.Sequential(
nn.Linear(128, 4 * 4 * 4 * DIM),
nn.BatchNorm1d(4 * 4 * 4 * DIM),
nn.ReLU(True),
)
self.main_module = nn.Sequential(
nn.ConvTranspose2d(
4 * DIM, 2 * DIM, kernel_size=4, stride=2, padding=1),
nn.BatchNorm2d(2 * DIM),
nn.ReLU(True),
nn.ConvTranspose2d(2 * DIM, DIM, kernel_size=4,
stride=2, padding=1),
nn.BatchNorm2d(DIM),
nn.ReLU(True),
nn.ConvTranspose2d(DIM, 3, kernel_size=4, stride=2, padding=1),
nn.Tanh(),
)
def forward(self, input):
output = self.preprocess(input)
output = output.view(-1, 4 * DIM, 4, 4)
output = self.main_module(output)
return output.view(-1, 3, 32, 32)
class Discriminator(nn.Module):
def __init__(self):
super(Discriminator, self).__init__()
self.main_module = nn.Sequential(
nn.Conv2d(3, DIM, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(),
# 16x16
nn.Conv2d(DIM, 2 * DIM, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(),
# 8x8
nn.Conv2d(2 * DIM, 4 * DIM, kernel_size=4, stride=2, padding=1),
nn.LeakyReLU(),
# 4 x 4
)
self.linear = nn.Linear(4 * 4 * 4 * DIM, 1)
def forward(self, input):
output = self.main_module(input)
output = output.view(-1, 4 * 4 * 4 * DIM)
output = self.linear(output)
return output
if __name__ == "__main__":
torch.backends.cudnn.benchmark = True
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
print(torch.__version__)
batch_size = 256
print('====Single GPU test====')
D = Discriminator().to(device)
G = Generator().to(device)
data = (torch.rand((batch_size, 3, 32, 32), device=device) - 0.5) / 0.5
z = torch.randn((batch_size, 128), device=device)
for i in range(2):
torch.cuda.synchronize()
start = time.time()
loss = D(data) - D(G(z))
torch.cuda.synchronize()
end = time.time()
if i != 0:# skip reporting for the first iteration because of cudnn.benchmark
print('Iter: %d; Forward time cost: %.6fs' % (i, end - start))
print('====Two GPUs test====')
D2 = Discriminator().to(device)
G2 = Generator().to(device)
D2 = nn.DataParallel(D2, list(range(2)))
G2 = nn.DataParallel(G2, list(range(2)))
for i in range(2):
torch.cuda.synchronize()
start = time.time()
loss = D(data) - D(G(z))
torch.cuda.synchronize()
end = time.time()
if i != 0: # skip reporting for the first iteration because of cudnn.benchmark
print('Iter: %d; Forward time cost: %.6fs' % (i, end - start))
Output is
cuda:0
1.7.0+cu101
====Single GPU test====
Iter: 1; Forward time cost: 0.013395s
====Two GPUs test====
Iter: 1; Forward time cost: 0.012617s
DataParallel doesn’t seem to speed up the forward pass.