I am working with DistributedDataParallel, trying to speed up my training process. However, after two epochs the distributed version does not perform as well as the normal (single-GPU) version.
The log of the distributed version:
Epoch [1/2], Step [100/150], Loss: 2.1133
Epoch [2/2], Step [100/150], Loss: 1.9204
Training complete in: 0:00:27.426653
Dev loss: 1.8674346208572388
The log of the normal version:
Epoch [1/2], Step [100/600], Loss: 2.1626
Epoch [1/2], Step [200/600], Loss: 1.9929
Epoch [1/2], Step [300/600], Loss: 1.9224
Epoch [1/2], Step [400/600], Loss: 1.7479
Epoch [1/2], Step [500/600], Loss: 1.6264
Epoch [1/2], Step [600/600], Loss: 1.5411
Epoch [2/2], Step [100/600], Loss: 1.4387
Epoch [2/2], Step [200/600], Loss: 1.3243
Epoch [2/2], Step [300/600], Loss: 1.2894
Epoch [2/2], Step [400/600], Loss: 1.1754
Epoch [2/2], Step [500/600], Loss: 1.1271
Epoch [2/2], Step [600/600], Loss: 1.1246
Training complete in: 0:00:53.779830
Dev loss: 1.1193695068359375
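From the step counts alone I can see that the distributed run only takes 150 optimizer steps per epoch instead of 600, so each process apparently sees a quarter of the data (world_size=4 is my inference from the logs; the launch command is not shown). A quick back-of-envelope check:

mnist_train_size = 60000   # len(train_dataset) for the MNIST train split
per_gpu_batch = 100

# single GPU: one process iterates over the whole dataset per epoch
steps_single = mnist_train_size // per_gpu_batch                  # 600
# DDP: the DistributedSampler gives each process a 1/world_size shard
world_size = 4                                                    # inferred from 600 / 150
steps_ddp = mnist_train_size // (world_size * per_gpu_batch)      # 150

So with the same learning rate, the distributed version performs 4x fewer parameter updates per epoch, although (as far as I understand) DDP averages gradients across processes, so each update is computed from 4x more samples.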
The source code:
Distributed version:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out
def train(gpu, args):
    # global rank of this process = node rank * gpus per node + local gpu index
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    # each process gets a disjoint 1/world_size shard of the training set
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=args.world_size,
                                                                    rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        train_sampler.set_epoch(epoch)  # reshuffle the shards each epoch
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step,
                                                                         loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
        dev_dataset = torchvision.datasets.MNIST(root='./data',
                                                 train=False,
                                                 transform=transforms.ToTensor(),
                                                 download=False)
        dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=True)
        _ = model.eval()
        with torch.no_grad():
            y_hat = []
            y = []
            for i, (images, labels) in enumerate(dev_loader):
                y.append(labels.cuda(non_blocking=True))
                y_hat.append(model(images.cuda(non_blocking=True)))
            y_hat = torch.cat(y_hat)
            y = torch.cat(y)
            loss = criterion(y_hat, y)
            print(f'Dev loss: {loss.item()}')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    ###################################################
    args.world_size = args.gpus * args.nodes          #
    os.environ['MASTER_ADDR'] = HOST                  #
    os.environ['MASTER_PORT'] = PORT                  #
    mp.spawn(train, nprocs=args.gpus, args=(args,))   #
    ###################################################
if __name__ == '__main__':
"""
Epoch [1/2], Step [100/150], Loss: 2.1133
Epoch [2/2], Step [100/150], Loss: 1.9204
Training complete in: 0:00:27.426653
Dev loss: 1.8674346208572388
"""
    main()
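One thing I am unsure about (this is not in my script above): would the linear learning-rate scaling rule be the right compensation here, i.e. scaling the learning rate in train() by the world size?

# hypothetical change to train() above, not something I have run:
# each DDP step averages gradients over world_size * batch_size samples,
# so the learning rate is often scaled proportionally
base_lr = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=base_lr * args.world_size)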
The single-GPU version:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out
def train(gpu, args):
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)
    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item())
                )
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
        dev_dataset = torchvision.datasets.MNIST(root='./data',
                                                 train=False,
                                                 transform=transforms.ToTensor(),
                                                 download=False)
        dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=True)
        _ = model.eval()
        with torch.no_grad():
            y_hat = []
            y = []
            for i, (images, labels) in enumerate(dev_loader):
                y.append(labels.cuda(non_blocking=True))
                y_hat.append(model(images.cuda(non_blocking=True)))
            y_hat = torch.cat(y_hat)
            y = torch.cat(y)
            loss = criterion(y_hat, y)
            print(f'Dev loss: {loss.item()}')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    train(0, args)
if __name__ == '__main__':
"""
Epoch [1/2], Step [100/600], Loss: 2.1626
Epoch [1/2], Step [200/600], Loss: 1.9929
Epoch [1/2], Step [300/600], Loss: 1.9224
Epoch [1/2], Step [400/600], Loss: 1.7479
Epoch [1/2], Step [500/600], Loss: 1.6264
Epoch [1/2], Step [600/600], Loss: 1.5411
Epoch [2/2], Step [100/600], Loss: 1.4387
Epoch [2/2], Step [200/600], Loss: 1.3243
Epoch [2/2], Step [300/600], Loss: 1.2894
Epoch [2/2], Step [400/600], Loss: 1.1754
Epoch [2/2], Step [500/600], Loss: 1.1271
Epoch [2/2], Step [600/600], Loss: 1.1246
Training complete in: 0:00:53.779830
Dev loss: 1.1193695068359375
"""
    main()
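As a diagnostic I am also considering rerunning the single-GPU script with the distributed run's effective batch size, so that both runs take 150 steps per epoch:

# hypothetical diagnostic, not something I have run yet:
# 100 per process x 4 processes (inferred from 600 vs 150 steps) = 400
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=400,
                                           shuffle=True,
                                           num_workers=0,
                                           pin_memory=True)

If the loss curve then roughly matches the distributed log, the difference comes from the number of optimizer steps rather than from DDP itself.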
Any help is appreciated.