DataParallel and DistributedDataParallel both run with no runtime errors, and the network is loaded onto the correct GPUs, but then GPU utilization sits at 100% forever (the longest I waited was an hour).
GPU: 2x RTX 8000 (48 GB of memory each), and no, the memory is not full.
I'm fairly sure the code isn't the issue, since I downloaded several different sample scripts and they all show the same behavior. Below is one of the examples I tried. I ran each case: the one without any distributed training works fine, while both DistributedDataParallel and DataParallel hang as described above.
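For anyone trying to reproduce this: a hang during the very first multi-GPU operation is often a GPU-to-GPU (peer-to-peer) transport problem, so for the DDP case I would first turn on NCCL's logging. The snippet below is only a debugging sketch on top of the repro, using documented NCCL environment variables, not something from the sample code:

import os

# Debugging sketch (my addition, standard NCCL environment variables).
# These must be set before the first CUDA/NCCL call in the process.
os.environ['NCCL_DEBUG'] = 'INFO'         # log which transports NCCL selects
os.environ['NCCL_DEBUG_SUBSYS'] = 'INIT'  # focus the log on initialization
# If the log stalls while setting up peer-to-peer channels, uncommenting the
# next line makes NCCL copy through host memory instead of GPU P2P:
# os.environ['NCCL_P2P_DISABLE'] = '1'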
System:
$ uname -a
Linux lqfaris 5.4.0-58-generic #64~18.04.1-Ubuntu SMP Wed Dec 9 17:11:11 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
# pytorch installed through:
conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch
$ conda list|grep torch
pytorch 1.9.0 py3.8_cuda10.2_cudnn7.6.5_0 pytorch
torchaudio 0.9.0 py38 pytorch
torchvision 0.10.0 py38_cu102 pytorch
$ pip list|grep torch
torch 1.9.0
torchaudio 0.9.0a0+33b2469
torchvision 0.10.0
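Both cards do show up in PyTorch; a sanity check along these lines (plain torch.cuda calls, my addition rather than part of the repro) confirms the devices are visible before DataParallel/DistributedDataParallel get involved:

import torch

# Sanity check (my addition): confirm both GPUs are visible and usable.
print(torch.cuda.is_available())   # expected: True
print(torch.cuda.device_count())   # expected: 2
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))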
Code to reproduce:
import torch.nn as nn
import torch
import time


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2048, 1024)

    def forward(self, x):
        x = self.fc1(x)
        return x


net = Net()
net = net.cuda()
net = nn.DataParallel(net, device_ids=[0, 1])
net.train()

x = torch.randn(1 * 3 * 4 * 8, 2048).cuda()
for _ in range(10):
    tis = time.time()
    x = x.cuda()
    print('net(x)')
    net(x)  # <------ stuck here
    print(time.time() - tis)
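Since DataParallel has to broadcast the replica parameters from GPU 0 to GPU 1 before every forward pass, a hang at net(x) suggests the GPU-to-GPU copy itself never completes. A minimal peer-to-peer check (my own sketch, not part of the sample code) isolates that:

import torch

# Peer-to-peer sketch (my addition): if this also hangs, the problem is
# raw GPU<->GPU transfers, not DataParallel or the model.
print(torch.cuda.can_device_access_peer(0, 1))  # whether P2P is reported

a = torch.randn(1024, 1024, device='cuda:0')
b = a.to('cuda:1')          # direct device-to-device copy
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)
print(b.sum().item())       # forces the copy (and any hang) to surface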
Example for DistributedDataParallel:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
from tqdm.auto import tqdm


def find_free_port():
    import socket
    from contextlib import closing
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(find_free_port())
    mp.spawn(train, nprocs=args.gpus, args=(args,))


class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7 * 7 * 32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out


def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=args.world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 16 * args.world_size
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='/data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=args.world_size,
                                                                    rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        train_sampler.set_epoch(epoch)  # reshuffle each rank's shard per epoch
        for i, (images, labels) in enumerate(tqdm(train_loader)):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1, args.epochs, i + 1, total_step, loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
    dist.destroy_process_group()


if __name__ == '__main__':
    main()
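I launch the DDP example on this machine with one node and two GPUs, e.g. python mnist_ddp.py -n 1 -g 2 (the script name is just whatever the file is saved as; the flags are the ones defined in main() above), and it hangs with the GPUs pinned at 100% in the same way as the DataParallel example.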