Hi, I’m trying to start 2 training tasks on a single node, with each task occupying 2 GPUs. Everything is fine with task 1; however, when I try to launch task 2, I encounter the following error:
Traceback (most recent call last):
File "dist_test.py", line 93, in <module>
train()
File "dist_test.py", line 62, in train
init_method='env://',
File "/data0/whr/anaconda3/envs/torch1.0/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 354, in init_process_group
store, rank, world_size = next(rendezvous(url))
File "/data0/whr/anaconda3/envs/torch1.0/lib/python3.6/site-packages/torch/distributed/rendezvous.py", line 143, in _env_rendezvous_handler
store = TCPStore(master_addr, master_port, start_daemon)
RuntimeError: Address already in use
Traceback (most recent call last):
File "dist_test.py", line 93, in <module>
train()
File "dist_test.py", line 69, in train
output_device = args.local_rank
File "/data0/whr/anaconda3/envs/torch1.0/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 215, in __init__
self.broadcast_bucket_size)
File "/data0/whr/anaconda3/envs/torch1.0/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 377, in _dist_broadcast_coalesced
dist._dist_broadcast_coalesced(self.process_group, tensors, buffer_size, False)
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1544081127912/work/torch/lib/c10d/ProcessGroupNCCL.cpp:260, unhandled system error
Because I want to use sync batch norm, which is only supported with DistributedDataParallel, I unfortunately cannot switch to DataParallel. It would be great if somebody could tell me whether it’s possible to run 2 DistributedDataParallel tasks on a single node at the same time.
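For reference, the sync BN + DDP pattern I have in mind looks roughly like this (a minimal sketch; it assumes torch.nn.SyncBatchNorm.convert_sync_batchnorm, which ships with PyTorch releases newer than the 1.0 env in the traceback, so it only illustrates why DataParallel is not an option rather than being my actual code):
import torch
import torch.nn as nn

# env:// rendezvous, just like the script below; MASTER_ADDR/MASTER_PORT/
# RANK/WORLD_SIZE are assumed to be set by torch.distributed.launch
torch.distributed.init_process_group(backend='nccl', init_method='env://')

local_rank = 0  # would come from --local_rank in the real script
torch.cuda.set_device(local_rank)

model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.BatchNorm2d(64),    # ordinary BN layer ...
    nn.ReLU(inplace=True),
).cuda()

# ... converted in place to synchronized batch norm, then wrapped in DDP
model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
model = nn.parallel.DistributedDataParallel(
    model, device_ids=[local_rank], output_device=local_rank)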
Here is an example code snippet for reproducing this problem:
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import argparse
import os


def parse_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--local_rank', dest='local_rank', type=int, default=0)
    parse.add_argument("--gpu", type=str, default='None',
                       help="choose gpu device.")
    return parse.parse_args()


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(64, 256, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
        # self.linear = nn.Linear(512, 10)

    def forward(self, x):
        H, W = x.size()[2:]
        x = self.conv1(x)
        x = self.conv2(x)
        logits = self.conv3(x)
        logits = F.interpolate(logits, (H, W), mode='bilinear')
        return logits


def train():
    args = parse_args()
    if not args.gpu == 'None':
        device = torch.device("cuda")
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    else:
        device = torch.device("cpu")

    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(
        backend='nccl',
        init_method='env://',
    )

    net = Net()
    net = net.to(device)
    net = nn.parallel.DistributedDataParallel(
        net,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
    )
    net.train()

    optim = torch.optim.SGD(
        net.parameters(),
        lr=1e-3,
        momentum=0.9,
        weight_decay=5e-4,
    )
    criteria = nn.CrossEntropyLoss()

    for i in range(10000):
        img = torch.randn(2, 3, 128, 128).cuda()
        lb = torch.randint(0, 19, [2, 128, 128]).cuda()
        optim.zero_grad()
        out = net(img)
        loss = criteria(out, lb)
        loss.backward()
        loss_val = loss.item()
        optim.step()
        print(loss_val)


if __name__ == "__main__":
    train()
I run the following command for task 1:
python -m torch.distributed.launch --nproc_per_node 2 dist_test.py --gpu 0,1
and the command for task 2:
python -m torch.distributed.launch --nproc_per_node 2 dist_test.py --gpu 2,3
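From the first traceback, the failure happens when the TCPStore is created during rendezvous, and both commands rely on torch.distributed.launch's default master port (29500) since neither passes --master_port. Assuming that port collision is the cause, would giving the second task its own port, e.g.
python -m torch.distributed.launch --nproc_per_node 2 --master_port 29501 dist_test.py --gpu 2,3
be the right way to run both tasks at once? (29501 is just an arbitrary free port picked for illustration.)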
Thanks in advance!