I am trying to run distributed training on a Docker cluster without GPUs, but I get "connection refused".

  1. 2 nodes ,1 container/node

  2. only cpu

  3. code run in container

  4. connect by tcp

  5. docker run --rm -it --ipc=host --network=host xxx
    
  6. python mnist.py --init-method tcp://ip:port --rank 0 --world-size 2
    python mnist.py --init-method tcp://ip:port --rank 1 --world-size 2
    

My code is here:

from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import time

import torch.nn.parallel
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

# Training settings: CLI flags for the distributed MNIST run.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=1024, metavar='N',
                    help='input batch size for training (default: 1024)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=20, metavar='N',
                    help='number of epochs to train (default: 20)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
# BUG FIX: was action='store_false' with default=False, so passing --no-cuda
# stored False — the same as the default — and the flag could never disable
# CUDA. store_true makes the flag work; behavior without the flag is unchanged.
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
# Rendezvous URL shared by all ranks, e.g. tcp://<rank0-ip>:<port>.
parser.add_argument('--init-method', type=str, default='tcp://127.0.0.1:23456')
parser.add_argument('--rank', type=int)
parser.add_argument('--world-size', type=int)

args = parser.parse_args()
# Use CUDA only when it is available and not explicitly disabled.
args.cuda = not args.no_cuda and torch.cuda.is_available()


# Join the process group before building any data loaders. Every rank must
# call this with the same init-method, backend, and world size; "gloo" is the
# CPU-capable backend. NOTE(review): gloo resolves peer IPs from the hostname,
# which can pick a loopback/wrong interface — set GLOO_SOCKET_IFNAME if so.
dist.init_process_group(init_method=args.init_method, backend="gloo",
                        world_size=args.world_size, rank=args.rank,
                        group_name="pytorch_test")

# Seed both CPU and (if used) GPU RNGs for reproducibility.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

train_dataset = datasets.MNIST('data', train=True, download=True,
               transform=transforms.Compose([
                   transforms.ToTensor(),
                   transforms.Normalize((0.1307,), (0.3081,))
               ]))

# Each rank iterates a disjoint shard of the training set.
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

# BUG FIX: shuffle must be False when a sampler is supplied — DataLoader
# raises ValueError for sampler + shuffle=True. The DistributedSampler
# already shuffles, re-seeded each epoch via set_epoch().
train_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=args.batch_size, shuffle=False, sampler=train_sampler, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


class Net(nn.Module):
    """Small LeNet-style CNN for 28x28 single-channel MNIST images.

    Two conv+maxpool stages reduce each image to 20 channels of 4x4
    (flattened to 320 features), followed by two fully connected layers
    producing per-class log-probabilities over the 10 digits.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)  # flatten: 20 channels * 4 * 4 = 320
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # BUG FIX: log_softmax without an explicit dim is deprecated and its
        # implicit choice can be wrong; normalize over the class dimension.
        return F.log_softmax(x, dim=1)

model = Net()

# FIX: DistributedDataParallelCPU is deprecated and removed in recent PyTorch.
# DistributedDataParallel handles CPU models when the process group uses the
# gloo backend, averaging gradients across ranks during backward().
model = torch.nn.parallel.DistributedDataParallel(model)
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

def train(epoch):
    """Run one training epoch over this rank's shard of the training data.

    Gradient averaging across ranks happens inside the DistributedDataParallel
    wrapper during loss.backward(); nothing extra is needed here.
    """
    model.train()
    # FIX: the deprecated Variable() wrapper (a no-op since PyTorch 0.4) is
    # removed; tensors from the loader are used directly.
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # NOTE: len(train_loader.dataset) is the full dataset size, while
            # this rank only sees its shard — the percentage is per-rank.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test():
    """Evaluate the model on the full test set and print average loss/accuracy."""
    model.eval()
    test_loss = 0
    correct = 0
    # FIX: torch.no_grad() replaces the deprecated Variable(..., volatile=True)
    # idiom, which is a no-op since PyTorch 0.4 and would otherwise build
    # autograd graphs during evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # FIX: reduction='sum' replaces the deprecated size_average=False;
            # we accumulate the summed loss and divide by the dataset size below.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.data.max(1, keepdim=True)[1]  # index of the max log-probability
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

tot_time = 0  # fixed: stray C-style trailing semicolon

for epoch in range(1, args.epochs + 1):
    # Re-seed the sampler so each epoch uses a different shuffle order;
    # without this every epoch sees the batches in the same order.
    train_sampler.set_epoch(epoch)
    start_cpu_secs = time.time()
    # long running
    train(epoch)
    end_cpu_secs = time.time()
    print("Epoch {} of {} took {:.3f}s".format(
        epoch, args.epochs, end_cpu_secs - start_cpu_secs))
    tot_time += end_cpu_secs - start_cpu_secs
    test()

print("Total time= {:.3f}s".format(tot_time))

Then I got this error:


  File "mnsit.py", line 43, in <module>
    dist.init_process_group(init_method=args.init_method,backend="gloo",world_size=args.world_size,rank=args.rank,group_name="pytorch_test")
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 416, in init_process_group
    timeout=timeout)
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 484, in _new_process_group_helper
    timeout=timeout)
RuntimeError: [/pytorch/third_party/gloo/gloo/transport/tcp/pair.cc:760] connect [127.0.1.1]:10129: Connection refused
root@pcl2-2288H-V5:/workspace/recommendation# python mnsit.py --init-method tcp://10.10.16.62:45795 --rank 0 --world-size 2
Traceback (most recent call last):
  File "mnsit.py", line 43, in <module>
    dist.init_process_group(init_method=args.init_method,backend="gloo",world_size=args.world_size,rank=args.rank,group_name="pytorch_test")
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 416, in init_process_group
    timeout=timeout)
  File "/opt/conda/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 484, in _new_process_group_helper
    timeout=timeout)
RuntimeError: [/pytorch/third_party/gloo/gloo/transport/tcp/pair.cc:760] connect [127.0.1.1]:39850: Connection refused

The Gloo backend tries to resolve each process’ IP address by looking at the host name. This likely resolves to the loopback address for you, looking at the error message.

You can set GLOO_SOCKET_IFNAME to the network interface name you want to use for communication and it will resolve the right IP address.

Also see the torch.distributed documentation.

Thanks, that fixed it — the machine has many network interfaces, and I tried all of them.