How did solves your problem?
Sometimes it may cause Permission Denied. Just change the RANDOM to a different int, then works for me!
I got same error after same trial. How did you solve it after removing ‘world_size’ and ‘rank’ parameter in dist.init_process_group()??
why does your solution work? my code now complained with this:
$ python playground/multiprocessing_playground/mnist-distributed.py
# gpus 2
Start running DDP with model parallel example on rank: 0.
current process: <SpawnProcess name='SpawnProcess-1' parent=1863 started>
pid: 1890
Start running DDP with model parallel example on rank: 1.
current process: <SpawnProcess name='SpawnProcess-2' parent=1863 started>
pid: 1892
Traceback (most recent call last):
File "playground/multiprocessing_playground/mnist-distributed.py", line 115, in <module>
main()
File "playground/multiprocessing_playground/mnist-distributed.py", line 30, in main
mp.spawn(train, nprocs=args.gpus, args=(args,))
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/home/miranda9/ML4Coq/playground/multiprocessing_playground/mnist-distributed.py", line 64, in train
dist.init_process_group(backend='nccl', init_method='env://')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 423, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 155, in _env_rendezvous_handler
raise _env_error("RANK")
ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set
self contained code:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
# from apex.parallel import DistributedDataParallel as DDP
# from apex import amp
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('-g', '--gpus', default=1, type=int,
help='number of gpus per node')
parser.add_argument('-nr', '--nr', default=0, type=int,
help='ranking within the nodes')
parser.add_argument('--epochs', default=2, type=int, metavar='N',
help='number of total epochs to run')
args = parser.parse_args()
args.gpus = torch.cuda.device_count()
args.world_size = args.gpus * args.nodes
os.environ['MASTER_ADDR'] = '10.57.23.164'
# os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8888'
mp.spawn(train, nprocs=args.gpus, args=(args,))
class ConvNet(nn.Module):
def __init__(self, num_classes=10):
super(ConvNet, self).__init__()
self.layer1 = nn.Sequential(
nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2))
self.fc = nn.Linear(7*7*32, num_classes)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.fc(out)
return out
def train(gpu, args):
print()
print(f"Start running DDP with model parallel example on rank: {gpu}.")
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
rank = args.nr * args.gpus + gpu
# dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
# dist.init_process_group(backend='nccl', init_method='env://')
torch.manual_seed(0)
model = ConvNet()
torch.cuda.set_device(gpu)
model.cuda(gpu)
batch_size = 100
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda(gpu)
optimizer = torch.optim.SGD(model.parameters(), 1e-4)
# Wrap the model
model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
# Data loading code
train_dataset = torchvision.datasets.MNIST(root='./data',
train=True,
transform=transforms.ToTensor(),
download=True)
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
num_replicas=args.world_size,
rank=rank)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=0,
pin_memory=True,
sampler=train_sampler)
start = datetime.now()
total_step = len(train_loader)
for epoch in range(args.epochs):
for i, (images, labels) in enumerate(train_loader):
images = images.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0 and gpu == 0:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step,
loss.item()))
if gpu == 0:
print("Training complete in: " + str(datetime.now() - start))
dist.destroy_process_group()
if __name__ == '__main__':
print(f'# gpus {torch.cuda.device_count()}')
main()
print('Done!\n\a')
Your suggestion worked for me but the random port was not working some times. So, I generated available port number by python -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'
I think this might also be useful to find free ports (better than random):
def find_free_port():
""" https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number """
import socket
from contextlib import closing
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
return str(s.getsockname()[1])
then do (is my guess)
init_method = f"localhost:{find_free_port()}"
@smth How do I get the flag within my python script that I am passing to torchrun
? I want to set the number --nproc_per_node=32
I am passing there automatically rather than making sure the two scripts match (note I want to set the world size myself e.g. I am using cpu parallel jobs and want to choose that value myself thus)
Hi, I am getting the same error, but when I removed the world_size and rank, I got this error.
ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set