rahul003
(Rahul)
June 9, 2020, 12:54am
Hi,
I'm trying to use distributed autograd, but I'm running into the error below. I have already initialized the process group with dist.init_process_group():
    with dist_autograd.context() as context_id:
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/distributed/autograd/__init__.py", line 33, in __enter__
    self.autograd_context = _new_context()
RuntimeError: Need to initialize distributed autograd using torch.distributed.autograd.init()
But I see no init method in torch.distributed.autograd
>>> from torch.distributed import autograd
>>> autograd
<module 'torch.distributed.autograd' from '/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/distributed/autograd/__init__.py'>
>>> autograd.
autograd.DistAutogradContext( autograd.division autograd.sys
autograd.absolute_import autograd.get_gradients( autograd.torch
autograd.backward( autograd.is_available( autograd.unicode_literals
autograd.context( autograd.print_function
mrshenli
(Shen Li)
June 9, 2020, 2:41pm
Hey @rahul003,
Distributed autograd uses RPC, so you need to call init_rpc instead of init_process_group. See the toy example below:
import torch.multiprocessing as mp
import torch.distributed.rpc as rpc
import torch.distributed.autograd as dist_autograd
import os
import torch
from torch import optim
from torch.distributed.optim import DistributedOptimizer

def train():
    # all state for this forward/backward pass is scoped to this context
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        loss = t1 + t2
        # run the distributed backward pass for this context
        dist_autograd.backward(context_id, [loss.sum()])
        # gradients are accumulated per context, not in .grad
        grads = dist_autograd.get_gradients(context_id)
        print(grads[t1])
        print(grads[t2])

def run_worker(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    if rank == 0:
        rpc.init_rpc("worker0", rank=rank, world_size=world_size)
        train()
    else:
        # this worker only participates in RPC; nothing to run locally
        rpc.init_rpc("worker1", rank=rank, world_size=world_size)
    # block until all rpcs finish
    rpc.shutdown()

if __name__=="__main__":
    world_size = 2
    mp.spawn(run_worker, args=(world_size, ), nprocs=world_size, join=True)
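For a version where the autograd graph actually spans both workers, here is a minimal sketch (assuming the same two-worker setup as above, with worker0 driving training; train_remote is just an illustrative name that would replace train()). Part of the forward pass goes through rpc_sync, so distributed autograd records the send/recv functions and backward() propagates gradients across both processes:
def train_remote():
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        # compute t1 + t2 on worker1; the result comes back to worker0 and
        # the send/recv functions are recorded in this autograd context
        t3 = rpc.rpc_sync("worker1", torch.add, args=(t1, t2))
        loss = t3.sum()
        # the backward pass runs on both workers and accumulates gradients
        # into this context on each of them
        dist_autograd.backward(context_id, [loss])
        grads = dist_autograd.get_gradients(context_id)
        print(grads[t1])
        print(grads[t2])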
For more complete tutorials, please see the following links:
RL example and RNN example: https://pytorch.org/tutorials/intermediate/rpc_tutorial.html
Parameter server example: https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html
Distributed pipeline example: https://github.com/pytorch/examples/tree/master/distributed/rpc/pipeline