Unable to use distributed autograd

Hi,

I’m trying to use distributed autograd, but I’m running into the error below. I have already initialized the process group with dist.init_process_group():

with dist_autograd.context() as context_id:
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/distributed/autograd/__init__.py", line 33, in __enter__
    self.autograd_context = _new_context()
RuntimeError: Need to initialize distributed autograd using torch.distributed.autograd.init()

But I don’t see an init method in torch.distributed.autograd:

>>> from torch.distributed import autograd
>>> autograd
<module 'torch.distributed.autograd' from '/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/distributed/autograd/__init__.py'>
>>> autograd.
autograd.DistAutogradContext(  autograd.division              autograd.sys
autograd.absolute_import       autograd.get_gradients(        autograd.torch
autograd.backward(             autograd.is_available(         autograd.unicode_literals
autograd.context(              autograd.print_function

Hey @rahul003,

Distributed autograd uses RPC, so you need to call rpc.init_rpc instead of dist.init_process_group. See the toy example below:

import torch.multiprocessing as mp
import torch.distributed.rpc as rpc
import torch.distributed.autograd as dist_autograd
import os
import torch
from torch import optim
from torch.distributed.optim import DistributedOptimizer


def train():
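    # each forward pass runs inside a distributed autograd context; the
    # context_id is later used to trigger backward and look up gradients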
    with dist_autograd.context() as context_id:
        t1 = torch.rand((3, 3), requires_grad=True)
        t2 = torch.rand((3, 3), requires_grad=True)
        loss = t1 + t2
        dist_autograd.backward(context_id, [loss.sum()])
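        # gradients are accumulated in the context (keyed by tensor),
        # not in the tensors' .grad fields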
        grads = dist_autograd.get_gradients(context_id)
        print(grads[t1])
        print(grads[t2])

def run_worker(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
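    # init_rpc uses MASTER_ADDR / MASTER_PORT for rendezvous, just like init_process_group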
    # every process joins the RPC group; worker names just need to be unique
    rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    if rank == 1:
        # only rank 1 drives training in this toy example; rank 0 just
        # participates in the RPC group and waits in rpc.shutdown()
        train()

    # block until all rpcs finish
    rpc.shutdown()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)
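
If you also want to update parameters with the gradients recorded in a distributed autograd context (that is what the DistributedOptimizer import above is for), you can wrap the parameters in RRefs and pass the context_id to the optimizer's step. Below is a minimal sketch, assuming it replaces train() in the example above and runs under the same init_rpc setup; the function name, the purely local RRefs, and the learning rate are just illustrative:

import torch
import torch.distributed.rpc as rpc
import torch.distributed.autograd as dist_autograd
from torch import optim
from torch.distributed.optim import DistributedOptimizer

def train_with_optimizer():
    # parameters wrapped in RRefs; in a real model some of these would
    # live on remote workers (e.g. created via rpc.remote)
    t1 = torch.rand((3, 3), requires_grad=True)
    t2 = torch.rand((3, 3), requires_grad=True)
    rref1, rref2 = rpc.RRef(t1), rpc.RRef(t2)

    with dist_autograd.context() as context_id:
        loss = (t1 + t2).sum()
        dist_autograd.backward(context_id, [loss])
        # DistributedOptimizer looks up the gradients recorded under this
        # context_id and applies one SGD step to every parameter RRef
        dist_optim = DistributedOptimizer(optim.SGD, [rref1, rref2], lr=0.05)
        dist_optim.step(context_id)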

For more complete tutorials, please see the following links:

  1. RL example and RNN example: https://pytorch.org/tutorials/intermediate/rpc_tutorial.html
  2. Parameter server example: https://pytorch.org/tutorials/intermediate/rpc_param_server_tutorial.html
  3. Distributed pipeline example: https://github.com/pytorch/examples/tree/master/distributed/rpc/pipeline