You can use share_memory_()
and torch.multiprocessing.SimpleQueue
to implement IPC. E.g.:
import numpy as np
import torch
import torch.multiprocessing as mp
def func(rank, x, p2c, c2p):
    """Child worker for the CUDA-IPC demo.

    Moves the shared CPU tensor `x` to CUDA device `rank`, raises it to the
    `rank`-th power, and ships the resulting CUDA tensor to the parent via
    the `c2p` queue. It then blocks on `p2c` until the parent signals that
    it has dropped its copy, and only then exits.

    NOTE(review): `x.to(rank)` targets CUDA device index `rank`, so this
    assumes one visible GPU per child — confirm against the launch config.
    """
    powered = x.to(rank) ** rank
    c2p.put(powered)
    # Citing the multiprocessing doc: unlike CPU tensors, the sending
    # process is required to keep the original tensor as long as the
    # receiving process retains a copy of the tensor. The refcounting is
    # implemented under the hood but requires users to follow the next
    # best practices.
    p2c.get()  # block here so this process outlives the parent's copy
    print(f"child-{rank} done")
if __name__ == '__main__':
    nprocs = 2
    x = torch.ones(2, 2)
    # Place the CPU tensor in shared memory so every child sees the same storage.
    x.share_memory_()
    # 'spawn' is the start method required when child processes use CUDA.
    ctx = mp.get_context('spawn')
    c2p, p2c = ctx.SimpleQueue(), ctx.SimpleQueue()
    ps = [ctx.Process(target=func, args=(rank, x, p2c, c2p)) for rank in range(nprocs)]
    # Plain loop, not a side-effect list comprehension (the original built a
    # throwaway list of None values).
    for p in ps:
        p.start()
    # Receive one tensor from each child.
    tensors = [c2p.get() for _ in range(nprocs)]
    print(tensors)
    # Release our references BEFORE telling the children they may exit --
    # the sender must stay alive while the receiver holds a copy (CUDA IPC
    # refcounting caveat cited in func).
    del tensors
    for p in ps:
        p2c.put(0)  # signal: parent no longer holds the tensor
        p.join()
    print("parent done")