Multiprocessing gives errors with event & queue for two methods: forkserver & spawn

Dear all

System Info

OS: Ubuntu 18.04
Ver. pytorch 1.5.0 via conda

Summary

  1. torch.multiprocessing with Event and Queue outputs the correct values from the queue only if the multiprocessing start method is “fork”.
  2. The other two methods “spawn” and “forkserver” give errors.

My question is:
Q1. Does this phenomenon depend on the OS? In other words, Mac and Windows use the default start method “spawn”, while Linux uses “fork”. Do these default settings cause these errors on Linux for the non-default methods like spawn and forkserver?

Q2. If the answer to Q1 is NO, then on Linux, can I modify the following snippet to work with the other start methods, “forkserver” or “spawn”?

Q3. Generally speaking, which method has advantages in the aspect of performance? Which one would be bug free ?

Example Snippet

I am testing a very simple script, as follows.

import torch.multiprocessing as mp
import torch
from time import sleep
import random
from torch import nn

class MLP(nn.Module):
    """Minimal model for the repro: a single 4-in / 2-out linear layer."""

    def __init__(self):
        super().__init__()
        # Keep the attribute name `l` so state-dict keys stay unchanged.
        self.l = nn.Linear(4, 2)

    def forward(self, x):
        """Return the linear projection of ``x`` (last dim must be 4)."""
        return self.l(x)

class A(object):
    """Driver that fans a shared model out to 4 worker processes and
    collects one output tensor per worker through a multiprocessing Queue.
    """

    def __init__(self, *args, **kwargs):
        # do other stuff
        pass

    def put_in_q(self, model, xxx, idx, q, evt):
        """Worker-process body: run one forward pass on row ``idx`` of
        ``xxx``, put a clone of the result on ``q``, then block on ``evt``.

        NOTE(review): the final ``evt.wait()`` presumably keeps this worker
        alive until the parent has finished ``q.get()`` — with the
        file-descriptor sharing strategy, the queued tensor's storage is
        backed by an fd owned by this process, so exiting too early would
        invalidate it (consistent with the EOFError in the spawn/forkserver
        traceback). TODO confirm against torch.multiprocessing docs.
        """
        with torch.no_grad():
            print("before_forward:")
            # Inference only; .to("cpu") is a no-op if the tensor is already on CPU.
            y = model(xxx[idx,:].to("cpu"))
            print("after_forward:",y.shape)

        
        # Clone so the queued tensor does not alias the autograd/output buffer.
        q.put(y.clone())
        #q.put((-1.)*y.clone())
        evt.wait()

    def run(self):
        q = mp.Queue()
        xxx=torch.randn((8,4))
        print(xxx)

        model=MLP()
        # Move parameters to shared memory so all workers see the same weights.
        model.share_memory()
        es=[]  # one Event per worker, in creation order
        ps=[]  # worker processes, in creation order
        for i in range(4):
            e = mp.Event()
            es.append(e)
            p = mp.Process(target=self.put_in_q, args=(model, xxx, i, q, e))
            ps.append(p)

        for p in ps:
            p.start()

        ll = []
        for p, e in zip(ps, es):
            x = q.get()
            # NOTE(review): this sets the event of the i-th *created* process,
            # but q.get() returns items in completion order, which is not
            # guaranteed to match creation order — a mismatch can release a
            # producer whose tensor has not been consumed yet. Verify whether
            # this ordering assumption contributes to the spawn/forkserver
            # failure shown below.
            e.set()            
            # list.extend iterates the tensor, so ll accumulates 0-dim
            # scalar tensors (see the final printed list in the fork output).
            ll.extend(x)
            print(x)

        print(ll)

        for p in ps:
            p.join()


if __name__ == '__main__':
    # Start method must be chosen before any worker process is created;
    # this is the line toggled between "fork", "spawn" and "forkserver".
    mp.set_start_method("forkserver")
    sleep(3)
    A().run()

Output for the start method forkserver and spawn

python sand_mp.py 
tensor([[ 0.3348, -1.2975, -1.3644, -0.6104],
        [ 2.1064, -1.3236, -0.3818, -0.2861],
        [ 2.2618, -0.2790,  0.0428,  0.7381],
        [-0.1307, -0.7346,  1.0275,  1.6364],
        [ 0.8133,  0.7689, -1.2899,  1.4351],
        [-1.2942, -0.5722,  0.2302,  0.8681],
        [-1.5618, -0.8937, -2.2884,  0.9952],
        [-0.3108,  0.9760, -0.5666,  1.1212]])
before_forward:
after_forward: torch.Size([2])
tensor([ 0.1457, -0.1567])
before_forward:
after_forward: torch.Size([2])
tensor([ 0.6234, -0.3702])
before_forward:
after_forward: torch.Size([2])
Traceback (most recent call last):
  File "sand_mp.py", line 66, in <module>
    a.run()
  File "sand_mp.py", line 51, in run
    x = q.get()
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 294, in rebuild_storage_fd
    fd = df.detach()
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().authkey)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 498, in Client
    answer_challenge(c, authkey)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 741, in answer_challenge
    message = connection.recv_bytes(256)         # reject large message
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
    raise EOFError
EOFError
Traceback (most recent call last):
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/forkserver.py", line 261, in main
    old_handlers)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/forkserver.py", line 297, in _serve_one
    code = spawn._main(child_r)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/spawn.py", line 115, in _main
    self = reduction.pickle.load(from_parent)
  File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/synchronize.py", line 110, in __setstate__
    self._semlock = _multiprocessing.SemLock._rebuild(*state)
FileNotFoundError: [Errno 2] No such file or directory

Output for start method fork (the default method in Linux)

tensor([[-1.0548, -0.5989,  0.6931,  0.9045],
        [-0.6635, -0.6829, -1.6791, -1.3812],
        [ 0.1501, -0.1146,  0.1804, -1.0176],
        [ 1.2192,  0.4320,  0.0246,  0.2755],
        [ 0.7326,  1.1502,  0.3816, -0.3988],
        [ 0.9414,  0.1373,  0.2629, -0.2126],
        [ 0.7675, -1.4352, -0.3587,  0.1965],
        [-0.8086,  0.4615, -0.3425,  0.0837]])
before_forward:
after_forward: torch.Size([2])
before_forward:
after_forward: torch.Size([2])
before_forward:
after_forward: torch.Size([2])
before_forward:
after_forward: torch.Size([2])
tensor([0.5928, 0.2589])
tensor([-0.0896,  0.9283])
tensor([-0.0155,  0.7434])
tensor([0.2523, 0.0981])
[tensor(0.5928), tensor(0.2589), tensor(-0.0896), tensor(0.9283), tensor(-0.0155), tensor(0.7434), tensor(0.2523), tensor(0.0981)]