Dear all
System Info
OS: Ubuntu 18.04
Version: PyTorch 1.5.0 (installed via conda)
Summary
- torch.multiprocessing with Event and Queue outputs the correct values of queue only if the method of multiprocessing is “fork”.
- The other two methods “spawn” and “forkserver” give errors.
My question is:
Q1. Does this phenomenon depend on the OS? In other words, Mac and Windows use the default start method “spawn”, while Linux uses “fork”. Do these defaults cause the errors I see on Linux with the non-default methods, spawn and forkserver?
Q2. If the answer to Q1 is no: on Linux, can I modify the following snippet so that it works with the other start methods, “forkserver” or “spawn”?
Q3. Generally speaking, which start method has advantages in terms of performance? Which one is least likely to hit bugs?
Example Snippet
I am testing the following very simple script.
import torch.multiprocessing as mp
import torch
from time import sleep
import random
from torch import nn
class MLP(nn.Module):
def __init__(self,):
super().__init__()
self.l=nn.Linear(4,2)
def forward(self, x):
return self.l(x)
class A(object):
def __init__(self, *args, **kwargs):
# do other stuff
pass
def put_in_q(self, model, xxx, idx, q, evt):
with torch.no_grad():
print("before_forward:")
y = model(xxx[idx,:].to("cpu"))
print("after_forward:",y.shape)
q.put(y.clone())
#q.put((-1.)*y.clone())
evt.wait()
def run(self):
q = mp.Queue()
xxx=torch.randn((8,4))
print(xxx)
model=MLP()
model.share_memory()
es=[]
ps=[]
for i in range(4):
e = mp.Event()
es.append(e)
p = mp.Process(target=self.put_in_q, args=(model, xxx, i, q, e))
ps.append(p)
for p in ps:
p.start()
ll = []
for p, e in zip(ps, es):
x = q.get()
e.set()
ll.extend(x)
print(x)
print(ll)
for p in ps:
p.join()
if __name__ == '__main__':
mp.set_start_method("forkserver")
sleep(3)
a = A()
a.run()
Output for the start method forkserver and spawn
python sand_mp.py
tensor([[ 0.3348, -1.2975, -1.3644, -0.6104],
[ 2.1064, -1.3236, -0.3818, -0.2861],
[ 2.2618, -0.2790, 0.0428, 0.7381],
[-0.1307, -0.7346, 1.0275, 1.6364],
[ 0.8133, 0.7689, -1.2899, 1.4351],
[-1.2942, -0.5722, 0.2302, 0.8681],
[-1.5618, -0.8937, -2.2884, 0.9952],
[-0.3108, 0.9760, -0.5666, 1.1212]])
before_forward:
after_forward: torch.Size([2])
tensor([ 0.1457, -0.1567])
before_forward:
after_forward: torch.Size([2])
tensor([ 0.6234, -0.3702])
before_forward:
after_forward: torch.Size([2])
Traceback (most recent call last):
File "sand_mp.py", line 66, in <module>
a.run()
File "sand_mp.py", line 51, in run
x = q.get()
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/queues.py", line 113, in get
return _ForkingPickler.loads(res)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 294, in rebuild_storage_fd
fd = df.detach()
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 498, in Client
answer_challenge(c, authkey)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 741, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
Traceback (most recent call last):
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/forkserver.py", line 261, in main
old_handlers)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/forkserver.py", line 297, in _serve_one
code = spawn._main(child_r)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/spawn.py", line 115, in _main
self = reduction.pickle.load(from_parent)
File "/home/a-saito/.pyenv/versions/miniconda3-3.9.1/envs/pt37sand/lib/python3.7/multiprocessing/synchronize.py", line 110, in __setstate__
self._semlock = _multiprocessing.SemLock._rebuild(*state)
FileNotFoundError: [Errno 2] No such file or directory
Output for start method fork (the default method in Linux)
tensor([[-1.0548, -0.5989, 0.6931, 0.9045],
[-0.6635, -0.6829, -1.6791, -1.3812],
[ 0.1501, -0.1146, 0.1804, -1.0176],
[ 1.2192, 0.4320, 0.0246, 0.2755],
[ 0.7326, 1.1502, 0.3816, -0.3988],
[ 0.9414, 0.1373, 0.2629, -0.2126],
[ 0.7675, -1.4352, -0.3587, 0.1965],
[-0.8086, 0.4615, -0.3425, 0.0837]])
before_forward:
after_forward: torch.Size([2])
before_forward:
after_forward: torch.Size([2])
before_forward:
after_forward: torch.Size([2])
before_forward:
after_forward: torch.Size([2])
tensor([0.5928, 0.2589])
tensor([-0.0896, 0.9283])
tensor([-0.0155, 0.7434])
tensor([0.2523, 0.0981])
[tensor(0.5928), tensor(0.2589), tensor(-0.0896), tensor(0.9283), tensor(-0.0155), tensor(0.7434), tensor(0.2523), tensor(0.0981)]