I’m consistently getting segfaults with the following code:
import torch.multiprocessing as mp
def main(pidish, q):
    """Block until an item is available on ``q``, then print it.

    Args:
        pidish: Process index that ``mp.spawn`` passes as the first
            positional argument; not used here.
        q: Queue-like object exposing a blocking ``get()``.
    """
    print(q.get())
if __name__ == '__main__':
    # NOTE: this queue is built from the default multiprocessing context,
    # while mp.spawn() below always launches its workers with
    # start_method='spawn' (see the traceback that follows).
    q = mp.Queue()
    mp.spawn(main, args=(q,), nprocs=1)
I’ve tried this with PyTorch 1.4 and 1.5 in a fresh conda environment on two separate Ubuntu 18.04 machines and get segfaults like below:
Traceback (most recent call last):
File "test.py", line 10, in <module>
mp.spawn(main, args=(q,), nprocs=1)
File "/home/sharvil/.miniconda2/envs/torch-1.5/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/sharvil/.miniconda2/envs/torch-1.5/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
while not context.join():
File "/home/sharvil/.miniconda2/envs/torch-1.5/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 106, in join
raise Exception(
Exception: process 0 terminated with signal SIGSEGV
I’m expecting a deadlock since the queue is empty but surely this trivial example shouldn’t segfault… What am I missing here? Can anyone else repro this?
Well, it looks like this happens because the Queue is created using the default start method (fork on Linux), whereas torch.multiprocessing.spawn() uses the 'spawn' start method internally (ignoring the default).
On a related note, librosa brings in a dependency that calls multiprocessing.set_start_method on import. Since that method can only be called once, you can't actually set the default to 'spawn'. Instead, you'll have to create a multiprocessing context and use that every time you want to construct a Queue, e.g. ctx = mp.get_context('spawn'); q = ctx.Queue().
I am also getting this error but my script is super basic…how does one fix this?
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
def example(rank, world_size):
    """Run a single DDP forward/backward/optimizer step in one worker.

    Args:
        rank: Index of this worker in the process group; also used as the
            device index for ``.to(rank)`` and ``device_ids``.
        world_size: Total number of workers in the process group.
    """
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(10, 10).to(rank)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    # forward pass
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    labels = torch.randn(20, 10).to(rank)
    # backward pass
    loss_fn(outputs, labels).backward()
    # update parameters
    optimizer.step()
def main():
    """Spawn one ``example`` worker per CUDA device and wait for all of them.

    The worker count is ``torch.cuda.device_count()``; ``join=True`` makes
    ``mp.spawn`` block until every worker has exited.
    """
    # world_size = 2
    world_size = torch.cuda.device_count()
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)
if __name__ == "__main__":
    main()
    # '\a' is the BEL character, emitted after the completion message.
    print('Done\n\a')
Error:
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_hello_world.py", line 42, in <module>
main()
File "playground/multiprocessing_playground/ddp_hello_world.py", line 36, in main
mp.spawn(example,
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
raise Exception(
Exception: process 0 terminated with signal SIGSEGV