Segfault with multiprocessing + queue

sharvil · May 15, 2020, 1:53am

I’m consistently getting segfaults with the following code:

import torch.multiprocessing as mp


def main(pidish, q):
  """Block until an item is available on ``q``, then print it.

  ``pidish`` is the leading positional argument supplied by the caller
  (``mp.spawn`` passes the process index there); it is not used here.
  """
  item = q.get()
  print(item)

if __name__ == '__main__':
  # NOTE(review): mp.Queue() is built with the default start method (fork on
  # Linux), whereas mp.spawn() below always launches workers with 'spawn'.
  # That mismatch is the reported cause of the SIGSEGV; creating the queue
  # from a 'spawn' context (mp.get_context('spawn').Queue()) reportedly
  # avoids it.
  q = mp.Queue()
  # Nothing is ever put on the queue, so a healthy worker would simply block
  # in q.get() rather than crash.
  mp.spawn(main, args=(q,), nprocs=1)

I’ve tried this with PyTorch 1.4 and 1.5 in a fresh conda environment on two separate Ubuntu 18.04 machines and get segfaults like below:

Traceback (most recent call last):
  File "test.py", line 10, in <module>
    mp.spawn(main, args=(q,), nprocs=1)
  File "/home/sharvil/.miniconda2/envs/torch-1.5/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 200, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/sharvil/.miniconda2/envs/torch-1.5/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 158, in start_processes
    while not context.join():
  File "/home/sharvil/.miniconda2/envs/torch-1.5/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 106, in join
    raise Exception(
Exception: process 0 terminated with signal SIGSEGV

I’m expecting a deadlock since the queue is empty but surely this trivial example shouldn’t segfault… What am I missing here? Can anyone else repro this?

sharvil · May 15, 2020, 2:28am

Well, it looks like this happens because the Queue is created using the default start_method (fork on Linux), whereas torch.multiprocessing.spawn() uses the spawn start method internally (ignoring the default).

On a related note, librosa brings in a dependency that calls multiprocessing.set_start_method on import. Since that method can only be called once, you can’t actually set the default to ‘spawn’. Instead, you’ll have to create a multiprocessing context and use that every time you want to construct a Queue.

What a mess.

SimonW · May 15, 2020, 2:34am

see https://github.com/librosa/librosa/issues/747 for the librosa bug

Brando_Miranda · February 18, 2021, 9:48pm

I am also getting this error but my script is super basic…how does one fix this?

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def example(rank: int, world_size: int) -> None:
    """Run a single forward/backward/optimizer step of a DDP-wrapped model.

    Args:
        rank: Rank handed to ``init_process_group``; it is also used as the
            device argument of every ``.to(rank)`` call and as the only
            entry of ``device_ids``.
        world_size: Number of participating processes, forwarded to
            ``init_process_group``.

    NOTE(review): the process group is never torn down in this function
    (there is no ``dist.destroy_process_group()`` call) -- confirm whether
    that is intentional for this minimal example.
    """
    # Rendezvous settings read by init_process_group: every process meets on
    # localhost at the hard-coded port 8888.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    # Join the default process group using the "gloo" backend.
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Build a 10 -> 10 linear layer and move it to device index `rank`.
    model = nn.Linear(10, 10).to(rank)
    # Wrap the local model in DistributedDataParallel, pinned to that device.
    ddp_model = DDP(model, device_ids=[rank])
    # Mean-squared-error loss and plain SGD over the wrapped parameters.
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # Forward pass on a random (20, 10) batch placed on the same device.
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    # Random targets with the same shape as the outputs.
    labels = torch.randn(20, 10).to(rank)
    # Backward pass through the DDP-wrapped model.
    loss_fn(outputs, labels).backward()
    # Apply one SGD update.
    optimizer.step()

def main():
    """Spawn one ``example`` worker per CUDA device that torch reports."""
    # A fixed value (e.g. 2) can be substituted here for a fixed-size run.
    gpu_count = torch.cuda.device_count()
    # join=True makes the call wait for every worker to finish.
    mp.spawn(example, args=(gpu_count,), nprocs=gpu_count, join=True)

# Script entry point: run the spawn driver, then report completion.
if __name__=="__main__":
    main()
    # '\a' is the ASCII bell character, so the final message also rings the
    # terminal bell where the terminal supports it.
    print('Done\n\a')

Error:

Traceback (most recent call last):
  File "playground/multiprocessing_playground/ddp_hello_world.py", line 42, in <module>
    main()
  File "playground/multiprocessing_playground/ddp_hello_world.py", line 36, in main
    mp.spawn(example,
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
    raise Exception(
Exception: process 0 terminated with signal SIGSEGV