PyTorch multiprocessing with CUDA sets tensors to 0

The following code works perfectly on CPU. On CUDA, the second print shows that the weights are all 0.

If I don’t pass l to the pool, it works. If I replace the concurrent.futures pool with mp.Process, the weights are still 0.

This happens only on CUDA.

What am I doing wrong?

Python 3.10.9
PyTorch 2.0.0
CUDA 11.7

import torch
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from torch import multiprocessing as mp

def goo(l):
    # Run the shared module on a CUDA input inside the worker process.
    return l(torch.rand(2, device='cuda'))

def run():
    ctx = mp.get_context('spawn')

    l = torch.nn.Linear(2, 2).to('cuda').share_memory()
    print(vars(l))  # first print: weights are initialized as expected

    # Hand the shared CUDA module to a child process.
    pool = ProcessPoolExecutor(1, mp_context=ctx)
    pool.submit(goo, l)

    def foo():
        print(vars(l))  # second print: on CUDA the weights are now all 0

    thread = ThreadPoolExecutor(1)
    thread.submit(foo)

if __name__ == '__main__':
    run()
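
For reference, the mp.Process variant I mentioned above ends up with the weights zeroed out just the same. Roughly what I tried (a minimal sketch, not the exact script):

import torch
from torch import multiprocessing as mp

def goo(l):
    return l(torch.rand(2, device='cuda'))

def run():
    ctx = mp.get_context('spawn')

    l = torch.nn.Linear(2, 2).to('cuda').share_memory()
    print(vars(l))   # weights look fine here

    p = ctx.Process(target=goo, args=(l,))
    p.start()
    p.join()

    print(vars(l))   # weights and bias come back as all 0

if __name__ == '__main__':
    run()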

What PyTorch version are you using? I cannot reproduce your issue with a recent version of it.

Python 3.10.9
PyTorch 2.0.0
CUDA 11.7

Happens on Windows.

So for you, my code doesn’t print zeroed weights and bias in the second print?

EDIT: also tested with the latest nightly; the weights are still 0.

By the way, this is the output I get. The second print shows that the weights and bias are 0.

{'training': True, '_parameters': OrderedDict([('weight', Parameter containing:
tensor([[0.5381, 0.1457],
        [0.2658, 0.5354]], device='cuda:0', requires_grad=True)), ('bias', Parameter containing:
tensor([-0.6313,  0.0193], device='cuda:0', requires_grad=True))]), '_buffers': OrderedDict(), '_non_persistent_buffers_set': set(), '_backward_pre_hooks': OrderedDict(), '_backward_hooks': OrderedDict(), '_is_full_backward_hook': None, '_forward_hooks': OrderedDict(), '_forward_hooks_with_kwargs': OrderedDict(), '_forward_pre_hooks': OrderedDict(), '_forward_pre_hooks_with_kwargs': OrderedDict(), '_state_dict_hooks': OrderedDict(), '_state_dict_pre_hooks': OrderedDict(), '_load_state_dict_pre_hooks': OrderedDict(), '_load_state_dict_post_hooks': OrderedDict(), '_modules': OrderedDict(), 'in_features': 2, 'out_features': 2}
{'training': True, '_parameters': OrderedDict([('weight', Parameter containing:
tensor([[0., 0.],
        [0., 0.]], device='cuda:0', requires_grad=True)), ('bias', Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True))]), '_buffers': OrderedDict(), '_non_persistent_buffers_set': set(), '_backward_pre_hooks': OrderedDict(), '_backward_hooks': OrderedDict(), '_is_full_backward_hook': None, '_forward_hooks': OrderedDict(), '_forward_hooks_with_kwargs': OrderedDict(), '_forward_pre_hooks': OrderedDict(), '_forward_pre_hooks_with_kwargs': OrderedDict(), '_state_dict_hooks': OrderedDict(), '_state_dict_pre_hooks': OrderedDict(), '_load_state_dict_pre_hooks': OrderedDict(), '_load_state_dict_post_hooks': OrderedDict(), '_modules': OrderedDict(), 'in_features': 2, 'out_features': 2}

@colesbury any idea what the issue could be on Windows?

@kumpera, no idea, other than it sounds like a bug with sharing GPU memory across processes on Windows.
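
If that's what's going on, one workaround worth trying (just a sketch, not verified on Windows) is to keep the shared copy in CPU shared memory and move it onto the GPU inside the worker, so no CUDA storage has to cross the process boundary:

import torch
from torch import multiprocessing as mp

def goo(l_cpu):
    # Build the CUDA copy inside the worker instead of receiving CUDA storage.
    l_cuda = l_cpu.to('cuda')
    return l_cuda(torch.rand(2, device='cuda'))

def run():
    ctx = mp.get_context('spawn')

    # CPU shared memory is supported across processes on every platform.
    l = torch.nn.Linear(2, 2).share_memory()
    print(vars(l))

    p = ctx.Process(target=goo, args=(l,))
    p.start()
    p.join()

    print(vars(l))  # the CPU parameters should be unchanged here

if __name__ == '__main__':
    run()

The worker ends up owning its own CUDA copy, so this only helps if you don't need the workers to write gradients back into shared GPU memory.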

Getting this as well, on WSL.

The example posted in the original post seems to produce the correct behavior on WSL2, CUDA 12.2, Python 3.10.12, and PyTorch 2.0.1. However, the code below produces very odd behavior: once the shared-memory module has been used from a spawned process, its parameters read as 0 everywhere, including back in the parent.

import torch
import threading
from torch import multiprocessing as mp

def goo(l):
    print("entering goo")
    print(list(l.parameters()))
    x = l(torch.rand(2, device='cuda'))
    print(x)
    print("leaving goo")

def run():
    ctx = mp.get_context('spawn')

    l = torch.nn.Linear(2, 2).to('cuda').share_memory()

    print("##### regular call #####")
    goo(l)

    print("##### thread call #####")
    thread = threading.Thread(target=goo, args=(l, ))
    thread.start()
    thread.join()

    print("##### process call #####")
    process = ctx.Process(target=goo, args=(l,))
    process.start()
    process.join()

    print("##### thread call #####")
    thread = threading.Thread(target=goo, args=(l, ))
    thread.start()
    thread.join()

    print("##### regular call #####")
    goo(l)

if __name__ == '__main__':
    run()

which outputs:

##### regular call #####
entering goo
[Parameter containing:
tensor([[ 0.2169, -0.5305],
        [ 0.1662, -0.3463]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([-0.4492,  0.0421], device='cuda:0', requires_grad=True)]
tensor([-0.9618, -0.2915], device='cuda:0', grad_fn=<AddBackward0>)
leaving goo
##### thread call #####
entering goo
[Parameter containing:
tensor([[ 0.2169, -0.5305],
        [ 0.1662, -0.3463]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([-0.4492,  0.0421], device='cuda:0', requires_grad=True)]
tensor([-0.7600, -0.1606], device='cuda:0', grad_fn=<AddBackward0>)
leaving goo
##### process call #####
entering goo
[Parameter containing:
tensor([[0., 0.],
        [0., 0.]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True)]
tensor([0., 0.], device='cuda:0', grad_fn=<AddBackward0>)
leaving goo
##### thread call #####
entering goo
[Parameter containing:
tensor([[0., 0.],
        [0., 0.]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True)]
tensor([0., 0.], device='cuda:0', grad_fn=<AddBackward0>)
leaving goo
##### regular call #####
entering goo
[Parameter containing:
tensor([[0., 0.],
        [0., 0.]], device='cuda:0', requires_grad=True), Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True)]
tensor([0., 0.], device='cuda:0', grad_fn=<AddBackward0>)
leaving goo
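
To narrow this down, it might help to take nn.Module out of the picture and share a raw CUDA tensor directly, to see whether CUDA IPC itself is what zeroes the memory. A rough sketch of such a check (untested on the affected setups, names are mine):

import torch
from torch import multiprocessing as mp

def read_tensor(t):
    # What does the shared CUDA tensor look like in the child?
    print("child sees:", t)

def run():
    ctx = mp.get_context('spawn')

    t = torch.rand(2, 2, device='cuda')
    t.share_memory_()  # no-op for CUDA storage; sharing happens via IPC handles
    print("parent before:", t)

    p = ctx.Process(target=read_tensor, args=(t,))
    p.start()
    p.join()

    print("parent after:", t)  # does this also come back as zeros?

if __name__ == '__main__':
    run()

If the raw tensor survives but the Linear module does not, the problem is more likely in how the module's parameters are reduced for pickling than in CUDA IPC itself.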