Issue with CUDA tensors shared between processes

tommak3 · April 7, 2023, 3:42am

Hi,

I’m seeing issues when sharing CUDA tensors between processes when they are created using the “frombuffer” or “from_numpy” interfaces. It seems like some low-level synchronization might be missing somewhere, and some of the returned tensors are all zeros.

This is on Windows, CUDA 11.8, PyTorch 2.0. Has anyone seen similar behavior? Is this a known issue? Is there maybe a workaround?

Attaching minimal repro case.

import torch
import torch.multiprocessing
import numpy as np


class WorkerData:
    """State shared between the parent process and the worker processes.

    The only attribute is ``queue``: a bounded ``torch.multiprocessing.Queue``
    that holds at most 64 pending items.
    """

    def __init__(self):
        # Upper bound on the number of items waiting in the queue.
        max_pending = 64
        self.queue = torch.multiprocessing.Queue(maxsize=max_pending)

def worker_loader_fn(worker_data, index):
    """Push ``(index, is_valid, tensor)`` tuples onto the shared queue forever.

    The first 10 tuples carry ``is_valid=True``; every later tuple carries
    ``False``. The function never returns: it keeps calling
    ``worker_data.queue.put`` in an endless loop, so whoever started the
    worker is responsible for stopping it.

    Exactly one tensor-construction variant should be active at a time; the
    others are kept commented out because they are the cases being compared
    in this repro ("all ok" vs. "some missing" on the receiving side).

    Args:
        worker_data: object exposing a ``queue`` attribute with a ``put``
            method.
        index: identifier of this worker, echoed back in every tuple.
    """
    step = 0
    while True:
        # all ok
        payload = bytearray('XYZ', 'utf-8')
        new_tensor = torch.frombuffer(memoryview(payload), dtype=torch.uint8)

        # some missing
        # payload = bytearray('XYZ', 'utf-8')
        # new_tensor = torch.frombuffer(memoryview(payload), dtype=torch.uint8).cuda()

        # some missing
        # numpy_array = np.array([step, step, step, step, step])
        # new_tensor = torch.from_numpy(numpy_array).cuda()

        # all ok
        # numpy_array = np.array([step, step, step, step, step])
        # new_tensor = torch.from_numpy(numpy_array)

        # all ok
        # new_tensor = torch.full((4,), step).cuda()

        # The comparison already yields a bool; no conditional expression needed.
        worker_data.queue.put((index, step < 10, new_tensor))

        step += 1


def main():
    """Start four workers, print the valid tensors they send, then stop them.

    Each queue item is a ``(worker_index, data_valid, tensor)`` tuple. Items
    with ``data_valid`` true are printed; the first item with ``data_valid``
    false from a worker marks that worker as done. The receive loop ends once
    every worker has been marked done.

    The workers never return on their own (they loop forever), so they are
    terminated and joined here. Otherwise the non-daemon child processes
    would keep the interpreter from exiting.
    """
    num_workers = 4
    worker_data = WorkerData()

    workers = []
    try:
        for i in range(num_workers):
            p = torch.multiprocessing.Process(target=worker_loader_fn, args=(worker_data, i))
            p.start()
            workers.append(p)

        worker_done = torch.full((num_workers,), False)

        while not worker_done.all():
            worker_index, data_valid, tensor = worker_data.queue.get(True)

            if data_valid:
                print(f"Tensor from {worker_index} - {tensor}")
            else:
                worker_done[worker_index] = True
    finally:
        # The queue is not used past this point, so it is acceptable that
        # terminating a worker mid-put may leave the queue unusable.
        for p in workers:
            p.terminate()
        for p in workers:
            p.join()
	
		
		
# Run the repro only when this file is executed as a script. The guard matters
# here because multiprocessing's "spawn" start method imports this module
# again in every child process.
if __name__ == "__main__":
    main()

matt.carlson · April 7, 2023, 4:00am

Can you post the error message? I have seen errors when tensors are sent to the GPU outside of the main training loop.

tommak3 · April 7, 2023, 4:12am

There’s no error. Some of the returned tensors are just zeros.
Output from the first variant:

Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], dtype=torch.uint8)

And from the second variant:

Tensor from 3 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 3 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([0, 0, 0], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 0 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 2 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)
Tensor from 1 - tensor([88, 89, 90], device=‘cuda:0’, dtype=torch.uint8)

The first few tensors returned by each worker are just zeros.