Trying to do a little single-GPU multiprocessing. Testing this code with and without the `.cuda()` calls makes a huge difference in runtime. Is there anything glaringly obvious that I'm missing? Also, would it be appropriate to use the multiprocessing module intermittently in my model? The use case I have in mind is:
- Single Datastream comes in
- Parallel processing of datastream by separate models
- Compare those results to produce a single model output, then return to stage 1.
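For reference, the three-stage loop above could be sketched with the standard library's `multiprocessing` alone. `model_a`, `model_b`, and `step` are hypothetical stand-ins, not real models; on a single GPU the same structure applies, but each worker needs its own CUDA context, which is why the `'spawn'` start method is required there:

```python
# Minimal CPU-only sketch of: datastream in -> parallel models -> compare.
# model_a / model_b are hypothetical placeholders for real models.
import multiprocessing as mp

def model_a(x):
    # hypothetical model 1: square the input
    return x * x

def model_b(x):
    # hypothetical model 2: double the input
    return 2 * x

def step(pool, x):
    # Stage 2: run both models on the same datastream element in parallel.
    pending = [pool.apply_async(m, (x,)) for m in (model_a, model_b)]
    outputs = [r.get() for r in pending]
    # Stage 3: compare the results and keep a single output.
    return max(outputs)

if __name__ == '__main__':
    stream = [1, 2, 3]  # Stage 1: incoming datastream
    with mp.Pool(processes=2) as pool:
        print([step(pool, x) for x in stream])  # -> [2, 4, 9]
```

Whether this is worthwhile depends on how heavy each model call is: for tiny ops, the pickling and process overhead will dominate any parallel speedup.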
```python
import time as tm

import torch
import torch.multiprocessing as _mp

mp = _mp.get_context('spawn')

def squarer(num):
    result = num ** 2

def Qsquarer(nums, scalars, Q):
    for n, s in zip(nums, scalars):
        result = n * s
        Q.put(result)

if __name__ == '__main__':
    # .cuda() returns a *new* GPU tensor, so calling .share_memory_() on the
    # CPU tensor first has no effect; CUDA tensors are shared across 'spawn'
    # processes via IPC handles automatically.
    nums = [torch.randn(4).cuda() for i in range(6)]

    pool = mp.Pool(processes=len(nums))
    t1 = tm.time()
    res = pool.map(squarer, nums)
    print(tm.time() - t1)

    t2 = tm.time()
    procs = []
    for i, n in enumerate(nums):
        proc = mp.Process(target=squarer, args=(n,))
        proc.daemon = True
        procs.append(proc)
        proc.start()
    for p in procs:
        p.join()
    print(tm.time() - t2)

    nums = [torch.randn(4).cuda() for i in range(6)]
    scalars = [torch.randn(1).cuda() for i in range(6)]
    t1 = tm.time()
    Q = mp.Queue()
    proc = mp.Process(target=Qsquarer, args=(nums, scalars, Q))
    proc.start()
    # Drain the queue before joining: joining a process that still has
    # undelivered items on a queue can deadlock.
    results = [Q.get() for _ in range(len(nums))]
    proc.join()
    print(tm.time() - t1)
```
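One thing worth checking before comparing the CPU and GPU runtimes above: CUDA kernels launch asynchronously, so `time.time()` around a GPU op can measure only the launch, not the computation. A minimal sketch of fair timing, calling `torch.cuda.synchronize()` on either side of the timed region (falling back to CPU when no GPU is present):

```python
# Sketch: timing GPU work fairly by synchronizing before reading the clock.
import time
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
x = torch.randn(1000, 1000, device=device)

if device == 'cuda':
    torch.cuda.synchronize()  # make sure setup work has finished
t0 = time.time()
y = x @ x                     # the op being timed
if device == 'cuda':
    torch.cuda.synchronize()  # wait for the kernel to actually complete
elapsed = time.time() - t0
print(f'{device} matmul took {elapsed:.6f}s')
```

Without the second `synchronize()`, the GPU path can look misleadingly fast (or slow, if you accidentally time context initialization on the first CUDA call).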