Sorry to bother you, I recently work with torch.multiprocessing and have some problems,hope you can help me😶🌫️
- If I shared a model in cuda, it raises
RuntimeError: Attempted to send CUDA tensor received from another process; this is not currently supported. Consider cloning before sending.
torch.multiprocessing.manager.queue.get
taken a long time to finish. If the queue just passed a file descriptor, I don’t think it should take 1/3 of the total time, is there any faster way?
Here’s my script
import torch
import torch.multiprocessing as mp
from copy import deepcopy
from functools import partial
from time import *
from torchvision import models
import numpy as np
from tqdm import tqdm
def parallel_produce(
queue: mp.Queue,
model_method,
i
) -> None:
pure_model: torch.nn.Module = model_method()
# if you delete this line, model can be passed
pure_model.to('cuda')
pure_model.share_memory()
while True:
corrupt_model = deepcopy(pure_model)
dic = corrupt_model.state_dict()
dic[list(dic.keys())[0]]*=2
corrupt_model.share_memory()
queue.put(corrupt_model)
def parallel(
valid,
iteration: int = 1000,
process_size: int=2,
buffer_size: int=2
):
pool = mp.Pool(process_size)
manager = mp.Manager()
queue = manager.Queue(buffer_size)
SeedSequence = np.random.SeedSequence()
model_method = partial(models.squeezenet1_1,True)
async_result = pool.map_async(
partial(
parallel_produce,
queue,
model_method,
),
SeedSequence.spawn(process_size),
)
time = 0
for iter_times in tqdm(range(iteration)):
start = monotonic_ns()
# this takes a long time
corrupt_model: torch.nn.Module = queue.get()
time += monotonic_ns() - start
corrupt_model.to("cuda")
corrupt_result = corrupt_model(valid)
del corrupt_model
pool.terminate()
print(time / 1e9)
if __name__ == "__main__":
valid = torch.randn(1,3,224,224).to('cuda')
parallel(valid)
#total time of queue.get taken