import torch.multiprocessing as mp
import torch
def main(local_rank):
p = mp.Process(target=dummy)
p.start()
x = torch.ones(1).cuda()
y = torch.ones(1).cuda()
while True:
x*y
print("main")
p.join()
def dummy():
x = torch.ones(1).cuda()
y = torch.ones(1).cuda()
while True:
print("dummy")
x*y
if __name__ == "__main__":
#main()
mp.spawn(main, nprocs=torch.cuda.device_count())
This code gives the following error
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
which is very weird. How can this small tensor gets out of memory?
I need a subprocess that will not block my main process which uses DDP.
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
dummy
main
...
Sorry. I pasted the wrong example. Should be the following
import torch.multiprocessing as mp
import torch
def main(local_rank):
p = mp.Process(target=dummy)
p.start()
model = torch.nn.Linear(1,1)
model = torch.nn.DistributedDataParallel(model)
x = torch.ones(1).cuda()
y = torch.ones(1).cuda()
while True:
x*model(x)
print("main")
p.join()
def dummy():
x = torch.ones(1).cuda()
y = torch.ones(1).cuda()
while True:
print("dummy")
x*y
if __name__ == "__main__":
#main()
mp.spawn(main, nprocs=torch.cuda.device_count())
The main process is not running. I need to use DDP along with a subprocess processing CUDA tensors.
The behavior also looks different when run with CUDA_VISIBLE_DEVICES=0 … and CUDA_VISIBLE_DEVICES=0,1,2,3 …