import torch

if __name__ == '__main__':
    # Seed CPU and every CUDA device RNG so both iterations are deterministic.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Run twice: the first iteration includes one-off CUDA context / allocator
    # warm-up, so only the second iteration reflects steady-state behaviour.
    for _ in range(2):
        stream = torch.cuda.current_stream()
        x = torch.rand(32, 256, 220, 220).cuda()
        # Cross-device (cuda:0 -> cuda:1) asynchronous copy; swap in the
        # commented line to compare against a GPU -> CPU non-blocking copy.
        # t = (x.min() - x.max()).to(torch.device("cpu"), non_blocking=True)
        t = (x.min() - x.max()).to("cuda:1", non_blocking=True)
        print(stream.query())  # False - checks if all the work submitted has been completed.
        print(t)
        stream.synchronize()  # wait for stream to finish the work
        # time.sleep(2.)
        print(stream.query())  # True - work done
        print(t)
The .to("cuda:1", non_blocking=True) operation creates unexpected cudaStreamWaitEvent events, which force synchronization between the two GPUs.
Do you know why it is different from CPU–GPU copy operations? And is there a way to avoid synchronizing the two GPUs when making copies?
P.S.: I also tried t = (x.min() - x.max()).detach().to("cuda:1", non_blocking=True); it gives the same result.
cudaStreamWaitEvent does not synchronize the whole GPU; it only makes a stream wait for a recorded event to finish.
In your use case, I guess this code is used:
// Cross-device copy synchronization (excerpt). NOTE(review): the enclosing
// function signature and the actual copy call that runs between these two
// guards are not shown here — presumably this is PyTorch's
// copy_device_to_device path; confirm against the full source.
// The copy is issued on the *source* device's current stream.
CUDAStream copy_stream = getCurrentCUDAStream(src_device.index());
if (src_device != dst_device) {
// This is a cross-device copy on the src current stream and dst current
// stream. We perform a two-way barrier between both devices' streams
// before the copy. This ensures that any write-after-write and
// write-after-read dependencies on the destination side are handled, so
// that no one is operating on the dst memory when we perform the copy.
// src waits on dst barrier (src already waits on src)
CUDAEvent dst_ready;
// Record an event on dst's current stream, then make the src-side copy
// stream block until that event fires.
device_guard.set_device(dst_device);
dst_ready.record(getCurrentCUDAStream(dst_device.index()));
device_guard.set_device(src_device);
dst_ready.block(copy_stream);
}
if (src_device != dst_device) {
// dst waits on src barrier (dst already waits on dst). We cannot
// operate on dst's copy until the copy is complete.
// Still on src_device, record stream event
CUDAEvent src_ready;
// Record after the copy was enqueued on copy_stream, then make dst's
// current stream block on it so later dst work sees the copied data.
src_ready.record(copy_stream);
device_guard.set_device(dst_device);
src_ready.block(getCurrentCUDAStream(dst_device.index()));
}
which is used to avoid write-after-write and write-after-read hazards.