How to wait on `non_blocking` copying from GPU to CPU?

Yes, you are right: a `non_blocking` copy from GPU to CPU returns before the data has arrived, so you need to synchronize the current stream before reading the result, e.g. via:

import time

import torch

if __name__ == '__main__':
    # Demonstrates how to wait for a `non_blocking` GPU -> CPU copy: the
    # current CUDA stream is synchronized before the CPU tensor is read.
    seed = 0
    # torch.manual_seed seeds the generators of all devices (CPU and CUDA),
    # so separate torch.cuda.manual_seed / manual_seed_all calls are redundant.
    torch.manual_seed(seed)

    stream = torch.cuda.current_stream()

    x = torch.rand(32, 256, 220, 220).cuda()

    # With non_blocking=True the host may continue before the device-to-host
    # copy has completed, so `t` must not be read until the stream is synced.
    t = (x.min() - x.max()).to(torch.device("cpu"), non_blocking=True)
    print(stream.query())  # usually False - queued work not done yet
    stream.synchronize()  # wait for stream to finish the work
    print(t)

    # The sleep only shows that, once the stream is idle, query() reports
    # True and `t` keeps the same value.
    time.sleep(2.)
    print(stream.query())  # True - work done
    print(t)
1 Like