Yes, you're right — you would need to synchronize the current stream, e.g. via:
import time

import torch

if __name__ == '__main__':
    # Seed all RNGs (CPU + every CUDA device) for reproducibility.
    seed = 0
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    stream = torch.cuda.current_stream()
    x = torch.rand(32, 256, 220, 220).cuda()

    # non_blocking=True makes the device->host copy asynchronous: `t` may not
    # hold valid data until the stream has finished the enqueued work.
    t = (x.min() - x.max()).to(torch.device("cpu"), non_blocking=True)

    # Likely False here (the reduction + copy are still in flight), though this
    # is a race: on a fast device the work may already have completed.
    print(stream.query())
    stream.synchronize()  # block until all work queued on this stream is done
    print(t)  # safe to read: synchronize() guaranteed the copy finished

    # Alternative (fragile) approach: wait "long enough" instead of
    # synchronizing. Shown only for illustration — synchronize() above has
    # already guaranteed completion, so query() must return True now.
    time.sleep(2.)
    print(stream.query())
    print(t)