Got wrong tensor when using dist.send to send tensors

Hi, I am using torch.linalg.qr to produce some tensors and sending them with dist.send.
However, the order of the elements in the received tensor is not the same as in the sent one — why?
Here is the code

import torch
import torch.multiprocessing as mp
import torch.distributed as dist

def main_worker(rank, world_size, args):
    """Per-process entry point for a point-to-point send/recv demo.

    Rank 0 computes a QR factorization and sends Q to rank 1, which
    receives it into a pre-allocated CUDA buffer.

    Args:
        rank: process rank, supplied automatically by ``mp.spawn``.
        world_size: total number of processes in the group.
        args: extra argument forwarded by ``mp.spawn`` (unused here).
    """
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://127.0.0.1:9001",
        world_size=world_size,
        rank=rank,
    )
    print("process begin", rank)

    if rank == 0:
        q_buffer = torch.rand([4, 4]).to(0)
        Q, R = torch.linalg.qr(q_buffer)
        print("local before q_buffer", Q)
        # torch.linalg.qr may return Q as a non-contiguous (transposed) view.
        # dist.send transmits the tensor's raw memory, so sending a
        # non-contiguous tensor makes the elements arrive reordered
        # (transposed) on the receiver. Force a contiguous copy first.
        dist.send(Q.contiguous(), 1)

    elif rank == 1:
        # The receive buffer must match the sender's shape and dtype.
        # recv overwrites its contents, so empty() is sufficient —
        # no need to initialize it with random values.
        q_buffer = torch.empty([4, 4]).to(1)

        dist.recv(q_buffer, 0)

        print("recv", q_buffer, q_buffer.shape)


def main():
    """Spawn the two worker processes for the send/recv demo."""
    world_size = 2
    # mp.spawn prepends the rank; the tuple in `args` supplies the
    # remaining (world_size, args) positional parameters of main_worker.
    mp.spawn(main_worker, nprocs=world_size, args=(world_size, world_size))


if __name__ == "__main__":
    main()

The result is

local before q_buffer tensor([[-0.4076,  0.5155,  0.4838, -0.5780],
        [-0.4182, -0.6807, -0.2695, -0.5378],
        [-0.4326,  0.4825, -0.7545,  0.1039],
        [-0.6869, -0.1954,  0.3522,  0.6049]], device='cuda:0')
recv tensor([[-0.4076, -0.4182, -0.4326, -0.6869],
        [ 0.5155, -0.6807,  0.4825, -0.1954],
        [ 0.4838, -0.2695, -0.7545,  0.3522],
        [-0.5780, -0.5378,  0.1039,  0.6049]], device='cuda:1') torch.Size([4, 4])

The root cause is contiguity, not dtype: torch.linalg.qr can return Q as a non-contiguous (transposed) view, and dist.send transmits the tensor's raw memory, so the receiver sees the elements in transposed order — exactly what the output above shows. Call `Q.contiguous()` before `dist.send`. The receive buffer (q_buffer) must also match Q's shape and dtype.

Please check the following issues.