Underlying tensor data accessed in CUDA does not match the printed result

While writing a class that inherits from torch.autograd.Function, I found that in the backward function the data printed out in Python ([[[1., 1., 1., 1., 1., 1., 1., 1.]]]) and the data fetched by the underlying CUDA code ([[[1., 0., 0., 0., 0., 0., 0., 0.]]]) are not the same, unless I clone output_grad first. I wonder what the problem might be. Thanks.

import torch

import fused_apply_rotary_emb_cuda  # compiled CUDA extension providing the forward/backward kernels


class FusedRotaryEmbeddingFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, cos, sin, position_ids, tensor_index, k_size, rotary_size, base):
        # Save tensors and plain attributes needed by backward.
        ctx.save_for_backward(cos, sin, position_ids)
        ctx.tensor_index = tensor_index
        ctx.k_size = k_size
        ctx.rotary_size = rotary_size
        ctx.base = base
        return fused_apply_rotary_emb_cuda.forward(x, cos, sin, position_ids, tensor_index, k_size, rotary_size, base)

    @staticmethod
    def backward(ctx, output_grad):
        cos, sin, position_ids = ctx.saved_tensors
        tensor_index = ctx.tensor_index
        k_size = ctx.k_size
        rotary_size = ctx.rotary_size
        base = ctx.base

        # incorrect result unless I pass output_grad.clone() instead of output_grad
        x_grad = fused_apply_rotary_emb_cuda.backward(output_grad, cos, sin, position_ids, tensor_index, k_size, rotary_size, base)
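        # Workaround from the description above: passing a cloned, freshly
        # materialized copy of output_grad gives the expected values.
        # x_grad = fused_apply_rotary_emb_cuda.backward(output_grad.clone(), cos, sin, position_ids, tensor_index, k_size, rotary_size, base)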

        return (x_grad, None, None, None, None, None, None, None)
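
I am not sure whether this is what happens to my output_grad, but here is a standalone snippet showing how a tensor's printed values can differ from what is actually stored in memory, and how clone() (the workaround above) materializes a contiguous copy. The tensor g below is only an illustration, not my real gradient:

import torch

# A view built with expand() has a stride of 0 in the expanded dimension,
# so the storage holds a single element even though eight are printed.
g = torch.ones(1, 1, 1).expand(1, 1, 8)
print(g)                   # tensor([[[1., 1., 1., 1., 1., 1., 1., 1.]]])
print(g.is_contiguous())   # False
print(g.stride())          # (1, 1, 0) -- last dimension is not materialized in memory
print(g.clone().stride())  # (8, 8, 1) -- clone() produces a contiguous copy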