Why is the embedding_bag result different between the GPU and CPU implementations?

When `indices` and `offsets` are both 2D, the outputs differ:

CPU:

GPU :

Could you post a minimal, executable code snippet, wrapped in three backticks (```), so we can try to reproduce and debug the issue?

import torch

# Reproduction script: calls the aten embedding_bag operator with 2D `indices`
# and 2D `offsets` and prints the inputs and the result, so the output can be
# compared between a CUDA run and a CPU run.

# Select CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

# Build the inputs on the selected device (the CPU when CUDA is unavailable).
# weight: 5x5 float embedding table.
weight = torch.tensor([[1,1,1,1,1],[2,2,2,2,2],[1,2,3,4,5],[1,2,3,4,5],[1,2,3,4,5]]).to(torch.float).to(device)
# indices: 2x4 int32 tensor of row indices into `weight`.
indices = torch.tensor([[1, 2, 3, 4],[1, 2, 3, 4]]).to(torch.int32).to(device)
# offset: 2x2 integer tensor; unlike `indices`, no explicit dtype cast is applied.
# NOTE(review): the torch.nn.functional.embedding_bag documentation says that
# for a 2D input the offsets argument is ignored and must be None. Passing 2D
# offsets straight to the aten op may be outside its supported contract, which
# would explain backend-dependent results -- TODO confirm.
offset = torch.tensor([[0, 2],[2,3]]).to(device)

# Remaining arguments forwarded unchanged to the operator call below.
scale_grad_by_freq = False
per_sample_weights = None
sparse = False
include_last_offset = False
pad_idx = None
# Integer code for the reduction mode: sum=0, mean=1, max=2
mode = 0

# Print the inputs first so they appear next to the result in the log.
print("==================== mode=sum:")
print("weight \n", weight)
print("indices \n", indices)
print("offset \n", offset)
# Call the aten operator directly through torch.ops instead of going through
# the torch.nn.functional.embedding_bag wrapper.
# NOTE(review): the op presumably returns a tuple of tensors rather than a
# single output tensor, so `res1` would print all of them -- confirm.
res1 = torch.ops.aten.embedding_bag(weight=weight, indices=indices, offsets=offset,
                                    scale_grad_by_freq=scale_grad_by_freq,
                                    mode=mode, sparse=sparse,
                                    per_sample_weights=per_sample_weights,
                                    include_last_offset=include_last_offset,
                                    padding_idx=pad_idx)
print("res1 \n", res1)

Hello, could you help check the code above? Thanks.