How can I check the values of intermediate variables inside an operator?

I encountered a problem. When I checked the result of the GridSample operator, I found that it differs from the output of my Python script under the settings ‘padding=0, mode=0, align_corners=1’.
The relevant PyTorch source code is:

/**
 * Bilinear grid sampling of a quantized (quint8) input on the CPU.
 *
 * For every output location (n, h, w) the sampling coordinate (x, y) is read
 * from `grid`, converted to a source index by
 * grid_sampler_compute_source_index, and the four surrounding input pixels are
 * blended with bilinear weights. A corner that lies outside the input
 * contributes `zero_point * weight` instead of a pixel value. The blended
 * value is rounded with std::round and stored as uint8_t.
 *
 * @param input               Quantized tensor read as (N, C, inp_H, inp_W)
 *                            through data_ptr<quint8>().
 * @param grid                Float tensor; sizes 1 and 2 give the output
 *                            height/width and the last dimension holds (x, y).
 * @param interpolation_mode_ Cast to GridSamplerInterpolation; anything other
 *                            than Bilinear fails the TORCH_CHECK below.
 * @param padding_mode_       Cast to GridSamplerPadding and forwarded to
 *                            grid_sampler_compute_source_index.
 * @param align_corners       Forwarded to grid_sampler_compute_source_index.
 * @return A new quint8 tensor of shape {N, C, out_H, out_W} created with the
 *         input's q_scale() and zero point.
 */
Tensor _grid_sampler_2d_cpu_quantized(
    const Tensor& input,
    const Tensor& grid,
    int64_t interpolation_mode_,
    int64_t padding_mode_,
    bool align_corners) {
  // See NOTE [ grid_sampler Native Functions ].
  // Add checks here in case this is called instead of grid_sampler.
  check_grid_sampler_common(input, grid);
  check_grid_sampler_2d(input, grid);

  auto interpolation_mode =
      static_cast<GridSamplerInterpolation>(interpolation_mode_);
  /* Bilinear interpolation is supported using the fact that we can perform
   * linear interpolations on quantized values without rescaling. */
  // NOTE(review): there is no ';' after TORCH_CHECK(...) here; this only
  // compiles if the macro expands to a complete statement -- confirm.
  TORCH_CHECK(
      interpolation_mode == GridSamplerInterpolation::Bilinear,
      "_grid_sampler_2d_cpu_quantized(): only bilinear interpolation supported")
  auto padding_mode = static_cast<GridSamplerPadding>(padding_mode_);

  // Input extents come from `input`; the output spatial extents come from
  // `grid`.
  int64_t N = input.size(0);
  int64_t C = input.size(1);
  int64_t inp_H = input.size(2);
  int64_t inp_W = input.size(3);
  int64_t out_H = grid.size(1);
  int64_t out_W = grid.size(2);
  // The zero point is narrowed to uint8_t; it stands in for out-of-bounds
  // pixels in the interpolation below.
  uint8_t zero_point = input.q_zero_point();
  // The output reuses the input's scale and zero point, so the blended raw
  // values can be stored without rescaling.
  auto output = at::_empty_affine_quantized(
      {N, C, out_H, out_W},
      at::device(c10::kCPU).dtype(c10::kQUInt8),
      input.q_scale(),
      zero_point);
  // All indexing below goes through these element strides.
  int64_t inp_sN = input.stride(0);
  int64_t inp_sC = input.stride(1);
  int64_t inp_sH = input.stride(2);
  int64_t inp_sW = input.stride(3);
  int64_t grid_sN = grid.stride(0);
  int64_t grid_sH = grid.stride(1);
  int64_t grid_sW = grid.stride(2);
  int64_t grid_sCoor = grid.stride(3);
  int64_t out_sN = output.stride(0);
  int64_t out_sC = output.stride(1);
  int64_t out_sH = output.stride(2);
  int64_t out_sW = output.stride(3);
  // The quint8 storage is accessed as raw uint8_t values
  // (assumes quint8 is layout-compatible with uint8_t -- TODO confirm).
  uint8_t* inp_ptr = (uint8_t*)input.data_ptr<quint8>();
  uint8_t* out_ptr = (uint8_t*)output.data_ptr<quint8>();
  float* grid_ptr = grid.data_ptr<float>();
  // The batch range [0, N) is handed to the lambda in [start, end) pieces;
  // each batch index writes only its own slice of the output.
  at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) {
    for (const auto n : c10::irange(start, end)) {
      float* grid_ptr_N = grid_ptr + n * grid_sN;
      uint8_t* inp_ptr_N = inp_ptr + n * inp_sN;
      for (const auto h : c10::irange(out_H)) {
        for (const auto w : c10::irange(out_W)) {
          // get the corresponding input x, y co-ordinates from grid:
          // x is the first element of the last grid dimension, y the second
          float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
          float x = *grid_ptr_NHW;
          float y = grid_ptr_NHW[grid_sCoor];

          // x is resolved against the input width and y against the height.
          // (The helper presumably maps the grid coordinate to a pixel-space
          // index according to padding_mode/align_corners -- it is not
          // defined in this function; verify against its definition.)
          float ix = grid_sampler_compute_source_index(
              x, inp_W, padding_mode, align_corners);
          float iy = grid_sampler_compute_source_index(
              y, inp_H, padding_mode, align_corners);

          // get corner pixel values from (x, y)
          // for 4d, we use north-east-south-west
          // nw is the floor of (ix, iy); the other three corners are offset
          // from it by one column and/or one row.
          int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
          int64_t iy_nw = static_cast<int64_t>(std::floor(iy));

          int64_t ix_ne = ix_nw + 1;
          int64_t iy_ne = iy_nw;

          int64_t ix_sw = ix_nw;
          int64_t iy_sw = iy_nw + 1;

          int64_t ix_se = ix_nw + 1;
          int64_t iy_se = iy_nw + 1;

          // get surfaces to each neighbor:
          // each corner's weight is the area of the rectangle spanned by the
          // sample point and the diagonally opposite corner.
          float nw = (ix_se - ix) * (iy_se - iy);
          float ne = (ix - ix_sw) * (iy_sw - iy);
          float sw = (ix_ne - ix) * (iy - iy_ne);
          float se = (ix - ix_nw) * (iy - iy_nw);

          // calculate bilinear weighted pixel value and set output pixel
          // The weights do not depend on the channel, so they are reused for
          // every c; the input and output pointers advance one channel stride
          // per iteration.
          uint8_t* inp_ptr_NC = inp_ptr_N;
          uint8_t* out_ptr_NCHW =
              out_ptr + n * out_sN + h * out_sH + w * out_sW;
          for (int64_t c = 0; c < C;
               ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
            // Accumulate in float, in the fixed order nw, ne, sw, se. An
            // out-of-bounds corner contributes zero_point * weight rather
            // than being skipped.
            float res = 0;
            res += within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)
                ? inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw
                : zero_point * nw;
            res += within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)
                ? inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne
                : zero_point * ne;
            res += within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)
                ? inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw
                : zero_point * sw;
            res += within_bounds_2d(iy_se, ix_se, inp_H, inp_W)
                ? inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se
                : zero_point * se;
            // Round to the nearest integer and store; the float result is
            // implicitly converted to uint8_t on assignment.
            *out_ptr_NCHW = std::round(res);
          }
        }
      }
    }
  });
  return output;
}

I reproduced the code above in Python, checking only a single pair of coordinates (-1, -0.6) and the four neighbouring pixel values, which are [0, 1, 4, 5]. The code is below:

import math

def within_bounds_2d(h, w, H, W):
    """Tell whether (h, w) is a valid index into an image of H rows and W columns.

    Args:
        h: Row index to test.
        w: Column index to test.
        H: Number of rows in the image.
        W: Number of columns in the image.

    Returns:
        True if 0 <= h < H and 0 <= w < W, otherwise False.
    """
    # Return the comparison itself so the out-of-bounds case yields an explicit
    # False instead of falling off the end of the function and returning None.
    return 0 <= h < H and 0 <= w < W

# Grid coordinate under test: x is resolved against the image width and
# y against the image height, as in the C++ kernel.
x = -1
y = -0.6

# Geometry and quantization parameters of the case being reproduced.
inp_H = 4
inp_W = 4
align_corners = True
# Set this to the input tensor's q_zero_point(); the kernel uses it for every
# corner that falls outside the image.
zero_point = 0

# Raw (quantized) values of the four neighbouring pixels for the coordinate
# above. They are only valid for this particular (x, y); update them if the
# coordinate changes.
px_nw, px_ne, px_sw, px_se = 0, 1, 4, 5

# Convert the grid coordinate to a pixel-space index.
if align_corners:
    ix = ((x + 1) / 2) * (inp_W - 1)
    iy = ((y + 1) / 2) * (inp_H - 1)
else:
    ix = ((x + 1) * inp_W - 1) / 2
    iy = ((y + 1) * inp_H - 1) / 2

# The north-west corner is the floor of the index; the other three corners are
# one column and/or one row further.
ix_nw = math.floor(ix)
print(ix_nw)
iy_nw = math.floor(iy)
print(iy_nw)

ix_ne = ix_nw + 1
iy_ne = iy_nw

ix_sw = ix_nw
iy_sw = iy_nw + 1

ix_se = ix_nw + 1
iy_se = iy_nw + 1
print("x: ")
print(ix_nw, ix_ne, ix_sw, ix_se)
print("y: ")
print(iy_nw, iy_ne, iy_sw, iy_se)

# Bilinear weights: each corner is weighted by the area of the rectangle
# between the sample point and the diagonally opposite corner.
nw = (ix_se - ix) * (iy_se - iy)
ne = (ix - ix_sw) * (iy_sw - iy)
sw = (ix_ne - ix) * (iy - iy_ne)
se = (ix - ix_nw) * (iy - iy_nw)
print(nw, ne, sw, se)

res = 0

# Mirror the kernel exactly: an out-of-bounds corner is not skipped, it
# contributes zero_point * weight.
if within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W):
    print("nw")
    res += px_nw * nw
else:
    res += zero_point * nw
if within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W):
    print("ne")
    res += px_ne * ne
else:
    res += zero_point * ne
if within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W):
    print("sw")
    res += px_sw * sw
else:
    res += zero_point * sw
if within_bounds_2d(iy_se, ix_se, inp_H, inp_W):
    print("se")
    res += px_se * se
else:
    res += zero_point * se

# Unrounded interpolation result.
print(res)

# The kernel stores std::round(res), which rounds halves away from zero.
# Python's built-in round() rounds halves to even, so use floor(res + 0.5);
# this is equivalent here because res is non-negative for uint8 pixel values.
print(math.floor(res + 0.5))

The PyTorch result and my script's result do not match. Is there a problem with my script? Could someone please check my code? Thanks.