The CUDA kernel produces different results when running in CUDA Graph mode compared to non-CUDA Graph mode

I’m observing different numerical results when running the same quantization kernel in CUDA Graph mode versus normal execution mode.

__global__
void scale_andquant_kernel(
        const half *__restrict__ x,
        half *__restrict__ scale,
        const float input_clip_ratio,
        uint32_t colsSrc,
        uint32_t colsDst,
        Int4Storage *__restrict__ q
)
{
    uint32_t row = threadIdx.y + blockIdx.y * blockDim.y;
    half maxVal = __habs(x[row * colsSrc]);
    for(int tid = threadIdx.x;tid<colsSrc;tid+=blockDim.x)
    {
        half val = __habs(x[tid+row*colsSrc]);
        if (val > maxVal) {
            maxVal = val;
        }
    }
    const size_t lane_id = threadIdx.x % 32;
    const size_t warp_id = threadIdx.x / 32;
    #pragma unroll
    for (int mask = 32 / 2;mask > 0;mask /= 2){
        maxVal = fmaxf(maxVal, __shfl_xor_sync(0xFFFFFFFF, maxVal, mask));
    }
    __shared__ float smem_max[32];
    if (lane_id == 0){
        smem_max[threadIdx.x/32]=__half2float(maxVal);
    }
    __syncthreads();
    // __shared__ half scale_row;
    if (warp_id==0){
        float value = smem_max[threadIdx.x];
        #pragma unroll
        for (int mask = 32 / 2;mask > 0;mask /= 2){
        value = fmaxf(value, __shfl_xor_sync(0xFFFFFFFF, value, mask));
    }
        if(lane_id==0){
        //    scale_row=__hmul(__hdiv(__float2half(value), __float2half(7.0f)), __float2half(input_clip_ratio));
        //    scale[row]=scale_row;
           scale[row]=__hmul(__hdiv(__float2half(value), __float2half(7.0f)), __float2half(input_clip_ratio));;
        }
    }
    __syncthreads();

    for(int tid = threadIdx.x;tid<colsDst;tid+=blockDim.x)
    {
    Int4Storage storage;
    memset(&storage, 0, sizeof(storage));
    uint32_t id = tid * kElementsPerVector + row * colsSrc;
    #pragma unroll
    for (int i = 0; i < kElementsPerVector; ++i)
    {
        bool safe = (tid * kElementsPerVector + i) < colsSrc;
        if (safe)
        {
            half data = __hdiv(x[id + i], scale[row]);

            int qval = clamp(__half2int_rn(data), qmin, qmax);
            Int4Subbyte{reinterpret_cast<cutlass::int4b_t *>(&storage), i}.set(
                    qval);
        }
    }
    q[tid + row * colsDst] = storage;
    }
}
void sym_quantfuse_host(
        const half *x,
        half *__restrict__ scale,
        const float input_clip_ratio,
        uint32_t rows,
        uint32_t colsSrc,
        uint32_t colsDst,
        Int4Storage *q
)
{

    dim3 block{1024, std::min<uint32_t>(rows, 1)};
    dim3 grid{1, cdiv(rows, block.y)};
    scale_andquant_kernel<<<grid, block>>>(x,scale,input_clip_ratio, colsSrc, colsDst, q);
}

"The CUDA Graph capture code is as follows:

import torch
import torch.nn as nn
import time
class ModelRunner():
    def __init__(self, model):
        self.model = model
        self.graph_runners = {}  # (int, CUDAGraphRunner)

    @torch.inference_mode()
    def capture_model(self):
        for batch in [1, 2, 3, 4]: 
            input = torch.randn((batch, N),dtype=torch.half,device="cuda").cuda()
            graph_runner = CUDAGraphRunner(self.model)
            graph_runner.capture(input)
            self.graph_runners[batch] = graph_runner
    
    @torch.inference_mode()
    def execute_model(self, x):
        batch = x.size(0)
        if batch in self.graph_runners:
            model_executable = self.graph_runners[batch] 
        else:
            print(f"warning, no cudagraph_runner, back to origin model")
            model_executable = self.model 
        return model_executable(x)


class CUDAGraphRunner():
    def __init__(self, model):
        self.model = model
        self.cuda_graph = None
        self.graph_input = None
        self.graph_output = None
    
    def capture(self, x):
        assert self.cuda_graph is None

        self.cuda_graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(self.cuda_graph):
            out = self.model(x)
        torch.cuda.synchronize()

        self.graph_input = x 
        self.graph_output = out 
        
    def forward(self, x):
        self.graph_input.copy_(x)
        self.cuda_graph.replay()
        return self.graph_output
    
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

class model_w4a4(nn.Module):
    def __init__(self,m,n,k):
        super().__init__()
        self.weight = torch.randint(1,7,(k,n//2),dtype=torch.uint8,device="cuda")
        self.weight_scale = torch.rand((k,1),dtype=torch.half,device="cuda")
        self.scale = torch.randn((m,1),dtype=torch.half, device="cuda")
        self.q = torch.randint(1,7,(m,n//2),dtype=torch.uint8,device="cuda")
        print(self.q)
    def forward(self,input):
        q,scale = sym_quant_fuse(input)
        # c =matmul(self.q,self.weight)
        #x =sym_dequant(c,self.scale,self.weight_scale)
        return scale
M = 1
N = 3584
K = 3584+512*2
# model = nn.Linear(N,K).cuda() 
# model = model.to(torch.half)
model = model_w4a4(M,N,K)
model.eval()
input = torch.randn((M,N),dtype=torch.half,device="cuda")
output_ref = model(input)
scale_gt = (torch.max(torch.abs(input), dim=-1)[0].unsqueeze(1)/7).to(torch.float16)
print("scalegt:",scale_gt)
model_runner = ModelRunner(model)
model_runner.capture_model() 
time.sleep(5)
output = model_runner.execute_model(input) 
print(output_ref)
print(output)

The ‘sym_quant_fuse’ serves as the Python interface to the C function.how to identify what issues in the aforementioned CUDA kernel might cause computational inconsistencies in CUDA Graph mode?