I’m observing different numerical results when running the same quantization kernel in CUDA Graph mode versus normal execution mode.
__global__
void scale_andquant_kernel(
const half *__restrict__ x,
half *__restrict__ scale,
const float input_clip_ratio,
uint32_t colsSrc,
uint32_t colsDst,
Int4Storage *__restrict__ q
)
{
uint32_t row = threadIdx.y + blockIdx.y * blockDim.y;
half maxVal = __habs(x[row * colsSrc]);
for(int tid = threadIdx.x;tid<colsSrc;tid+=blockDim.x)
{
half val = __habs(x[tid+row*colsSrc]);
if (val > maxVal) {
maxVal = val;
}
}
const size_t lane_id = threadIdx.x % 32;
const size_t warp_id = threadIdx.x / 32;
#pragma unroll
for (int mask = 32 / 2;mask > 0;mask /= 2){
maxVal = fmaxf(maxVal, __shfl_xor_sync(0xFFFFFFFF, maxVal, mask));
}
__shared__ float smem_max[32];
if (lane_id == 0){
smem_max[threadIdx.x/32]=__half2float(maxVal);
}
__syncthreads();
// __shared__ half scale_row;
if (warp_id==0){
float value = smem_max[threadIdx.x];
#pragma unroll
for (int mask = 32 / 2;mask > 0;mask /= 2){
value = fmaxf(value, __shfl_xor_sync(0xFFFFFFFF, value, mask));
}
if(lane_id==0){
// scale_row=__hmul(__hdiv(__float2half(value), __float2half(7.0f)), __float2half(input_clip_ratio));
// scale[row]=scale_row;
scale[row]=__hmul(__hdiv(__float2half(value), __float2half(7.0f)), __float2half(input_clip_ratio));;
}
}
__syncthreads();
for(int tid = threadIdx.x;tid<colsDst;tid+=blockDim.x)
{
Int4Storage storage;
memset(&storage, 0, sizeof(storage));
uint32_t id = tid * kElementsPerVector + row * colsSrc;
#pragma unroll
for (int i = 0; i < kElementsPerVector; ++i)
{
bool safe = (tid * kElementsPerVector + i) < colsSrc;
if (safe)
{
half data = __hdiv(x[id + i], scale[row]);
int qval = clamp(__half2int_rn(data), qmin, qmax);
Int4Subbyte{reinterpret_cast<cutlass::int4b_t *>(&storage), i}.set(
qval);
}
}
q[tid + row * colsDst] = storage;
}
}
void sym_quantfuse_host(
const half *x,
half *__restrict__ scale,
const float input_clip_ratio,
uint32_t rows,
uint32_t colsSrc,
uint32_t colsDst,
Int4Storage *q
)
{
dim3 block{1024, std::min<uint32_t>(rows, 1)};
dim3 grid{1, cdiv(rows, block.y)};
scale_andquant_kernel<<<grid, block>>>(x,scale,input_clip_ratio, colsSrc, colsDst, q);
}
"The CUDA Graph capture code is as follows:
import torch
import torch.nn as nn
import time
class ModelRunner():
def __init__(self, model):
self.model = model
self.graph_runners = {} # (int, CUDAGraphRunner)
@torch.inference_mode()
def capture_model(self):
for batch in [1, 2, 3, 4]:
input = torch.randn((batch, N),dtype=torch.half,device="cuda").cuda()
graph_runner = CUDAGraphRunner(self.model)
graph_runner.capture(input)
self.graph_runners[batch] = graph_runner
@torch.inference_mode()
def execute_model(self, x):
batch = x.size(0)
if batch in self.graph_runners:
model_executable = self.graph_runners[batch]
else:
print(f"warning, no cudagraph_runner, back to origin model")
model_executable = self.model
return model_executable(x)
class CUDAGraphRunner():
def __init__(self, model):
self.model = model
self.cuda_graph = None
self.graph_input = None
self.graph_output = None
def capture(self, x):
assert self.cuda_graph is None
self.cuda_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(self.cuda_graph):
out = self.model(x)
torch.cuda.synchronize()
self.graph_input = x
self.graph_output = out
def forward(self, x):
self.graph_input.copy_(x)
self.cuda_graph.replay()
return self.graph_output
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)
class model_w4a4(nn.Module):
def __init__(self,m,n,k):
super().__init__()
self.weight = torch.randint(1,7,(k,n//2),dtype=torch.uint8,device="cuda")
self.weight_scale = torch.rand((k,1),dtype=torch.half,device="cuda")
self.scale = torch.randn((m,1),dtype=torch.half, device="cuda")
self.q = torch.randint(1,7,(m,n//2),dtype=torch.uint8,device="cuda")
print(self.q)
def forward(self,input):
q,scale = sym_quant_fuse(input)
# c =matmul(self.q,self.weight)
#x =sym_dequant(c,self.scale,self.weight_scale)
return scale
M = 1
N = 3584
K = 3584+512*2
# model = nn.Linear(N,K).cuda()
# model = model.to(torch.half)
model = model_w4a4(M,N,K)
model.eval()
input = torch.randn((M,N),dtype=torch.half,device="cuda")
output_ref = model(input)
scale_gt = (torch.max(torch.abs(input), dim=-1)[0].unsqueeze(1)/7).to(torch.float16)
print("scalegt:",scale_gt)
model_runner = ModelRunner(model)
model_runner.capture_model()
time.sleep(5)
output = model_runner.execute_model(input)
print(output_ref)
print(output)
The ‘sym_quant_fuse’ serves as the Python interface to the C function.how to identify what issues in the aforementioned CUDA kernel might cause computational inconsistencies in CUDA Graph mode?