I’m currently working in Rust with the tch crate, which is a wrapper over PyTorch (libtorch). I want to multiply two 4096 x 4096 matrices. While benchmarking I found that on CPU the int8 multiplication takes far longer than float32, although ideally int8 should run faster. The CPU supports the AVX-512 and AMX instruction sets. Digging deeper, I found that the float32 multiplication utilizes all available physical cores, while the int8 one runs on only a single core.
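A minimal sketch of how the thread utilization can be double-checked from Rust, assuming tch exposes ATen's thread controls as tch::utils::get_num_threads / set_num_threads (names may vary across tch versions):

use tch::utils::{get_num_threads, set_num_threads};

fn main() {
    // Intra-op thread pool that kernels such as matmul draw from.
    println!("intra-op threads: {}", get_num_threads());

    // Pin to the 24 physical cores explicitly; this has no visible effect
    // if the kernel itself never parallelizes, which is what the int8
    // path appears to do here.
    set_num_threads(24);
    println!("intra-op threads now: {}", get_num_threads());
}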
Machine configuration:
Libtorch version: 2.5.1
24 cores, 96 GB RAM (INTEL(R) XEON(R) PLATINUM 8581C CPU @ 2.30GHz)
CPU flags:
fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 wbnoinvd arat avx512vbmi umip avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm md_clear serialize tsxldtrk amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities
Environment variable config
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/libtorch/lib
export LIBTORCH=/usr/local/libtorch
export LIBTORCH_BYPASS_VERSION_CHECK=true
export LD_PRELOAD=/opt/intel/oneapi/compiler/2025.0/lib/libiomp5.so:$LD_PRELOAD
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so:$LD_PRELOAD
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1
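Since a misspelled export silently no-ops, a quick (hypothetical) sanity check from Rust that these variables actually reach the process; OMP_NUM_THREADS is not set above and is listed only as another variable worth inspecting:

use std::env;

fn main() {
    // Print the threading-related environment as the process sees it.
    for key in ["KMP_AFFINITY", "KMP_BLOCKTIME", "OMP_NUM_THREADS", "LD_PRELOAD"] {
        println!("{key}={}", env::var(key).unwrap_or_else(|_| "<unset>".into()));
    }
}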
Code
use std::time::Instant;
use tch::{Kind, Tensor};

fn benchmark_matrix_multiplication(dtype: Kind, size: (i64, i64)) -> f64 {
    // Generate two random matrices of the requested dtype
    let (rows, cols) = size;
    let a = match dtype {
        Kind::Int8 => Tensor::randint(10, &[rows, cols], (dtype, tch::Device::Cpu)),
        Kind::Float => Tensor::randn(&[rows, cols], (dtype, tch::Device::Cpu)),
        _ => panic!("Unsupported data type"),
    };
    let b = match dtype {
        Kind::Int8 => Tensor::randint(10, &[cols, rows], (dtype, tch::Device::Cpu)),
        Kind::Float => Tensor::randn(&[cols, rows], (dtype, tch::Device::Cpu)),
        _ => panic!("Unsupported data type"),
    };

    // Warm-up loop
    for _ in 0..5 {
        let _ = a.matmul(&b);
    }

    // Benchmark a single multiplication
    let start = Instant::now();
    let _ = a.matmul(&b);
    let duration = start.elapsed();
    duration.as_secs_f64()
}

fn main() {
    let size = (4096, 4096);

    // Benchmark int8 matrix multiplication
    let int8_time = benchmark_matrix_multiplication(Kind::Int8, size);
    println!(
        "Time taken for 4096x4096 int8 matrix multiplication: {:.6} seconds",
        int8_time
    );

    // Benchmark float32 matrix multiplication
    let float_time = benchmark_matrix_multiplication(Kind::Float, size);
    println!(
        "Time taken for 4096x4096 float32 matrix multiplication: {:.6} seconds",
        float_time
    );
}
Output
Time taken for 4096x4096 int8 matrix multiplication: 2.225644 seconds
Time taken for 4096x4096 float32 matrix multiplication: 0.033788 seconds
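The timed section above measures a single multiplication, so the figures are somewhat noisy; a variant that averages several timed iterations (a sketch reusing the same tch API, with a hypothetical helper name) would look like:

use std::time::Instant;
use tch::Tensor;

// Hypothetical helper: average the wall time of `iters` multiplications
// after warm-up, instead of timing one call.
fn time_matmul(a: &Tensor, b: &Tensor, iters: u32) -> f64 {
    let start = Instant::now();
    for _ in 0..iters {
        let _ = a.matmul(b);
    }
    start.elapsed().as_secs_f64() / f64::from(iters)
}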