A basic matmul scenario …
I get a segmentation fault in torch.matmul() when multiplying a 50K x 50K float16 matrix by a vector of length 50K. My machine has plenty of free memory (75 GB total, 66 GB currently free), and the matrix is only about 5 GB (50,000 x 50,000 x 2 bytes ≈ 4.7 GiB). The tensors are purely on the CPU (not GPU). Running with OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 made no difference. I couldn't get much information out of gdb (output from a run under gdb is below):
Thread 1 "python" received signal SIGSEGV, Segmentation fault. 0x00007fffdf074198 in at::native::AVX2::fp16_dot_with_fp32_arith(c10::Half const*, c10::Half const*, long) () from /home/cloud-user/BM/lib64/python3.12/site-packages/torch/lib/libtorch_cpu.so Missing separate debuginfos, use: dnf debuginfo-install bzip2-libs-1.0.8-8.el9.x86_64 expat-2.5.0-2.el9_4.x86_64 glibc-2.34-100.el9.x86_64 libffi-3.4.2-8.el9.x86_64And the program itself is copied below … works fine until N=10000 and then segmentation fault in torch.matmul() for N = 50000
import torch
import time
import faulthandler
import os
import sys
import matplotlib.pyplot as plt
def matrix_vector_operations(N_values):
    results = {}
    for N in N_values:
        # Initialize tensors
        try:
            A = torch.rand(N, N, dtype=torch.float16, device="cpu")
            print(" Allocated matrix A for size ", N)
        except Exception as e:
            print("Failed to allocate tensor A")
            print(f"Error: {e}")
            sys.exit(0)
        try:
            X = torch.rand(N, dtype=torch.float16, device="cpu")
        except Exception as e:
            print("Failed to allocate tensor X")
            print(f"Error: {e}")
            sys.exit(0)
        try:
            Y = torch.rand(N, dtype=torch.float16, device="cpu")
        except Exception as e:
            print("Failed to allocate tensor Y")
            print(f"Error: {e}")
            sys.exit(0)

        # Measure execution time
        print(" Starting matmul for size ", N)
        start_time = time.time()
        try:
            B = torch.matmul(A, X) + Y  # Perform matrix-vector multiplication and addition
        except Exception as e:
            print("Failed torch.matmul()")
            print(f"Error: {e}")
            sys.exit(0)
        end_time = time.time()
        print(" Done matmul for size ", N)

        del A
        del X
        del Y
        del B

        execution_time = end_time - start_time
        results[N] = execution_time
        print(f"N={N}, Execution Time: {execution_time:.6f} seconds")
    return results
def plot_results(results):
    plt.figure(figsize=(10, 5))
    plt.plot(list(results.keys()), list(results.values()), marker='o', linestyle='-')
    plt.xlabel('Vector Size (N)')
    plt.ylabel('Execution Time (seconds)')
    plt.title('Execution Time vs Vector Size')
    plt.grid(True)
    plt.show()
if __name__ == "__main__":
    N_values = [100, 500, 1000, 5000, 10000, 50000]  # Define different N values
    os.environ["OMP_NUM_THREADS"] = "1"  # Reduce OpenMP threads to 1
    os.environ["MKL_NUM_THREADS"] = "1"  # Reduce MKL threads (used by NumPy and PyTorch)
    torch.set_num_threads(1)  # Force PyTorch to use a single thread
    faulthandler.enable()  # Enables fault logging
    results = matrix_vector_operations(N_values)
    plot_results(results)
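In case it helps triage, here is a stripped-down sketch of just the failing call. My assumption (not verified beyond the run above) is that the timing and plotting code is irrelevant and a single fp16 matrix-vector product at N = 50000 is enough to hit the same fp16_dot_with_fp32_arith path:

import torch

torch.set_num_threads(1)  # same single-thread setting as in the full program

N = 50_000
# ~4.7 GiB for A (N * N * 2 bytes), well under the free memory on this machine
A = torch.rand(N, N, dtype=torch.float16, device="cpu")
X = torch.rand(N, dtype=torch.float16, device="cpu")

B = torch.matmul(A, X)  # on my machine this is where the SIGSEGV occurs
print(B.shape)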