Segmentation fault in 50K x 50K torch.matmul (fixed)

A basic matmul scenario …

I get a segmentation fault in torch.matmul() when multiplying a 50K x 50K tensor by a vector of length 50K, both in float16. My machine has plenty of free memory (75G total, 66G currently free) and the matrix is only about 5G. The tensors live purely on the CPU (no GPU involved). I tried running with OMP_NUM_THREADS=1 MKL_NUM_THREADS=1, with no difference. I couldn't get much information out of gdb (output from a run under gdb below).

Thread 1 "python" received signal SIGSEGV, Segmentation fault. 0x00007fffdf074198 in at::native::AVX2::fp16_dot_with_fp32_arith(c10::Half const*, c10::Half const*, long) () from /home/cloud-user/BM/lib64/python3.12/site-packages/torch/lib/libtorch_cpu.so Missing separate debuginfos, use: dnf debuginfo-install bzip2-libs-1.0.8-8.el9.x86_64 expat-2.5.0-2.el9_4.x86_64 glibc-2.34-100.el9.x86_64 libffi-3.4.2-8.el9.x86_64

And the program itself is copied below. It works fine up to N=10000, then hits the segmentation fault in torch.matmul() at N=50000.

import torch
import time
import faulthandler
import os
import sys

import matplotlib.pyplot as plt

def matrix_vector_operations(N_values):
    results = {}
    for N in N_values:
        # Initialize tensors
        try:
            A = torch.rand(N, N, dtype=torch.float16, device="cpu")
            print(" Allocated matrix A for size ", N)
        except Exception as e:
            print("Failed to allocate tensor A")
            print(f"Error: {e}")
            sys.exit(0)

        try:
            X = torch.rand(N, dtype=torch.float16, device="cpu")
        except Exception as e:
            print("Failed to allocate tensor X")
            print(f"Error: {e}")
            sys.exit(0)

        try:
            Y = torch.rand(N, dtype=torch.float16, device="cpu")
        except Exception as e:
            print("Failed to allocate tensor Y")
            print(f"Error: {e}")
            sys.exit(0)

        # Measure execution time
        print(" Starting matmul for size  ", N)
        start_time = time.time()
        try:
            B = torch.matmul(A, X) + Y  # Perform matrix-vector multiplication and addition
        except Exception as e:
            print("Failed torch.matmul()")
            print(f"Error: {e}")
            sys.exit(0)
        end_time = time.time()
        print(" Done matmul for size  ", N)
        del A
        del X
        del Y
        del B

        execution_time = end_time - start_time
        results[N] = execution_time
        print(f"N={N}, Execution Time: {execution_time:.6f} seconds")

    return results

def plot_results(results):
    plt.figure(figsize=(10, 5))
    plt.plot(list(results.keys()), list(results.values()), marker='o', linestyle='-')
    plt.xlabel('Vector Size (N)')
    plt.ylabel('Execution Time (seconds)')
    plt.title('Execution Time vs Vector Size')
    plt.grid(True)
    plt.show()

if __name__ == "__main__":
    N_values = [100, 500, 1000, 5000, 10000, 50000]  # Define different N values
    os.environ["OMP_NUM_THREADS"] = "1"  # Reduce OpenMP threads to 1
    os.environ["MKL_NUM_THREADS"] = "1"  # Reduce MKL threads (used by NumPy and PyTorch)
    torch.set_num_threads(1)  # Force PyTorch to use a single thread

    faulthandler.enable()  # Enables fault logging
    results = matrix_vector_operations(N_values)

    plot_results(results)

Fixed .. beginner's error .. the dynamic range of float16 is going to overflow on a 50K matmul (float16 tops out at 65504).
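For anyone hitting the same thing: float16's largest finite value is 65504, and anything beyond it saturates to inf. A minimal sketch of one straightforward fix (reusing the A, X, Y tensors from the program above): keep storage in float16 but do the arithmetic in float32.

    # float16 saturates at 65504; larger magnitudes become inf
    print(torch.finfo(torch.float16).max)              # 65504.0
    print(torch.tensor(70000.0, dtype=torch.float16))  # tensor(inf, dtype=torch.float16)

    # Upcast for the computation, downcast the result if float16 storage is needed
    B = (torch.matmul(A.float(), X.float()) + Y.float()).to(torch.float16)

Note the float32 copy of A is ~10G, so this roughly triples the peak footprint for the matrix, still comfortable with 66G free.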