# Why is bfloat16 matmul significantly slower than float32?

I am trying to figure out whether I should use bfloat16 or float32 for CPU training and inference.

Here is the code I use to test:

```python
import torch
import time

tensor_size = (1000, 1000)
num_iterations = 100

def perform_operations(data):
    """Time five tensor operations on *data* and return the elapsed seconds.

    Args:
        data: a 2-D torch.Tensor (any floating dtype).

    Returns:
        A 5-tuple of floats: elapsed seconds for (square, sum, mean,
        transpose, matmul), in that order.

    Uses time.perf_counter() — the monotonic, high-resolution clock
    recommended for benchmarking — instead of time.time(), which follows
    the wall clock and can jump (NTP adjustments, DST).

    NOTE(review): data.t() returns a zero-copy view, so the matmul below
    multiplies `data` by a non-contiguous view of itself; the layout may
    itself influence matmul timing.
    """
    start = time.perf_counter()
    torch.square(data)  # result intentionally discarded; only timing matters
    squared_time = time.perf_counter() - start

    start = time.perf_counter()
    torch.sum(data)
    sum_time = time.perf_counter() - start

    start = time.perf_counter()
    torch.mean(data)
    mean_time = time.perf_counter() - start

    start = time.perf_counter()
    transpose = data.t()
    transpose_time = time.perf_counter() - start

    start = time.perf_counter()
    torch.matmul(data, transpose)
    matmul_time = time.perf_counter() - start

    return squared_time, sum_time, mean_time, transpose_time, matmul_time

# Accumulators for the five per-op timings, in the order returned by
# perform_operations(): square, sum, mean, transpose, matmul.
bfloat16_times = [0.0] * 5
float32_times = [0.0] * 5

for _ in range(num_iterations):
    # Fresh random input each iteration; the bfloat16 tensor is converted
    # from the same float32 data so both dtypes see identical values.
    data = torch.randn(tensor_size, dtype=torch.float32)
    data_bfloat16 = data.to(torch.bfloat16)

    for j, elapsed in enumerate(perform_operations(data_bfloat16)):
        bfloat16_times[j] += elapsed

    for j, elapsed in enumerate(perform_operations(data)):
        float32_times[j] += elapsed

# Average over all iterations. (Loop variable renamed from `time` to avoid
# shadowing the `time` module inside the comprehension.)
bfloat16_times = [total / num_iterations for total in bfloat16_times]
float32_times = [total / num_iterations for total in float32_times]

# Report — one loop per dtype instead of ten copy-pasted print lines;
# output text is identical to the original.
op_names = ("Squared", "Sum", "Mean", "Transpose", "Matmul")

print("Average time for bfloat16:")
for name, avg in zip(op_names, bfloat16_times):
    print("{}: {:.6f} seconds".format(name, avg))
print()

print("Average time for float32:")
for name, avg in zip(op_names, float32_times):
    print("{}: {:.6f} seconds".format(name, avg))
```

And I got the result:

Average time for bfloat16:
Squared: 0.000355 seconds
Sum: 0.000183 seconds
Mean: 0.000237 seconds
Transpose: 0.000027 seconds
Matmul: 0.299079 seconds

Average time for float32:
Squared: 0.000585 seconds
Sum: 0.000239 seconds
Mean: 0.000123 seconds
Transpose: 0.000010 seconds
Matmul: 0.005496 seconds

I wonder why bfloat16 is faster than float32 for all the other operations except torch.matmul().
The code runs on an Intel 9th-gen Core i7 CPU.