Interestingly, running the same benchmark on different architectures gives me mixed results. On my Tesla T4, I still see x.T coming out slightly ahead:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)    size_mb  T/t() ratio
0          (5, 5)         0.0013           0.0014     0.0001       0.9411
1        (55, 55)         0.0013           0.0014     0.0115       0.9168
2      (555, 555)         0.0013           0.0014     1.1750       0.9211
3    (5555, 5555)         0.0013           0.0014   117.7140       0.9319
4  (25555, 25555)         0.0013           0.0014  2491.2187       0.9386
But on an A100, results are similar to yours:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)    size_mb  T/t() ratio
0          (5, 5)         0.0008           0.0007     0.0001       1.0309
1        (55, 55)         0.0007           0.0007     0.0115       1.0317
2      (555, 555)         0.0008           0.0007     1.1750       1.0375
3    (5555, 5555)         0.0007           0.0007   117.7140       1.0349
4  (25555, 25555)         0.0007           0.0007  2491.2187       1.0163
Anyway, I then tried the recommended torch.cuda.Event for timing and finally got results that support the same conclusion on both architectures: x.T is indeed (slightly) faster than x.t(), and the difference is even less significant on the newer architecture. Note that torch.cuda.Event.elapsed_time() returns milliseconds, so the per-call numbers below are in microseconds; reassuringly, they agree with the ~0.0013 ms per call measured above.
# T4:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (µs)  tensor.t() (µs)    size_mb  T/t() ratio
0          (5, 5)         1.2723           1.3940     0.0001       0.9127
1        (55, 55)         1.2873           1.3875     0.0115       0.9278
2      (555, 555)         1.2917           1.3887     1.1750       0.9302
3    (5555, 5555)         1.2803           1.3813   117.7140       0.9269
4  (25555, 25555)         1.1278           1.3752  2491.2187       0.8201
# A100:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (µs)  tensor.t() (µs)    size_mb  T/t() ratio
0          (5, 5)         0.6749           0.6923     0.0001       0.9748
1        (55, 55)         0.6860           0.6945     0.0115       0.9877
2      (555, 555)         0.6919           0.6969     1.1750       0.9929
3    (5555, 5555)         0.6956           0.6996   117.7140       0.9943
4  (25555, 25555)         0.6814           0.6970  2491.2187       0.9776
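
A side note on why every variant lands around a microsecond per call, essentially flat across tensor sizes: both x.T and x.t() just return a view over the same storage, so there is no kernel launch and no copy; what we are really measuring is Python/dispatcher overhead. A quick sanity check:

import torch

x = torch.randn(3, 4, device="cuda")
# Both .T and .t() return views sharing x's storage: no data movement at all,
# which is why the timings above barely change with tensor size.
assert x.T.data_ptr() == x.t().data_ptr() == x.data_ptr()
assert torch.equal(x.T, x.t())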
Full code:
import pandas as pd
import torch


def benchmark_transpose(dims_list, num_iterations=100000):
    results = []
    for dims in dims_list:
        x = torch.randn(*dims, device="cuda")

        # Warmup
        for _ in range(100):
            _ = x.T
            _ = x.t()

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Benchmark x.T
        start_event.record()
        for _ in range(num_iterations):
            _ = x.T
        end_event.record()
        torch.cuda.synchronize()
        # elapsed_time() returns milliseconds, so this is ms per call
        t_property_time = start_event.elapsed_time(end_event) / num_iterations

        # Benchmark x.t()
        start_event.record()
        for _ in range(num_iterations):
            _ = x.t()
        end_event.record()
        torch.cuda.synchronize()
        t_method_time = start_event.elapsed_time(end_event) / num_iterations

        results.append(
            {
                "dimensions": dims,
                "tensor.T (µs)": t_property_time * 1000,  # ms → µs per call
                "tensor.t() (µs)": t_method_time * 1000,
                "size": x.nelement() * x.element_size(),  # bytes
            }
        )
    return pd.DataFrame(results)


if __name__ == "__main__":
    assert torch.cuda.is_available()
    dims_to_test = [
        (5, 5),
        (55, 55),
        (555, 555),
        (5555, 5555),
        (25555, 25555),
    ]
    results_df = benchmark_transpose(dims_to_test)

    # Format and display results
    pd.set_option("display.float_format", lambda x: "{:.4f}".format(x))
    results_df["size_mb"] = results_df["size"] / (1024 * 1024)
    results_df["T/t() ratio"] = (
        results_df["tensor.T (µs)"] / results_df["tensor.t() (µs)"]
    )
    print("\nTranspose Operations Benchmark Results:")
    print("======================================")
    print(
        results_df[
            ["dimensions", "tensor.T (µs)", "tensor.t() (µs)", "size_mb", "T/t() ratio"]
        ]
    )
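
For reference, the earlier (pre-Event) tables came from the same script with ordinary wall-clock timing. This is a minimal sketch of that approach rather than the exact code; time.perf_counter and the time_op helper are stand-ins:

import time

import torch

def time_op(op, x, num_iterations=100000):
    # Hypothetical wall-clock timing helper, shown for comparison with the
    # torch.cuda.Event version above.
    torch.cuda.synchronize()  # drain any pending GPU work before starting the clock
    start = time.perf_counter()
    for _ in range(num_iterations):
        _ = op(x)
    torch.cuda.synchronize()  # .T/.t() launch no kernels, but keep the timing fair
    return (time.perf_counter() - start) / num_iterations * 1000  # ms per call

x = torch.randn(5555, 5555, device="cuda")
print(f"x.T:   {time_op(lambda t: t.T, x):.4f} ms")
print(f"x.t(): {time_op(lambda t: t.t(), x):.4f} ms")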
Edit: typo slower → faster @ptrblck