Difference between tensor.t() and tensor.T?

Interestingly, the outcome changes when I run the same benchmark on different architectures. On my Tesla T4, I still get similar results:

Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.0013           0.0014    0.0001       0.9411
1        (55, 55)         0.0013           0.0014    0.0115       0.9168
2      (555, 555)         0.0013           0.0014    1.1750       0.9211
3    (5555, 5555)         0.0013           0.0014  117.7140       0.9319
4  (25555, 25555)         0.0013           0.0014 2491.2187       0.9386

But on an A100, results are similar to yours:

Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.0008           0.0007    0.0001       1.0309
1        (55, 55)         0.0007           0.0007    0.0115       1.0317
2      (555, 555)         0.0008           0.0007    1.1750       1.0375
3    (5555, 5555)         0.0007           0.0007  117.7140       1.0349
4  (25555, 25555)         0.0007           0.0007 2491.2187       1.0163

Anyway, I then tried the recommended torch.cuda.Event for timing and finally got results that lead to the same conclusion on both architectures: x.T is indeed (slightly) faster than x.t(), although the difference is less significant on the newer architecture.

# T4:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         1.2723           1.3940    0.0001       0.9127
1        (55, 55)         1.2873           1.3875    0.0115       0.9278
2      (555, 555)         1.2917           1.3887    1.1750       0.9302
3    (5555, 5555)         1.2803           1.3813  117.7140       0.9269
4  (25555, 25555)         1.1278           1.3752 2491.2187       0.8201

# A100:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.6749           0.6923    0.0001       0.9748
1        (55, 55)         0.6860           0.6945    0.0115       0.9877
2      (555, 555)         0.6919           0.6969    1.1750       0.9929
3    (5555, 5555)         0.6956           0.6996  117.7140       0.9943
4  (25555, 25555)         0.6814           0.6970 2491.2187       0.9776

Full code:

import pandas as pd
import torch


def benchmark_transpose(dims_list, num_iterations=100000):
    """Time ``x.T`` against ``x.t()`` on CUDA tensors of several shapes.

    For every shape a random tensor is created on the ``"cuda"`` device, both
    expressions are warmed up, and each one is then evaluated
    ``num_iterations`` times between a pair of CUDA timing events.

    Args:
        dims_list: Iterable of shape tuples, one benchmark row per entry.
        num_iterations: Number of evaluations inside each timed region.
            Must be positive.

    Returns:
        A ``pandas.DataFrame`` with one row per shape and the columns
        ``"dimensions"``, ``"tensor.T (ms)"``, ``"tensor.t() (ms)"`` (mean
        milliseconds per evaluation) and ``"size"`` (tensor size in bytes).

    Raises:
        ValueError: If ``num_iterations`` is not positive.
    """
    if num_iterations <= 0:
        raise ValueError("num_iterations must be a positive integer")

    results = []

    for dims in dims_list:
        x = torch.randn(*dims, device="cuda")
        # Warmup
        for _ in range(100):
            _ = x.T
            _ = x.t()

        # Drain the stream before timing. Otherwise the still-running
        # `torch.randn` kernel delays the timestamp of the first start event
        # while the host loop is already executing, which shortens the first
        # measurement (most visibly for large tensors).
        torch.cuda.synchronize()

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # The two loops are deliberately kept inline: wrapping the expressions
        # in callables would add call overhead to both measurements.

        # Benchmark x.T
        start_event.record()
        for _ in range(num_iterations):
            _ = x.T
        end_event.record()
        torch.cuda.synchronize()
        # `elapsed_time` already reports milliseconds, so no extra scaling is
        # needed to match the "(ms)" column labels.
        t_property_time = start_event.elapsed_time(end_event) / num_iterations

        # Benchmark x.t()
        start_event.record()
        for _ in range(num_iterations):
            _ = x.t()
        end_event.record()
        torch.cuda.synchronize()
        t_method_time = start_event.elapsed_time(end_event) / num_iterations

        results.append(
            {
                "dimensions": dims,
                "tensor.T (ms)": t_property_time,
                "tensor.t() (ms)": t_method_time,
                "size": x.nelement() * x.element_size(),
            }
        )

    return pd.DataFrame(results)


if __name__ == "__main__":
    # Explicit check instead of `assert`, which is skipped when Python runs
    # with -O and would let the script fail later with a less helpful error.
    if not torch.cuda.is_available():
        raise SystemExit("CUDA is required to run this benchmark.")

    dims_to_test = [
        (5, 5),
        (55, 55),
        (555, 555),
        (5555, 5555),
        (25555, 25555),
    ]
    results_df = benchmark_transpose(dims_to_test)

    # Format and display results
    pd.set_option("display.float_format", "{:.4f}".format)
    # Derived columns: tensor size in MiB and the relative cost of x.T vs x.t()
    # (a ratio below 1 means x.T was measured as faster).
    results_df["size_mb"] = results_df["size"] / (1024 * 1024)
    results_df["T/t() ratio"] = (
        results_df["tensor.T (ms)"] / results_df["tensor.t() (ms)"]
    )

    print("\nTranspose Operations Benchmark Results:")
    print("======================================")
    print(
        results_df[
            ["dimensions", "tensor.T (ms)", "tensor.t() (ms)", "size_mb", "T/t() ratio"]
        ]
    )

Edit: typo slower → faster @ptrblck