Difference between tensor.t() and tensor.T?

Given a matrix A, say:

A = torch.randn(5,5)

What is the difference between A.T and A.t()?

1 Like

From the docs:
tensor.t:

Expects input to be <= 2-D tensor and transposes dimensions 0 and 1.
0-D and 1-D tensors are returned as is. When input is a 2-D tensor this is equivalent to transpose(input, 0, 1).

tensor.T:

Returns a view of this tensor with its dimensions reversed.
If n is the number of dimensions in x, x.T is equivalent to x.permute(n-1, n-2, ..., 0).

In your use case both will yield the same result.

5 Likes

I think it is also valuable to add that t() won’t work for dim > 2, but transpose will, so it is then necessary to use transpose(input, 0, 1) for higher-dimensional tensors, as ptrblck kinda wrote above.

1 Like

By testing on my GPU machine, for 2-D Tensors x.T is consistently, slightly faster than x.t().

x.T is also syntactically shorter and more intuitive so it should be preferred.

Could you show your profiling code to measure this overhead?

Yes, here:

import time

import pandas as pd
import torch


def benchmark_transpose(dims_list: list[tuple[int, ...]], num_iterations: int = 100000):
    """Time ``x.T`` vs ``x.t()`` on CUDA tensors using host-side timers.

    Args:
        dims_list: Tensor shapes to benchmark, e.g. ``[(5, 5), (55, 55)]``.
        num_iterations: Timed repetitions per shape and per operation.

    Returns:
        A ``pandas.DataFrame`` with one row per shape, holding per-iteration
        times (ms), the shape, and the tensor size in bytes.

    NOTE(review): ``torch.cuda.synchronize()`` is called *inside* each timed
    loop, so every iteration pays full host-device sync latency; since both
    ``.T`` and ``.t()`` return views (per the docs quoted above), presumably
    the sync overhead dominates what is measured — confirm by hoisting the
    sync outside the loop (see the follow-up reply below, where doing so
    flips the result).
    """
    results = []

    for dims in dims_list:
        x = torch.randn(*dims, device="cuda")
        # Warmup
        for _ in range(100):
            _ = x.T
            _ = x.t()

        # Benchmark x.T
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        for _ in range(num_iterations):
            _ = x.T
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        # perf_counter() is in seconds; divide by iterations for s/iter.
        t_property_time = (end_time - start_time) / num_iterations

        # Benchmark x.t()
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        for _ in range(num_iterations):
            _ = x.t()
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        t_method_time = (end_time - start_time) / num_iterations

        results.append(
            {
                "dimensions": dims,
                # seconds/iter * 1000 -> milliseconds/iter, matching the label.
                "tensor.T (ms)": t_property_time * 1000,
                "tensor.t() (ms)": t_method_time * 1000,
                # Total payload in bytes: element count * bytes per element.
                "size": x.nelement() * x.element_size(),
            }
        )

    return pd.DataFrame(results)


if __name__ == "__main__":
    # `assert` is stripped under `python -O`, so it must not guard runtime
    # requirements; fail loudly with an explicit exception instead.
    if not torch.cuda.is_available():
        raise RuntimeError("This benchmark requires a CUDA-capable GPU.")
    dims_to_test = [
        (5, 5),
        (55, 55),
        (555, 555),
        (5555, 5555),
        (25555, 25555),
    ]
    results_df = benchmark_transpose(dims_to_test)

    # Format and display results
    pd.set_option("display.float_format", lambda x: "{:.4f}".format(x))
    results_df["size_mb"] = results_df["size"] / (1024 * 1024)
    results_df["T/t() ratio"] = (
        results_df["tensor.T (ms)"] / results_df["tensor.t() (ms)"]
    )

    print("\nTranspose Operations Benchmark Results:")
    print("======================================")
    print(
        results_df[
            ["dimensions", "tensor.T (ms)", "tensor.t() (ms)", "size_mb", "T/t() ratio"]
        ]
    )

Local results:

Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.0080           0.0083    0.0001       0.9664
1        (55, 55)         0.0081           0.0083    0.0115       0.9656
2      (555, 555)         0.0080           0.0083    1.1750       0.9675
3    (5555, 5555)         0.0079           0.0082  117.7140       0.9619
4  (25555, 25555)         0.0080           0.0083 2491.2187       0.9648

I don’t think your benchmark shows a real difference and would claim the noise is too large when you are measuring microsecond intervals with host timers.
E.g. if I move the syncs outside of the for loop right before stopping the host timers I get the opposite results:

# Transpose Operations Benchmark Results:
# ======================================
#        dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
# 0          (5, 5)         0.0007           0.0007    0.0001       1.0178
# 1        (55, 55)         0.0007           0.0007    0.0115       1.0174
# 2      (555, 555)         0.0007           0.0007    1.1750       1.0191
# 3    (5555, 5555)         0.0007           0.0007  117.7140       1.0181
# 4  (25555, 25555)         0.0007           0.0007 2491.2187       1.0254

Interesting when I do the same on different architectures. On my Tesla T4, I still get similar results:

Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.0013           0.0014    0.0001       0.9411
1        (55, 55)         0.0013           0.0014    0.0115       0.9168
2      (555, 555)         0.0013           0.0014    1.1750       0.9211
3    (5555, 5555)         0.0013           0.0014  117.7140       0.9319
4  (25555, 25555)         0.0013           0.0014 2491.2187       0.9386

But on an A100, results are similar to yours:

Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.0008           0.0007    0.0001       1.0309
1        (55, 55)         0.0007           0.0007    0.0115       1.0317
2      (555, 555)         0.0008           0.0007    1.1750       1.0375
3    (5555, 5555)         0.0007           0.0007  117.7140       1.0349
4  (25555, 25555)         0.0007           0.0007 2491.2187       1.0163

Anyway, then I tried the recommended torch.cuda.Event for timing, and finally got results that drew the same conclusion on both architectures: x.T is indeed (slightly) faster than x.t(); the difference is yet less significant on newer architectures.

# T4:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         1.2723           1.3940    0.0001       0.9127
1        (55, 55)         1.2873           1.3875    0.0115       0.9278
2      (555, 555)         1.2917           1.3887    1.1750       0.9302
3    (5555, 5555)         1.2803           1.3813  117.7140       0.9269
4  (25555, 25555)         1.1278           1.3752 2491.2187       0.8201

# A100:
Transpose Operations Benchmark Results:
======================================
       dimensions  tensor.T (ms)  tensor.t() (ms)   size_mb  T/t() ratio
0          (5, 5)         0.6749           0.6923    0.0001       0.9748
1        (55, 55)         0.6860           0.6945    0.0115       0.9877
2      (555, 555)         0.6919           0.6969    1.1750       0.9929
3    (5555, 5555)         0.6956           0.6996  117.7140       0.9943
4  (25555, 25555)         0.6814           0.6970 2491.2187       0.9776

Full code:

import pandas as pd
import torch


def benchmark_transpose(dims_list: list[tuple[int, ...]], num_iterations: int = 100000):
    """Time ``x.T`` vs ``x.t()`` on CUDA tensors using CUDA events.

    Args:
        dims_list: Tensor shapes to benchmark, e.g. ``[(5, 5), (55, 55)]``.
        num_iterations: Timed repetitions per shape and per operation.

    Returns:
        A ``pandas.DataFrame`` with one row per shape, holding per-iteration
        times (ms), the shape, and the tensor size in bytes.
    """
    results = []

    for dims in dims_list:
        x = torch.randn(*dims, device="cuda")
        # Warmup so one-time setup cost does not land in the timed region.
        for _ in range(100):
            _ = x.T
            _ = x.t()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        # Benchmark x.T
        start_event.record()
        for _ in range(num_iterations):
            _ = x.T
        end_event.record()
        torch.cuda.synchronize()
        # Event.elapsed_time() already returns milliseconds.
        t_property_time = start_event.elapsed_time(end_event) / num_iterations

        # Benchmark x.t()
        start_event.record()
        for _ in range(num_iterations):
            _ = x.t()
        end_event.record()
        torch.cuda.synchronize()
        t_method_time = start_event.elapsed_time(end_event) / num_iterations

        results.append(
            {
                "dimensions": dims,
                # BUG FIX: elapsed_time() is in ms, so the previous `* 1000`
                # silently reported microseconds under an "(ms)" label.
                # The T/t() ratio was unaffected, but the absolute columns were.
                "tensor.T (ms)": t_property_time,
                "tensor.t() (ms)": t_method_time,
                # Total payload in bytes: element count * bytes per element.
                "size": x.nelement() * x.element_size(),
            }
        )

    return pd.DataFrame(results)


if __name__ == "__main__":
    # `assert` is stripped under `python -O`, so it must not guard runtime
    # requirements; fail loudly with an explicit exception instead.
    if not torch.cuda.is_available():
        raise RuntimeError("This benchmark requires a CUDA-capable GPU.")
    dims_to_test = [
        (5, 5),
        (55, 55),
        (555, 555),
        (5555, 5555),
        (25555, 25555),
    ]
    results_df = benchmark_transpose(dims_to_test)

    # Format and display results
    pd.set_option("display.float_format", lambda x: "{:.4f}".format(x))
    results_df["size_mb"] = results_df["size"] / (1024 * 1024)
    results_df["T/t() ratio"] = (
        results_df["tensor.T (ms)"] / results_df["tensor.t() (ms)"]
    )

    print("\nTranspose Operations Benchmark Results:")
    print("======================================")
    print(
        results_df[
            ["dimensions", "tensor.T (ms)", "tensor.t() (ms)", "size_mb", "T/t() ratio"]
        ]
    )

Edit: typo slower → faster @ptrblck