Thanks for letting me know about torch.utils.benchmark. I constructed a new benchmark with it, and I also adjusted the settings so that the convolution should be deterministic.
import torch
import torch.utils.benchmark as benchmark
from itertools import product
class CNN(torch.nn.Module):
    """Minimal wrapper around a single 1-D convolution, used for benchmarking.

    Maps an input of shape (batch, in_dim, length) to
    (batch, out_dim, length - kernel_size + 1).
    """

    def __init__(self, in_dim, out_dim, kernel_size):
        super().__init__()
        self.conv_1 = torch.nn.Conv1d(in_dim, out_dim, kernel_size)

    def forward(self, x):
        # No activation or pooling: the benchmark measures the raw convolution.
        return self.conv_1(x)
def test_cnn(x,cnn):
return cnn(x).sum().backward()
# Make the timings reproducible: disable cuDNN autotuning (which would pick
# different algorithms per shape) and force deterministic kernels.
torch.backends.cudnn.benchmark = False
# torch.set_deterministic() is deprecated; use_deterministic_algorithms()
# is the supported replacement (PyTorch >= 1.8).
torch.use_deterministic_algorithms(True)

device = "cuda:0"

# Three input batches with different (batch, channels, length) shapes,
# keyed by the description label used in the comparison table.
inputs = {
    "x1": torch.randn(256, 77, 126).to(device),
    "x2": torch.randn(211, 77, 137).to(device),
    "x3": torch.randn(512, 77, 221).to(device),
}

out_dims = [50, 80, 100, 120, 150]
kernel_sizes = [2, 3, 4]

results = []
for out_dim, kernel_size in product(out_dims, kernel_sizes):
    sub_label = f"[77, {out_dim}, {kernel_size}]"
    cnn = CNN(77, out_dim, kernel_size).to(device)
    for num_threads in [1, 4]:
        # One timing run per (model config, thread count, input) triple;
        # looping over the inputs replaces three copy-pasted Timer calls.
        for description, x in inputs.items():
            results.append(
                benchmark.Timer(
                    stmt="test_cnn(x,cnn)",
                    setup="from __main__ import test_cnn",
                    globals={"x": x, "cnn": cnn},
                    num_threads=num_threads,
                    sub_label=sub_label,
                    description=description,
                ).blocked_autorange(min_run_time=1)
            )

compare = benchmark.Compare(results)
compare.print()
Here is the output that I get:
[-------------------- -------------------]
| x1 | x2 | x3
1 threads: --------------------------------
[77, 50, 2] | 4.7 | 4.2 | 16.7
[77, 50, 3] | 4.7 | 4.1 | 16.6
[77, 50, 4] | 4.5 | 4.0 | 15.9
[77, 80, 2] | 4.4 | 3.9 | 15.6
[77, 80, 3] | 4.3 | 3.8 | 15.4
[77, 80, 4] | 4.3 | 3.8 | 14.9
[77, 100, 2] | 3.5 | 3.2 | 12.1
[77, 100, 3] | 3.5 | 3.2 | 12.3
[77, 100, 4] | 3.6 | 3.2 | 12.4
[77, 120, 2] | 3.6 | 3.2 | 12.3
[77, 120, 3] | 3.6 | 3.2 | 12.4
[77, 120, 4] | 3.6 | 3.2 | 12.5
[77, 150, 2] | 5.1 | 4.5 | 18.1
[77, 150, 3] | 5.0 | 4.5 | 18.0
[77, 150, 4] | 4.9 | 4.4 | 17.7
4 threads: --------------------------------
[77, 50, 2] | 4.7 | 4.2 | 16.7
[77, 50, 3] | 4.7 | 4.1 | 16.6
[77, 50, 4] | 4.5 | 4.0 | 15.9
[77, 80, 2] | 4.4 | 3.9 | 15.6
[77, 80, 3] | 4.3 | 3.8 | 15.4
[77, 80, 4] | 4.3 | 3.8 | 14.9
[77, 100, 2] | 3.5 | 3.2 | 12.1
[77, 100, 3] | 3.5 | 3.2 | 12.3
[77, 100, 4] | 3.6 | 3.2 | 12.4
[77, 120, 2] | 3.6 | 3.2 | 12.3
[77, 120, 3] | 3.6 | 3.2 | 12.4
[77, 120, 4] | 3.6 | 3.2 | 12.5
[77, 150, 2] | 5.1 | 4.5 | 18.1
[77, 150, 3] | 5.0 | 4.5 | 18.0
[77, 150, 4] | 4.9 | 4.4 | 17.7
Times are in milliseconds (ms).
I find these results quite confusing. I was expecting the runtime to increase with the size of out_channels. However, the results indicate that there is some kind of sweet spot around 100/120. Also, for out_channels of 100 and 120, an increase in kernel_size leads to an increase in runtime, while for the other out_channels sizes the runtime decreases as kernel_size increases. Is there an explanation for these results?