Using `torch.stack` on a list of tensors appears to be faster than preallocating the tensor and then writing into it:
```python
import torch


def preallocate(*shape, seed=0):
    torch.manual_seed(seed)
    x = torch.empty(shape)
    for i in range(shape[0]):
        torch.rand(shape[1:], out=x[i])
    return x


def stacker(*shape, seed=0):
    torch.manual_seed(seed)
    x = []
    for _ in range(shape[0]):
        x.append(torch.rand(shape[1:]))
    return torch.stack(x)


if __name__ == "__main__":
    import timeit
    import random

    seed = random.randint(-0x8000_0000_0000_0000, 0xffff_ffff_ffff_ffff)
    print(timeit.timeit(f"preallocate(100, 1000, seed={seed})",
                        "from __main__ import preallocate", number=10000))
    print(timeit.timeit(f"stacker(100, 1000, seed={seed})",
                        "from __main__ import stacker", number=10000))
```
The function that uses `torch.stack` is 4–5 seconds faster on my machine after a few runs. This is quite unexpected, since I assumed that calling `torch.stack` would copy all the tensors into one big tensor, whereas preallocation would avoid exactly that overhead. What could make preallocation slower than `torch.stack`?
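For reference, here is a sketch of two variants that might help narrow down where the time goes (the names `preallocate_single` and `preallocate_copy` are placeholders of my own, not anything from PyTorch): one fills the whole preallocated buffer with a single `torch.rand` call, the other writes each row with `copy_` instead of passing a view as `out=`.

```python
import torch


def preallocate_single(*shape, seed=0):
    # Hypothetical variant: one torch.rand call fills the entire
    # preallocated buffer, so there is no per-row out= handling at all.
    torch.manual_seed(seed)
    x = torch.empty(shape)
    torch.rand(shape, out=x)
    return x


def preallocate_copy(*shape, seed=0):
    # Hypothetical variant: generate each row into a fresh tensor and
    # copy_ it into the preallocated buffer, i.e. like stacker but
    # without the final torch.stack.
    torch.manual_seed(seed)
    x = torch.empty(shape)
    for i in range(shape[0]):
        x[i].copy_(torch.rand(shape[1:]))
    return x
```

If `preallocate_single` or `preallocate_copy` turns out comparable to `stacker` while the original `preallocate` stays slow, that would point at the per-iteration `out=x[i]` calls rather than the preallocation itself, but I have not confirmed this.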