Data Loader is slow compared with custom batching.

Why is “run_custom_batch” 20 times faster than “run_pytorch_batch”?

from time import time

import torch
import torch.utils.data as data_utils
from torch import nn

class Controller(torch.nn.Module):
    """Two stacked linear layers mapping 12 input features to 4 outputs.

    Args:
        n: Width of the hidden layer between the two linear layers.
    """

    def __init__(self, n=20):
        super(Controller, self).__init__()
        # NOTE(review): the snippet this was recovered from had lost the
        # assignment target and the closing parenthesis; the attribute name
        # `model` is a reconstruction -- confirm against the original code.
        self.model = nn.Sequential(
            nn.Linear(12, n),
            nn.Linear(n, 4),
        )

    def forward(self, Z):
        """Run ``Z`` (last dimension of size 12) through the layer stack."""
        return self.model(Z)

def run_custom_batch(n_epoch, n, batch_size):
    """Time forward passes over hand-sliced batches of a random tensor.

    Creates a fresh ``Controller`` and an ``(n, 12)`` float32 tensor. For
    each of ``n_epoch`` epochs it feeds consecutive ``batch_size``-row
    slices through the model (the last slice may be shorter) and prints the
    wall-clock time the epoch took.
    """
    print(f"running custom_batch, n = {n}, batch_size = {batch_size}")
    model = Controller()
    samples = torch.rand(n, 12, dtype=torch.float32)

    for epoch in range(n_epoch):
        started = time()

        lo = 0
        hi = batch_size
        while True:
            if hi >= n:
                # Tail slice: everything not yet consumed, up to row n.
                model(samples[lo:n])
                break
            model(samples[lo:hi])
            lo = hi
            hi += batch_size

        elapsed = time() - started
        print(f"i = {epoch}, time={elapsed}")

def run_pytorch_batch(n_epoch, n, batch_size):
    """Time forward passes over batches produced by a ``DataLoader``.

    Creates a fresh ``Controller`` and an ``(n, 12)`` float32 tensor, wraps
    the tensor in a ``TensorDataset`` and iterates it through an unshuffled
    ``DataLoader`` with the given ``batch_size``. For each of ``n_epoch``
    epochs every batch is fed through the model and the wall-clock time of
    the epoch is printed.
    """
    print(f"running pytorch_batch, n = {n}, batch_size = {batch_size}")
    model = Controller()
    samples = torch.rand(n, 12, dtype=torch.float32)

    loader = data_utils.DataLoader(
        data_utils.TensorDataset(samples),
        batch_size=batch_size,
        shuffle=False,
    )

    for epoch in range(n_epoch):
        started = time()
        # Each item from the loader is a one-element sequence holding the
        # batch tensor, because the dataset wraps a single tensor.
        for (batch,) in loader:
            model(batch)

        elapsed = time() - started
        print(f"i = {epoch}, time={elapsed}")

if __name__ == '__main__':
    # Run both benchmarks on the same problem size so the per-epoch timings
    # are directly comparable.
    batch_size = 1024

    run_custom_batch(n_epoch=5, n=300000, batch_size=batch_size)
    # Output reported by the author for the call above:
    #   i = 0, time=0.09275412559509277
    #   i = 1, time=0.07978677749633789
    #   i = 2, time=0.07679605484008789
    #   i = 3, time=0.07879066467285156
    #   i = 4, time=0.07879066467285156

    run_pytorch_batch(n_epoch=5, n=300000, batch_size=batch_size)
    # Output reported by the author for the call above:
    #   i = 0, time=1.7543346881866455
    #   i = 1, time=1.7942287921905518
    #   i = 2, time=1.8034889698028564
    #   i = 3, time=1.8829929828643799
    #   i = 4, time=1.836116075515747

I would assume that simple slicing is faster than calling into the __getitem__ method of your Dataset and stacking the samples to a tensor.
The slowdown of 20x is quite large and I wouldn’t expect it. However, did you try different numbers of workers for your DataLoader approach and compare them to the manual slicing approach?

The main advantage of using the DataLoader is e.g., that you can use multiple processes to prefetch the next batch, while your main training loop is busy with the model training.
Also, you can easily shuffle, provide a custom sampler, a custom collate_fn, use pinned memory, and can easily implement lazy loading of the samples in your Dataset.

If you can store the data in memory and don’t need the advantages of the Dataset and DataLoader, your approach seems completely fine. :slight_smile:

Thank you, @ptrblck.
Using a different number of workers does not help, I think because all the data is already in memory.