# Question about tensor assign time

Why does `unpack_time` in the code snippet below take so long?

``````python
import time

import torch


class Loader:
    def __init__(self, shape):
        self.shape = shape
        self.tensor = torch.randn(shape, dtype=torch.float32)
        self.idx = 0
        self.num = 5
        self.batch_idxs = torch.randint(0, self.shape[0], size=(self.shape[0],),
                                        dtype=torch.int64)
        # self.batch_idxs = torch.arange(self.shape[0], dtype=torch.int64)
        self.results = []

    def __iter__(self):
        return self

    def __next__(self):
        if self.idx < self.num:
            self.idx += 1
            result = [self.tensor[self.batch_idxs[i * 10000:(i + 1) * 10000]]
                      for i in range(5)]
            # self.results.append(result)  # if we keep a reference to result, the used time decreases a lot
            return result
        raise StopIteration


if __name__ == "__main__":
    process_start_time = time.time()
    # Example shape; it must satisfy shape[0] >= 50000 for the slicing above.
    loader = Loader((50000, 3, 32, 32))

    dataloader_start = time.time()
    batch_start = time.time()
    for x in loader:
        batch_end = time.time()

        unpack_start = time.time()
        t = x  # costs time here
        # a, b, c, d, e = x  # similar to the line above
        unpack_end = time.time()

        print("loader_batch_time {:.8f} s | unpack_time {:.8f} s".format(
            batch_end - batch_start, unpack_end - unpack_start))
        batch_start = time.time()
    print("process dataloader time is {:.4f} s".format(time.time() - dataloader_start))

    print("process time is {:.4f} s".format(time.time() - process_start_time))
``````

The code prints the following log:

``````loader_batch_time 0.40162539 s | unpack_time 0.00000095 s
loader_batch_time 0.34314251 s | unpack_time 0.12212968 s
loader_batch_time 0.31168938 s | unpack_time 0.11875153 s
loader_batch_time 0.30921507 s | unpack_time 0.11288309 s
loader_batch_time 0.28784609 s | unpack_time 0.12381434 s
process dataloader time is 2.1312 s
process time is 8.1604 s
``````

As we can see, `unpack_time` takes a long time. Why is this?
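For context on where the time could go: in CPython, rebinding a name immediately frees the object it previously pointed to if that was its last reference, so `t = x` can end up paying for deallocating the previous iteration's five tensors. A minimal stdlib sketch (bytearrays stand in for the tensors; `Batch` is an illustrative `list` subclass only so that a weak reference can observe its lifetime):

```python
import weakref


class Batch(list):
    """Illustrative list subclass; plain lists cannot be weak-referenced."""


def make_batch():
    # Five big buffers stand in for the five tensor slices per batch.
    return Batch(bytearray(10_000_000) for _ in range(5))


t = make_batch()        # first iteration: `t` is bound for the first time
probe = weakref.ref(t)  # watch that batch's lifetime

x = make_batch()        # __next__ builds the next batch
t = x                   # rebinding drops the last reference to the old batch...
print(probe() is None)  # ...so it is freed right here: prints True
```

This would also explain why the first iteration is fast: there is no previous batch to free when `t` is bound for the first time.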

In addition, `unpack_time` in the first iteration is very short, and it looks as if garbage collection runs in every subsequent cycle.
If garbage collection is indeed what happens here, is this time cost normal? Are there any optimizations that reduce it?
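One way to check whether Python's cyclic garbage collector (the `gc` module) is actually responsible is to disable it and see whether the old objects are still freed at the rebinding. A minimal stdlib sketch (bytearrays stand in for the tensors; `Batch` is an illustrative `list` subclass only so that a weak reference can observe its lifetime):

```python
import gc
import weakref


class Batch(list):
    """Illustrative list subclass; plain lists cannot be weak-referenced."""


gc.disable()  # take the cyclic collector out of the picture

t = Batch(bytearray(10_000_000) for _ in range(5))
probe = weakref.ref(t)  # watch the old batch

t = Batch(bytearray(10_000_000) for _ in range(5))  # "next iteration"

# The old batch is gone even with gc disabled: CPython freed it through
# reference counting the moment `t` was rebound.
print(probe() is None)  # True

gc.enable()
```

So if the pause persists with `gc.disable()`, it is deallocation via reference counting rather than a gc pass.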