I’m practicing using torch.utils.data.DataLoader to load batches of data onto the GPU, but I find it is significantly slower than simply indexing the data directly — roughly 10 times slower. I wonder why this is, or whether I did something wrong.

Here is my code:

```
for train_index, test_index in kf.split(X, Y):
    # Split this CV fold into train/test partitions.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Wrap the fold's arrays as a TensorDataset and iterate shuffled mini-batches.
    # NOTE: with the default num_workers=0 the DataLoader assembles every batch
    # in the main process ONE SAMPLE AT A TIME (a Python-level __getitem__ call
    # per row, then a collate step) — this per-sample overhead is the usual
    # reason a DataLoader is ~10x slower than slicing the array directly.
    # pin_memory=True gives page-locked host buffers for faster GPU copies;
    # num_workers>0 would additionally move batch assembly off the main process.
    train_dataset = torch.utils.data.TensorDataset(
        torch.from_numpy(X_train), torch.from_numpy(y_train)
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size, shuffle=True, pin_memory=True
    )

    model = NNTrain()
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.MSELoss()  # build the loss module once, not once per batch
    model.train()
    for i in range(epochs):
        for X_batch, y_batch in train_dataloader:
            # non_blocking=True lets the H2D copy overlap compute (needs pinned memory).
            X_batch = X_batch.float().cuda(non_blocking=True)
            y_batch = y_batch.float().cuda(non_blocking=True)
            optimizer.zero_grad()
            y_batch_pred = model(X_batch)
            loss = criterion(y_batch_pred, y_batch)
            loss.backward()
            optimizer.step()
```

and the faster one:

```
for train_index, test_index in kf.split(X, Y):
    # Split this CV fold into train/test partitions.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = NNTrain()
    model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.MSELoss()  # build the loss module once, not once per batch
    model.train()
    for i in range(epochs):
        for j in range(batch_num):
            # Direct NumPy slicing produces one contiguous batch copy with no
            # per-sample __getitem__/collate overhead — hence the large speedup
            # over the DataLoader version.
            # NOTE(review): unlike the DataLoader version this never shuffles,
            # and any trailing partial batch past batch_num*batch_size is
            # silently dropped — confirm batch_num covers the whole fold.
            X_batch = torch.from_numpy(
                X_train[batch_size * j:batch_size * (j + 1), :]
            ).float().cuda()
            y_batch = torch.from_numpy(
                y_train[batch_size * j:batch_size * (j + 1), :]
            ).float().cuda()
            y_batch_pred = model(X_batch)
            loss = criterion(y_batch_pred, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
```