I wrote a custom data loader but its training buckle spent a lot of time iterating over the data, loader (217 seconds per batch (64) ). I will like to know if by any chance i have committed a mistake writing the data set. In addition i used the nn.random_split for val_loader creation.
import torch
from torch.utils.data import Dataset
import pandas as pd
from constants import IPY
from CTR.cross_net.constants import trainable
class Dataset_rtb(Dataset):
def __init__(self, file_path):
self.data = pd.read_csv(file_path, sep='\t')
self.index = self.data[IPY["request_id"]].dropna()
self.data = self.data[['click'] + trainable]
def __len__(self):
return len(self.data)
def __getitem__(self, index):
# load image as ndarray type (Height * Width * Channels)
# be carefull for converting dtype to np.uint8 [Unsigned integer (0 to 255)]
# in this example, i don't use ToTensor() method of torchvision.transforms
# so you can convert numpy ndarray shape to tensor in PyTorch (H, W, C) --> (C, H, W)
features = []
#row_cont = self.data[["slotwidth", "slotheight"]].iloc[index]
row = self.data[trainable].iloc[index]
for i in range(len(trainable)):
one_hot_tensor = torch.tensor(int(row[i]))
features.append(one_hot_tensor)
#features.append(torch.tensor(row_cont))
label = torch.tensor(self.data['click'].iloc[index])
return features, label```