I’m training with a dataset of about 80 GB (12,000,000 images; 5,000 categories). I use DataLoader to load the training data.
The loading process is very fast at first (0.3 s, CPU 30%, RAM 10% of 30 GB), but after some iterations it becomes very slow (6 s, CPU 5%, RAM 10% of 30 GB).
I set the CPU mode to performance and tried different values of num_workers (0, 4, 8, 12, …), but the situation stayed the same.
I really don’t know how to solve this frustrating problem. What is the best practice for loading large datasets in PyTorch?
Here is my code:
import os
import torch
import pandas as pd
import cv2
import numpy as np
import time
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
class xiongDataset(Dataset):
    """Image-classification dataset backed by a CSV index.

    The CSV must contain ``product_id``, ``img_idx`` and ``category_idx``
    columns; each row maps to an image file named
    ``<product_id>_<img_idx>.jpg`` under ``root_dir``.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file: path to (or file-like object of) the CSV index.
            root_dir: directory containing the image files.
            transform: optional callable applied to each loaded image.
        """
        self.root_dir = root_dir
        self.transform = transform
        index = pd.read_csv(csv_file)
        self.labels = list(index['category_idx'])
        # Precompute every file name once so __getitem__ stays cheap.
        self.train_names = [
            '{}_{}.jpg'.format(pid, img_idx)
            for pid, img_idx in zip(index['product_id'], index['img_idx'])
        ]

    def __len__(self):
        return len(self.train_names)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx`` of the CSV index."""
        # os.path.join matches the old string concatenation when root_dir
        # ends with a separator, and also works when it does not.
        path = os.path.join(self.root_dir, self.train_names[idx])
        img = cv2.imread(path)
        if img is None:
            # cv2.imread silently returns None on a missing or unreadable
            # file; fail loudly here instead of crashing later inside the
            # transform with an unrelated error.
            raise FileNotFoundError('could not read image: {}'.format(path))
        label = self.labels[idx]
        if self.transform is not None:
            img = self.transform(img)
        return img, label
# Build the dataset and a loader tuned for large-scale training.
train_data = xiongDataset('../train_images.csv', '../train/', transform=transforms.ToTensor())

# shuffle=True is standard practice for training (shuffle=False also biases
# each epoch toward CSV order). num_workers > 0 lets worker processes
# overlap JPEG reads/decodes with the training step, and pin_memory=True
# speeds up host-to-device copies.
#
# NOTE(review): the stall pattern described above (fast at first, then slow
# with low CPU) is typical of a cold OS page cache on a dataset far larger
# than RAM — once the cached prefix is exhausted, every read hits the disk.
# Worker count can hide latency but not raw disk throughput; faster storage
# (SSD) or a packed sequential format (LMDB / webdataset / TFRecord) is the
# structural fix for random reads over 12M small files.
data_loader = DataLoader(
    train_data,
    batch_size=256,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)