DataLoader using the chunk feature of read_csv for memory-efficient data loading

my network is running slow and i suspect it’s got to do with the loading scheme, which is based on tutorials and simply loads all data to the dataset object during initialization.

am i right to think that it might help to load a chunk of data at each time, iterate through this chunk, update weights and then discard the chunk from memory and start again with another chunk?

I found the thread “How to use dataset larger than memory?” to be close to what I’m looking for, but it ended without a straightforward solution.

My current dataset class, shown below, is initialized by loading the entire dataset and performing some preprocessing on it, storing the entire preprocessed data as attributes of the class instance.
the get item method receives a random idx and returns the sample (input and target) after some more transformations that i apply manually (in the getitem method).

i’m not sure how to continue from here because if i set read_csv to “chunk mode” it returns an iterator. this way how do i perform preprocessing in the initialization?
i could preprocess in the getitem method, but then get item would get a chunk of data instead of a single sample, how can i pass that?

I’m adding my current dataset class for your consideration. Any advice would be appreciated, but what I couldn’t find, and would love to, is a simple example for my case: a full implementation of loading chunks into a dataset object, configuring it with a DataLoader object, and going all the way to a training loop.
class FacialKeypoints(Dataset):

def __init__(self, test=False, cols=None,FTRAIN = 'data/Q3/training.csv', FTEST = 'EX1/Q3/test.csv', transform_vars=None, batch_size = 16):
    """Load the whole CSV into memory and build image / target tensors.

    Reads ``FTEST`` when ``test`` is true, otherwise ``FTRAIN``; parses the
    space-separated ``Image`` column into pixel arrays, drops rows with
    missing values, and stores the results on the instance as ``self.X``
    (images), ``self.y`` (keypoint coordinates) and ``self.Y`` (per-sample
    keypoint masks). ``transform_vars`` is stored unchanged for later use.

    ``cols`` and ``batch_size`` are accepted but not used anywhere in the
    code visible here.

    NOTE(review): this listing looks truncated -- see the notes at the
    innermost ``if`` and at ``y = None`` below. As shown it does not parse.
    """
    fname = FTEST if test else FTRAIN
    df = read_csv(os.path.expanduser(fname))  # load pandas dataframe

    # The Image column has pixel values separated by space; convert
    # the values to numpy arrays:
    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' '))

    print('number of values in each column: ', df.count())  # prints the number of values for each column
    df = df.dropna()  # drop all rows that have missing values in them
    X = np.vstack(df['Image'].values) / 255.  # scale pixel values to [0, 1]
    X = X.astype(np.float32)
    # Side length of the image, derived from the flattened pixel count.
    image_size = int(np.sqrt(X.shape[1]))
    Y = []
    if not test:  # only FTRAIN has any target columns
        # Every column except the last one is treated as a target value.
        y = df[df.columns[:-1]].values
        # View the targets as 15 coordinate pairs per sample.
        y2 = y.reshape(y.shape[0],15,2)
        for coords in y2:
            # One image-sized counter grid per sample; each keypoint
            # increments the cell at [pair[1] - 1, pair[0] - 1].
            mask = np.zeros((image_size,image_size))
            for pair in coords:
                pair = pair.round().astype(int)
                mask[pair[1] - 1, pair[0] - 1] += 1
                if mask[pair[1] - 1, pair[0] - 1]==3:
                # NOTE(review): the body of this `if` is missing from the
                # listing, and nothing visible ever appends `mask` to `Y`
                # -- presumably one or more lines were lost; restore them
                # from the original file.
        Y = np.array(Y)
        y = (y - 48) / 48  # scale target coordinates to [-1, 1]
        X, y, Y = shuffle(X, y, Y, random_state=42)  # shuffle train data
        y = y.astype(np.float32)
        # NOTE(review): as shown, this unconditionally discards the targets
        # computed above and makes `torch.tensor(y)` below fail. It
        # presumably belonged to a lost `else:` branch of `if not test:`
        # -- TODO confirm.
        y = None

    self.X = torch.tensor(X,dtype=torch.float32)
    self.transform_vars = transform_vars
    self.y = torch.tensor(y)
    self.Y = torch.tensor(Y,dtype=torch.float32)
    print('finished loading')
def __len__(self):
    """Return the number of samples, i.e. the length of ``self.X``."""
    samples = self.X
    return len(samples)

def transform(self,image, mask):
    """Randomly flip and/or rotate an image together with its keypoint mask.

    Args:
        image: tensor with 96*96 elements; it is reshaped to (96, 96).
        mask: 2-D tensor whose non-zero cells mark keypoints.

    Returns:
        ``(image.unsqueeze(0), mask)`` -- the (possibly augmented) image with
        a leading dimension added, and the matching mask.

    Reads ``flip_probability``, ``rotate_probability`` and (when rotating)
    ``degrees`` from ``self.transform_vars``. Relies on ``TF``, ``flip`` and
    ``tilt`` defined elsewhere in the file (``TF`` is presumably
    ``torchvision.transforms.functional`` -- confirm).
    """
    image = image.reshape(96,96)
    flip_prob = self.transform_vars['flip_probability']
    rotate_prob = self.transform_vars['rotate_probability']

    def remap(old_mask, move):
        # Copy every non-zero cell of `old_mask` to the location `move`
        # returns for it, keeping the cell's value (its keypoint count).
        new_mask = torch.zeros((96, 96))
        for point in torch.nonzero(old_mask, as_tuple=False):
            target = move(point)
            new_mask[target[0] - 1, target[1] - 1] = old_mask[point[0], point[1]]
        return new_mask

    # Flip with probability `flip_prob`. The comparison used to be `>`, which
    # flipped with probability 1 - flip_prob and disagreed with the rotation
    # test below.
    if torch.rand(1) < flip_prob:
        image = TF.hflip(image)
        mask = remap(mask, flip)
    if torch.rand(1) < rotate_prob:
        avg_pixel = image.mean()
        degrees = self.transform_vars['degrees']
        # NOTE(review): this yields angles in [-degrees, 0] only; if a
        # symmetric range was intended it should be 2 * degrees * rand - degrees.
        deg = int(torch.rand(1).item() * degrees - degrees)
        image_r = TF.to_tensor(TF.rotate(TF.to_pil_image(image),deg)).squeeze()
        # Pixels that became zero through the rotation (but were not zero
        # before) are filled with the image's mean intensity.
        image_r[(image_r==0) * (image!=0)] = avg_pixel
        image = image_r
        mask = remap(mask, lambda point: tilt(point, deg))
    return image.unsqueeze(0), mask

def update_target(self,mask):
    """Rebuild the flat keypoint target vector from a keypoint mask.

    Coordinates of cells equal to 1 are taken once; coordinates of cells
    equal to 2 (two keypoints on the same cell) are taken twice. The values
    are then scaled with ``(v - 48) / 48``.

    Returns:
        A float64 tensor of 30 values. If the mask does not yield exactly 30
        values, a message is printed and the ``'keypoints'`` entry of a
        randomly drawn other sample is returned instead.
    """
    single = (mask==1).nonzero(as_tuple=False).reshape(-1)
    double = (mask==2).nonzero(as_tuple=False).reshape(-1).repeat(2)
    keypoints = torch.hstack([single, double])
    # Scale in torch (float64, as the former numpy round-trip produced) so
    # that this also works for tensors that do not live on the CPU.
    keypoints = (keypoints.to(torch.float64) - 48) / 48
    if keypoints.shape[0]!=30:
        print('bad after transform')
        temp = self.__getitem__(torch.randint(0,len(self),[1]).item())
        return temp['keypoints']
    # This return used to be nested after the return above, which made it
    # unreachable and left the normal path returning None.
    return keypoints

def __getitem__(self, idx):
    """Return one sample as ``{'image': ..., 'keypoints': ...}``.

    ``idx`` may be a plain index or a tensor (converted with ``tolist()``).
    The raw ``idx`` is also remembered on ``self.idx``. When
    ``self.transform_vars['is']`` is truthy the image and its mask go through
    ``self.transform`` and the keypoints are rebuilt from the transformed mask
    with ``self.update_target``; otherwise the stored image and keypoints are
    returned unchanged.
    """
    self.idx = idx
    if torch.is_tensor(idx):
        idx = idx.tolist()
    image = self.X[idx]
    keypoints = self.y[idx]
    # `transform_vars` defaults to None in the constructor; treat that (or a
    # missing 'is' key) as "no augmentation" instead of raising.
    augment = bool(self.transform_vars) and self.transform_vars.get('is', False)
    if augment:
        image, mask = self.transform(image, self.Y[idx])
        keypoints = self.update_target(mask).to(dtype = torch.float32)
    # Single exit point: the non-augmented path used to fall through and
    # return None because its return statement was unreachable.
    return {'image':image, 'keypoints':keypoints}


i figured out how to use the chunk loader feature of pd.read_csv, but ran into difficulties since the iterator object (returned by read_csv with chunksize argument) can only draw samples at a fixed order (and i want the order to be shuffled after each epoch)

i found a way to bypass that, but i’m afraid it is still very slow. my new approach:

  1. changed the sampler to a custom sampler such that each time getitem is called, it takes in a batch of indices

  2. the getitem function takes the list of indices generated by the sampler and utilizes the skiprows feature of pd.read_csv.

So, to be clear: I’m not using the chunk loader. I simply call read_csv every time I draw a batch of samples and skip all rows except the rows of the batch.

I’m adding the code here; if anyone has a better solution, please let me know :)

import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split as splitter
from torch.utils.data import BatchSampler
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from torch.utils.data import SequentialSampler
from torch.utils.data import Subset

from helper_funcs import pixel_distance
from preprocess import FacialKeypoints

# Two dataset instances over the same data: one with augmentation switched on
# (training) and one with it switched off (validation).
transformed_dataset = FacialKeypoints(transform_vars={'is':True,'degrees':10,'flip_probability':0.5,'rotate_probability':0.5})
validation_dataset = FacialKeypoints(transform_vars={'is':False})
n = len(transformed_dataset)
num_train = int(np.ceil(n * 0.85))
num_val = int(n - num_train)

# Split sample indices 85% / 15% into training and validation sets.
train_idx, val_idx = splitter(np.arange(n),train_size=num_train,shuffle=True)
batch_size = 16

trainset = Subset(transformed_dataset,train_idx)
valset = Subset(validation_dataset,val_idx)

# NOTE(review): the sampler expressions were garbled in the listing (only
# ", batch_size=batch_size,drop_last=False)" survived). They are rebuilt here
# as BatchSamplers, which matches the surviving keyword arguments; the inner
# samplers (random for training, sequential for validation) are an assumption
# -- confirm against the original script.
train_sampler = BatchSampler(RandomSampler(trainset), batch_size=batch_size,drop_last=False)
val_sampler = BatchSampler(SequentialSampler(valset), batch_size=batch_size,drop_last=False)

# The batch samplers are deliberately passed as `sampler` (not
# `batch_sampler`): the dataset then receives a whole list of indices per
# __getitem__ call, and the loader's default batch_size of 1 wraps the result
# in an extra leading dimension of size 1.
trainloader = DataLoader(trainset, sampler=train_sampler,num_workers=0)
valoader = DataLoader(valset, sampler=val_sampler, num_workers=0)

# Use the first GPU when available; otherwise fall back to the CPU so that
# `device` is always defined (it used to be left undefined without CUDA).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Model, loss and optimizer.
# NOTE(review): the layer list and closing parenthesis of this nn.Sequential
# call are missing from the listing, so the script does not parse as shown;
# restore the layer definitions from the original file.
model2 = nn.Sequential(
# Per-epoch loss history for the training and validation passes.
total_loss = {'train':[],'val':[]}
# Mean-squared error between predicted and target values.
criterion2 = nn.MSELoss()
# Adam over all model parameters with learning rate 0.001.
optimizer2 = torch.optim.Adam(model2.parameters(),lr=0.001)

# Per-epoch mean losses, filled in by the loop below.
total_loss = {'train':[],'val':[]}
num_epochs = 10
for epoch in range(num_epochs):
    # Report the real epoch count (the message used to say "/100").
    print('in epoch {}/{} :'.format(epoch+1, num_epochs))

    # ---- training pass ----
    model2.train()
    losses = []      # collected over the whole epoch, not reset per batch
    pixel_dist = []
    for sample in trainloader:
        # squeeze(0) drops the leading dimension of each loaded batch --
        # presumably the extra size-1 dimension the loader adds; confirm
        # against the DataLoader/sampler setup.
        inputs = sample['image'].squeeze(0).to(dtype = torch.float,device = device)
        target = sample['keypoints'].squeeze(0).to(dtype = torch.float,device = device)
        optimizer2.zero_grad()
        output = model2(inputs)
        loss2 = criterion2(output,target)
        # Without backward() and step() the weights are never updated.
        loss2.backward()
        optimizer2.step()
        losses.append(loss2.item())
        pixel_dist.append(pixel_distance(output, target))
    a = np.mean(losses)
    total_loss['train'].append(a)
    print('train loss = {}'.format(a))
    print('avg training pixel distance = {}'.format(np.mean(pixel_dist)))

    # ---- validation pass (no gradients, no weight updates) ----
    model2.eval()
    losses = []
    pixel_dist = []
    with torch.no_grad():
        for sample in valoader:
            inputs = sample['image'].squeeze(0).to(dtype = torch.float,device = device)
            target = sample['keypoints'].squeeze(0).to(dtype = torch.float,device = device)
            output = model2(inputs)
            loss2 = criterion2(output, target)
            losses.append(loss2.item())
            pixel_dist.append(pixel_distance(output, target))
    a = np.mean(losses)
    total_loss['val'].append(a)
    print('validation loss = {}'.format(a))
    print('avg validatioin pixel distance = {}'.format(np.mean(pixel_dist)))

'''def check_sample(loader=valoader,model=model2,device=device):
device2 = torch.device('cpu')
plots = 16//3
x = next(iter(loader))
y_true = x['keypoints']
y_true = y_true.reshape(16,15,2)
x = x['image'].to(device)
x = x.view(16,1,96,96)
y = model(x)
y = y.reshape(16,15,2).to(device2)
x =

fig,ax = plt.subplots(3,plots)
for i in range(plots):
    for j in range(3):

def __getitem__(self, idx):
    """Load one batch of samples straight from the CSV files.

    ``idx`` is a single index or a list/tensor of indices. Every call re-reads
    both CSV files and keeps only the requested rows, so nothing is held in
    memory between calls.

    Returns:
        ``{'image': ..., 'keypoints': ...}``. Without augmentation the images
        are reshaped to ``(-1, 96, 96)`` and given a second dimension of size
        1; with augmentation the output of ``self.transform`` is returned
        as is.

    NOTE(review): both ``read_csv(...)`` calls were cut off after
    ``header=None,`` in the listing. The ``skiprows`` argument is rebuilt from
    the accompanying description ("skip all rows but the rows of the batch")
    -- confirm against the original code.
    """
    self.idx = idx
    if torch.is_tensor(idx):
        idx = idx.tolist()
    # A set gives O(1) membership tests; read_csv calls `skip` once per row.
    wanted = set(idx) if isinstance(idx, (list, tuple)) else {int(idx)}

    def skip(row):
        return row not in wanted

    # Rows come back in file order (not in the order of `idx`); images and
    # labels are filtered identically, so they stay aligned with each other.
    image = torch.from_numpy(np.array(read_csv('data/Q3/training_images.csv',header=None,
                                               skiprows=skip)))
    keypoints = torch.from_numpy(np.array(read_csv('data/Q3/training_labels.csv',header=None,
                                                   skiprows=skip)))
    if self.transform_vars['is']:
        image, keypoints = self.transform(image, keypoints)
        return {'image':image, 'keypoints':keypoints}
    # This path used to sit behind the return above and was unreachable.
    image = image.reshape((-1,96, 96))
    return {'image':image.unsqueeze(1),'keypoints':keypoints}