Pytorch Dataset _getitem__ explosion

Scott_Hoang · April 1, 2019, 12:59am

Hey guys, I need some help with a bizarre question. I have a standard Dataset to load in my data of 4297 instances. when I cal len(dataset) I get 4297. However, when I loop over my dataset instances, I get an index out of bound error. Out of curiosity, I commented out all my getitem code to see how far it would go, and it never stops. I need help!
here is my code:

import os 
import pandas as pd 

import torch
import cv2
import numpy as np
from torch.utils.data import Dataset
from torch import nn, optim
from torch.utils.data import DataLoader

class VideoDataset(Dataset):

    def __init__(self, dataset='iemocap', clip_len=16, leave_one_out=None,emotions=None):

        self.root_dir = 'data'
        self.df = os.path.join('data','iemocap.csv')
        self.clip_len = clip_len
        self.resize_height = 128
        self.resize_width = 171
        self.crop_size = 112

        if not os.path.isfile(self.df):
            data = {"fnames":[], "labels":[]}
            folder = self.root_dir
            for label in sorted(os.listdir(self.root_dir)):
                for fname in os.listdir(os.path.join(folder, label)):
                    data['fnames'].append(os.path.join(folder, label, fname))
                    data['labels'].append(label)
            assert(len(data['fnames'])==len(data['labels']))
            self.df = pd.DataFrame.from_dict(data)
            self.df.to_csv(os.path.join('data','iemocap.csv'), index=False)
        else:
            self.df= pd.read_csv(os.path.join('data','iemocap.csv'))

        if emotions != None:
            total_emotions = set(self.df.labels)
            #check            
            for e in emotions:
                #print(e, total_emotions)
                assert(e in total_emotions)

            emotion_2_elim = list(total_emotions.difference(emotions))
            idx = []
            for e in emotion_2_elim:
                index  = self.df.loc[self.df['labels']==e].index
                self.df = self.df.drop(index).reset_index(drop=True)
        # reindexing
        #self.df.index = range(len(self.df.fnames))

        if type(leave_one_out) != None:
            '''
            1 = Ses01 F
            2 = Ses01 M
            3 = Ses02 F
            4 = Ses02 M
            5 = Ses03 F
            6 = Ses03 M
            7 = Ses04 F
            8 = Ses04 M
            9 = Ses05 F
            10 = Ses05 M
            '''
            key = {1:'Ses01F', 
            2:'Ses01M', 
            3:'Ses02F', 
            4:'Ses02M', 
            5:'Ses03F',
            6:'Ses03M',
            7:'Ses04F',
            8:'Ses04M',
            9:'Ses05F',
            10:'Ses05M'}
            people_to_leave = set([key[i] for i in leave_one_out])
            idx =[]
            for i in self.df.index:
                fname = self.df.iat[i,0]
                fname = os.path.basename(fname)
                session,*_, gender = fname.split('_')
                speaker = session[0:-1]+gender[0]
                if speaker in people_to_leave:
                    idx.append(i)
            self.df = self.df.drop(idx).reset_index(drop=True)

        #reindexing
        #self.df.index = range(len(self.df.fnames))

        self.label2index = {label:index for index,label in enumerate(sorted(set(self.df.labels)))}

        self.label_array = np.array([self.label2index[label] for label in self.df.labels], dtype=int)
        self.fnames = self.df.fnames
        self.labels = self.df.labels

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self,index):
        # Loading and preprocessing.
        assert(index != len(self.df.index))
        buffer = self.load_frames(self.fnames[index])
        buffer = self.crop(buffer, self.clip_len, self.crop_size)
        labels = np.array(self.label_array[index])

        # if self.split == 'test':
        #     # Perform data augmentation
        #     buffer = self.randomflip(buffer)
        buffer = self.normalize(buffer)
        buffer = self.to_tensor(buffer)
        return torch.from_numpy(buffer), torch.from_numpy(labels).long()

    def load_frames(self, file_dir):
        frames = sorted([os.path.join(file_dir, img) for img in os.listdir(file_dir)])
        frame_count = len(frames)
        buffer = np.empty((frame_count, self.resize_height, self.resize_width, 3), np.dtype('float32'))
        for i, frame_name in enumerate(frames):
            frame = np.array(cv2.imread(frame_name)).astype(np.float64)
            buffer[i] = frame

        return buffer

    def crop(self, buffer, clip_len, crop_size):
        # randomly select time index for temporal jittering
        time_index = np.random.randint(buffer.shape[0] - clip_len)

        # Randomly select start indices in order to crop the video
        height_index = np.random.randint(buffer.shape[1] - crop_size)
        width_index = np.random.randint(buffer.shape[2] - crop_size)

        # Crop and jitter the video using indexing. The spatial crop is performed on
        # the entire array, so each frame is cropped in the same location. The temporal
        # jitter takes place via the selection of consecutive frames
        buffer = buffer[time_index:time_index + clip_len,
                 height_index:height_index + crop_size,
                 width_index:width_index + crop_size, :]

        return buffer

    def normalize(self, buffer):
        for i, frame in enumerate(buffer):
            frame -= np.array([[[90.0, 98.0, 102.0]]])
            buffer[i] = frame

        return buffer

    def to_tensor(self, buffer):
        return buffer.transpose((3, 0, 1, 2))

vmirly1 · April 1, 2019, 1:12am

Does the error message indicate which line this out-of-bound happens?

Also, could you put print statements after each line of code in the __getitem__ function? Then, we can find out where this is happening.

Scott_Hoang · April 1, 2019, 1:46am

it happens at assert(index != len(self.df.index)) when index = 4297 which is also the length of self.df.index.
but to more exact, the error happens at buffer=self.load_frames(self.fnames[index]). I added the assertion to track it with pdb

justusschock · April 1, 2019, 6:56am

Index should never be 4297. Since your dataset holds 4297 items, the valid index range is from 0 to 4296, which makes a total of 4297 valid indices.

vmirly1 · April 2, 2019, 3:03am

I see. The way I usually debug these kinds of issues, I put print statements in multiple places to find the inconsistency. So, for example I would check the the size of self.df and self.frames in__getitem__ as well as different parts of __init__ with print('Inside getitem :: ', len(self.df), len(self.frames)).

Perhaps at some point these arrays/lists have different sizes, and that is causing the error.

Scott_Hoang · April 2, 2019, 3:36am

Hence the explosion. Might index exceeded what I can call from len(dataset) .