KeyError : DataLoader Error

Ayush_Agarwal · December 11, 2018, 10:41pm

class Planet(Dataset):
def init(self,df,img_path,img_ext,transform=None):

    df['image_name'].apply(lambda x:os.path.isfile(img_path + x + img_ext)).all()
    self.mlb=MultiLabelBinarizer()
    self.img_path=img_path
    self.img_ext=img_ext
    self.transform=transform
    self.Xtrain=df['image_name'] + self.img_ext
    self.ytrain=self.mlb.fit_transform(df['tags'].str.split()).astype(np.float32)
    
def __getitem__(self,index):
    
    img=Image.open(os.path.join(self.img_path , self.Xtrain[index] )).convert('RGB')
    
    if self.transform is not None:
        img=self.transform(img)
    label=torch.from_numpy(self.ytrain[index])
    return img,label
def __len__(self):
    return len(self.Xtrain.index)

def dataloader(df,img_path,img_ext,shuffle=False,batch_size=64,transform=None):
if transform==True:
tns=transforms.Compose([

                      transforms.ToTensor(),
                      ])
else:
    tns=None
data=Planet(df,img_path,img_ext,transform=tns)
return DataLoader(data,shuffle=shuffle,batch_size=batch_size,pin_memory=use_cuda,num_workers=8)

KeyError Traceback (most recent call last)
in ()
----> 1 model_ft=train(num_epochs=5,model=model,datasets=datasets)

in train(num_epochs, model, datasets)
16
17 # Iterate over data.
—> 18 for inputs, labels in datasets[phase]:
19 inputs = inputs.to(device)
20 labels = labels.to(device)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py in next(self)
334 self.reorder_dict[idx] = batch
335 continue
–> 336 return self._process_next_batch(batch)
337
338 next = next # Python 2 compatibility

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _process_next_batch(self, batch)
355 self._put_indices()
356 if isinstance(batch, ExceptionWrapper):
–> 357 raise batch.exc_type(batch.exc_msg)
358 return batch
359

KeyError: ‘Traceback (most recent call last):\n File “/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py”, line 106, in _worker_loop\n samples = collate_fn([dataset[i] for i in batch_indices])\n File “/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/utils/data/dataloader.py”, line 106, in \n samples = collate_fn([dataset[i] for i in batch_indices])\n File “”, line 14, in getitem\n img=Image.open(os.path.join(self.img_path , self.Xtrain[index] )).convert(‘RGB’)\n File “/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/series.py”, line 623, in getitem\n result = self.index.get_value(self, key)\n File “/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/pandas/core/indexes/base.py”, line 2560, in get_value\n tz=getattr(series.dtype, ‘tz’, None))\n File “pandas/_libs/index.pyx”, line 83, in pandas._libs.index.IndexEngine.get_value\n File “pandas/_libs/index.pyx”, line 91, in pandas._libs.index.IndexEngine.get_value\n File “pandas/_libs/index.pyx”, line 139, in pandas._libs.index.IndexEngine.get_loc\n File “pandas/_libs/hashtable_class_helper.pxi”, line 811, in pandas._libs.hashtable.Int64HashTable.get_item\n File “pandas/_libs/hashtable_class_helper.pxi”, line 817, in pandas._libs.hashtable.Int64HashTable.get_item\nKeyError: 5195\n’

Can someone help me to Solve the error occurred

rasbt · December 12, 2018, 1:40am

Looks like a pandas error to me, i.e., I think you are trying to look up a column that doesn’t exist. Have you checked that self.ytrain[index] is a numpy array? otherwise, I think in pandas you need to use sth like self.ytrain.irow[index]

vmirly1 · December 12, 2018, 2:52am

In the above quoted lines, df remains as a pandas DataFrame, therefore, self.Xtrain will be Pandas-Series. I tried the following with a similar dataframe:

>>> img_ext = '.jpg'
>>> X = df['filename'] + img_ext
>>> print(X[:5])
0    353640_00M22.jpg
1    353640_00M22.jpg
2    353640_00M22.jpg
3    353640_00M22.jpg
4    353640_00M22.jpg
Name: filename, dtype: object


>>> type(X)
pandas.core.series.Series

So, if you want to change that, you can use df[image_name].values + img_ext:

>>> X = df['filename'].values + img_ext
>>> type(X)
numpy.ndarray

Also, note that this line does not o any thing:

df['image_name'].apply(lambda x:os.path.isfile(img_path + x + img_ext)).all()

If you want, you have to assign it:

df['image_name'] = df['image_name'].apply(lambda x:os.path.isfile(img_path + x + img_ext)).all()