Getting an error with a custom dataset when filtering my dataframe down

Hello,

The custom dataset below worked and ran through my model just fine. The model was not doing well, so I cut the types of photos down from 10 to 3, and then the error popped up. When I went back to 10, the data loader ran just fine.

Why would a pared-down data set throw the error below?

KeyError                                  Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2656             try:
-> 2657                 return self._engine.get_loc(key)
   2658             except KeyError:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 4305

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-366-d51e3bafe910> in <module>
----> 1 for image, label, policy, categorical_data in train_loader: #, numerical_data
      2     print(f"""           
      3 
      4           image size is {image.shape}
      5 

~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
    343 
    344     def __next__(self):
--> 345         data = self._next_data()
    346         self._num_yielded += 1
    347         if self._dataset_kind == _DatasetKind.Iterable and \

~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
    383     def _next_data(self):
    384         index = self._next_index()  # may raise StopIteration
--> 385         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    386         if self._pin_memory:
    387             data = _utils.pin_memory.pin_memory(data)

~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

<ipython-input-362-57af70cbdb57> in __getitem__(self, idx)
     26             idx = idx.tolist()
     27 
---> 28         label = self.image_frame.loc[idx, 'target']
     29         pic = Path(self.image_frame.loc[idx,'location'])
     30         img = Image.open(pic)

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1492             except (KeyError, IndexError, AttributeError):
   1493                 pass
-> 1494             return self._getitem_tuple(key)
   1495         else:
   1496             # we by definition only have the 0th axis

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
    866     def _getitem_tuple(self, tup):
    867         try:
--> 868             return self._getitem_lowerdim(tup)
    869         except IndexingError:
    870             pass

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
    986         for i, key in enumerate(tup):
    987             if is_label_like(key) or isinstance(key, tuple):
--> 988                 section = self._getitem_axis(key, axis=i)
    989 
    990                 # we have yielded a scalar ?

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1911         # fall thru to straight lookup
   1912         self._validate_key(key, axis)
-> 1913         return self._get_label(key, axis=axis)
   1914 
   1915 

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
    139             raise IndexingError('no slices here, handle elsewhere')
    140 
--> 141         return self.obj._xs(label, axis=axis)
    142 
    143     def _get_loc(self, key, axis=None):

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
   3583                                                       drop_level=drop_level)
   3584         else:
-> 3585             loc = self.index.get_loc(key)
   3586 
   3587             if isinstance(loc, np.ndarray):

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2657                 return self._engine.get_loc(key)
   2658             except KeyError:
-> 2659                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2660         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2661         if indexer.ndim > 1 or indexer.size > 1:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 4305

Here is my data class:

    from pathlib import Path

    import torch
    from PIL import Image
    from torch.utils.data import Dataset


    class ImageDataset(Dataset):  # class name is a placeholder; the original class statement isn't shown
        '''
        image class data set
        '''
        def __init__(self, data, transform=None):
            '''
            Args:
            ------------------------------------------------------------
                data = dataframe
                image = column in dataframe with absolute path to the image
                label = column in dataframe that is the target classification variable
                numerical_columns = numerical columns from data
                categorical_columns = categorical columns from data
                policy = ID variable

            '''
            self.image_frame = data
            self.transform = transform

        def __len__(self):
            return len(self.image_frame)

        def __getitem__(self, idx):
            if torch.is_tensor(idx):
                idx = idx.tolist()

            # rows are looked up by index label
            label = self.image_frame.loc[idx, 'target']
            pic = Path(self.image_frame.loc[idx, 'location'])
            img = Image.open(pic)
            policy = self.image_frame.loc[idx, 'policy']
            #sample = {'image': img, 'policy': policy, 'label':label}
            #numerical_data = self.image_frame.loc[idx, numerical_columns]

            #numerical_data = torch.tensor(numerical_data, dtype = torch.float)

            if self.transform:
                image = self.transform(img)

            # categorical_columns is expected to be defined globally
            for category in categorical_columns:
                self.image_frame[category] = self.image_frame[category].astype('category')

                self.image_frame[category] = self.image_frame[category].astype('category').cat.codes.values

            #categorical_column_sizes = [len(self.image_frame[column].astype('category')) for column in categorical_columns]
            #categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_column_sizes]

            categorical_data = self.image_frame.loc[idx, categorical_columns]
            categorical_data = torch.tensor(categorical_data, dtype=torch.int64)

            return image, label, policy, categorical_data  #, numerical_data
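
For reference, this is roughly how the dataset gets wrapped in a DataLoader; the file name, transform, batch size, and the contents of categorical_columns below are placeholders for illustration, not the real configuration:

    # Illustrative setup only -- file name, transform, batch size, and the
    # categorical column list are assumptions, not the real configuration.
    import pandas as pd
    from torch.utils.data import DataLoader
    from torchvision import transforms

    categorical_columns = ['Description']      # __getitem__ reads this as a global

    data1 = pd.read_csv('photos.csv')          # columns include target, location, policy

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    train_set = ImageDataset(data1, transform=transform)
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)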

This is the line that throws the error, although running the model throws it as well:

    for image, label, policy, categorical_data in train_loader: #, numerical_data
        print(f"""

              image size is {image.shape}

              categorical_data is {categorical_data.shape}

              """)
        break
        #numeric size is {numerical_data.shape}

Could you explain how you “pared down” the dataset?
The error message points to a KeyError in your pandas DataFrame.
Could you make sure that the key=4305 is valid?

Thank you for the reply. I should have been clearer. I meant the following:

    data = data1[(data1['Description'] == 'A') | (data1['Description'] == 'B')]

Yes, key=4305 is valid if key means that observation. Every time I run the script, the key number changes.
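
To illustrate what that kind of filter leaves behind, here is a toy frame with placeholder values; the point is that boolean filtering keeps the original index labels:

    import pandas as pd

    # Toy stand-in for data1 -- only the filtering pattern matters here.
    data1 = pd.DataFrame({
        'Description': ['A', 'C', 'B', 'C', 'A'],
        'target':      [0,   1,   0,   1,   0],
    })

    data = data1[(data1['Description'] == 'A') | (data1['Description'] == 'B')]
    print(data.index.tolist())   # [0, 2, 4] -- the original labels survive, with gaps
    print(len(data))             # 3 -- but the DataLoader hands __getitem__ positions 0, 1, 2

That mismatch would explain why self.image_frame.loc[idx, 'target'] can ask for a label such as 4305 that no longer exists after filtering, and why the offending key changes from run to run with shuffling.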

Update:

When I save the filtered data set above and read it back into the script, instead of filtering it inside the script, there is no error. Any idea what’s going on or what I’m doing wrong?
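
If the saved copy goes through something like to_csv/read_csv, the round trip hands the frame a fresh 0-to-N-1 index, which is consistent with the index labels being the problem rather than the data itself. A sketch of the in-script equivalent, assuming the same filter as above:

    # Equivalent to the save-and-reload round trip: give the filtered frame a
    # fresh 0..N-1 RangeIndex so the loader's positional indices match .loc labels.
    data = data1[(data1['Description'] == 'A') | (data1['Description'] == 'B')]
    data = data.reset_index(drop=True)

    # Alternative sketch: look rows up by position inside __getitem__, e.g.
    #     row = self.image_frame.iloc[idx]
    # instead of self.image_frame.loc[idx, ...].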