Getting loss 0.000 and keyerror while inferencing

my train set contains 51000 (3 channel) images.i am trying to get those images for training from csv file / dataframe containing 2 columns images and label. for example : traindataset.loc[0][0] means ‘/kaggle/input/alaska2-image-steganalysis/UERD/00155.jpg’ which is first image directory and traindataset.loc[0][1] means ‘1’ which is label of that image , there exist 2 labels (1,0) so it’s a binary classification problem
but i was unable to figure out if there exist any bug in my code or not.
here is my dataloader for train set :

class decode_images(Dataset):

    def __init__(self, file):

        self.data = file

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #print(idx)
        img_name =  self.data.loc[idx][0]
        image = Image.open(img_name)
        image = image.resize((512, 512), resample=Image.BILINEAR)
        label = self.data.loc[idx][1] #torch.tensor(self.data.loc[idx, 'label'])
        return {'image': transforms.ToTensor()(image),
                'label': label
                }

train_dataset = decode_images(traindataset)  #traindataset is a dataframe containing images and labels(0,1)

# simple model

model = torchvision.models.resnet101(pretrained=False)
model.load_state_dict(torch.load("../input/pytorch-pretrained-models/resnet101-5d3b4d8f.pth"))
num_features = model.fc.in_features
model.fc = nn.Linear(2048, 1)
device = torch.device("cuda:0")
#device = torch.device("cpu")
model = model.to(device)

data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)

plist = [
         {'params': model.layer4.parameters(), 'lr': 1e-4, 'weight': 0.001},
         {'params': model.fc.parameters(), 'lr': 1e-3}
         ]

optimizer = optim.Adam(plist, lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10)


since = time.time()
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 1
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)
    scheduler.step()
    model.train()
    running_loss = 0.0
    tk0 = tqdm(data_loader, total=int(len(data_loader)))
    counter = 0
    for bi, d in enumerate(tk0):
        inputs = d["image"]
        labels = d["label"].view(-1, 1)
        inputs = inputs.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.float)
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            #loss = criterion(outputs, labels)
            loss = criterion(outputs, torch.max(labels, 1)[1])
            #print(loss)
            loss.backward()
            optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        counter += 1
        tk0.set_postfix(loss=(running_loss / (counter * data_loader.batch_size)))
    epoch_loss = running_loss / len(data_loader)
    print('Training Loss: {:.4f}'.format(epoch_loss))

time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
torch.save(model.state_dict(), "model.bin")

with the code above i get this output :

Training Loss: 0.0000
Training complete in 34m 53s

so my question is why loss : 0.0000

then i tried to make test set prediction like this :


class decode_images(Dataset):

    def __init__(self, csv_file):

        self.data = csv_file#pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #print(idx)
        img_name =  self.data.loc[idx][0]
        image = Image.open(img_name)
        image = image.resize((512, 512), resample=Image.BILINEAR)
        #label = self.data.loc[idx][1] #torch.tensor(self.data.loc[idx, 'label'])
        #image = self.transform(image)
        return {'image': image}

test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=False) # test_set contains only images directory

for param in model.parameters():
    param.requires_grad = False


tk0 = tqdm(test_loader)
for i, x_batch in enumerate(tk0):
    print(i)
    print(x_batch)
    x_batch = x_batch["image"]
    pred = model(x_batch.to(device))
    sub.Label[i] = pred

and now i get this error : 

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-41-fd6bbd63a0bb> in <module>
      1 tk0 = tqdm(test_loader)
----> 2 for i, x_batch in enumerate(tk0):
      3     print(i)
      4     print(x_batch)
      5     x_batch = x_batch["image"]

/opt/conda/lib/python3.7/site-packages/tqdm/notebook.py in __iter__(self, *args, **kwargs)
    216     def __iter__(self, *args, **kwargs):
    217         try:
--> 218             for obj in super(tqdm_notebook, self).__iter__(*args, **kwargs):
    219                 # return super(tqdm...) will not catch exception
    220                 yield obj

/opt/conda/lib/python3.7/site-packages/tqdm/std.py in __iter__(self)
   1106                 fp_write=getattr(self.fp, 'write', sys.stderr.write))
   1107 
-> 1108         for obj in iterable:
   1109             yield obj
   1110             # Update and possibly print the progressbar.

/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
    343 
    344     def __next__(self):
--> 345         data = self._next_data()
    346         self._num_yielded += 1
    347         if self._dataset_kind == _DatasetKind.Iterable and \

/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
    383     def _next_data(self):
    384         index = self._next_index()  # may raise StopIteration
--> 385         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    386         if self._pin_memory:
    387             data = _utils.pin_memory.pin_memory(data)

/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2798             if self.columns.nlevels > 1:
   2799                 return self._getitem_multilevel(key)
-> 2800             indexer = self.columns.get_loc(key)
   2801             if is_integer(indexer):
   2802                 indexer = [indexer]

/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

CrossEntropyLoss is used for a multi-class classification and expects the model output to have the shape [batch_size, nb_classes] and the target [batch_size] containing the class indices.

In your case the last linear layer has a single output unit, which would define nb_classes = 1.
Your target might therefore only contain zeros?

In this setup your model won’t learn anything, as it only needs to output a high value for the single output unit to predict the “right” class0.

Could you explain your use case a bit, i.e. how many classes are you dealing with and what does target contain?

@ptrblck my target contains 0 and 1 and i solved above problem with nn.BCEWithLogitsLoss instead of CrossEntropyLoss but my 2nd problem is during inference,i still get the same error while inferencing or making prediction using this model,please help with 2nd part of this post,thanks

@ptrblck my updated code for making prediction on test set, i still get the same error as before,please check the code :

class decode_images(Dataset):

    def __init__(self, csv_file):

        self.data = csv_file

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name =  self.data.loc[idx][0]
        image = Image.open(img_name)
        image = image.resize((512, 512), resample=Image.BILINEAR)
        #image = self.transform(image)
        return {'image': transforms.ToTensor()(image)}

test_loader = torch.utils.data.DataLoader(testdataset, batch_size=1, shuffle=False) # test_set contains only images directory

for param in model.parameters():
    param.requires_grad = False


tk0 = tqdm(test_loader)
for i, x_batch in enumerate(tk0):
    print(i)
    print(x_batch)
    x_batch = x_batch["image"]
    pred = model(x_batch.to(device))
    sub.Label[i] = pred

I don’t know if this helps, but when I had the zero loss problem, it was due to not normalizing the input image data before transforming it into a tensor. This may or may not be your solution.

PyTorch has the below feature to normalize input values.
](https://pytorch.org/docs/stable/_modules/torchvision/transforms/transforms.html#Normalize)
``

@ptrblck @7029279

i just solved error but unfortunately my models are predicting class like 0,1,2,3,4 but i wanted it to generate confident score for each image like 0.7345, 0.48045 etc
i think it’s because of the loss function i am using,is there anything wrong in my code? how can i get this model to generate confident score? should i try MSEloss or crossentropy? but if i use crossentropy and do loss = criterion(outputs, torch.max(labels, 1)[1]) then i get loss = 0.000 any help?

If you replaced the last linear layer with model.fc = nn.Linear(2048, 1), I’m not sure how you are getting class predictions in the range [0, 4], as the model output would be a single value for each sample.

Could you explain how you get these predictions, please?

As explained before:

@ptrblck please check this : https://www.kaggle.com/mobassir/pytorch-transfer-learning-baseline

in version 1 of that notebook you can see after prediction predicted values other than 0 and 1 in submission file and my linear layer is of shape (2048, 1).

I hope this will help you understand the error for sure CrossEntropyLoss

just solved the issue,model was fine,it the pandas dataframe column that was of int type,sorry my bad. thanks for your support and help mates <3

1 Like

A KeyError means the key you gave pandas isn’t valid. Before doing anything with the data frame, use print(df.columns) to see what keys are available.

print(df.columns)

I was getting a similar kind of error in one of my codes. Turns out, that particular index was missing from my data frame as I had dropped the empty dataframe rows. If this is the case, you can do df.reset_index(inplace=True) and the error should be resolved.