IndexError on Training

I am training a CNN with transfer learning on ResNet-50. My script was working until I made some modifications: I manually split my dataset into train and validation folders, and I set up per-sample weights on the training dataset since the classes are imbalanced. Now I get an error when I run training. Here is my code and the error:

import time
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import datasets, models, transforms

train_data_dir = '/home/ec2-user/SageMaker/data/frame_150/train'
val_data_dir = '/home/ec2-user/SageMaker/data/frame_150/val'
batch_size = 100
data_transforms = {
    'train': transforms.Compose([transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]),
    'val': transforms.Compose([transforms.ToTensor(),
                               transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
}
image_datasets = {'train': datasets.ImageFolder(train_data_dir, data_transforms['train']),
                  'val': datasets.ImageFolder(val_data_dir, data_transforms['val'])}
def make_weights_for_balanced_classes(images, nclasses):
    # Count how many samples belong to each class
    count = [0] * nclasses
    for item in images:
        count[item[1]] += 1
    # A class's weight is inversely proportional to its frequency
    weight_per_class = [0.] * nclasses
    N = float(sum(count))
    for i in range(nclasses):
        weight_per_class[i] = N / float(count[i])
    # Assign each sample the weight of its class
    weight = [0] * len(images)
    for idx, val in enumerate(images):
        weight[idx] = weight_per_class[val[1]]
    return weight

weights = make_weights_for_balanced_classes(image_datasets['train'].imgs, len(image_datasets['train'].classes))
weights = torch.DoubleTensor(weights)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, sampler=sampler, num_workers=4)
               for x in ['train', 'val']}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)                    
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

model_ft = models.resnet50(pretrained=True)
num_ftrs = model_ft.fc.in_features
# Here the size of each output sample is set to 2.
# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
model_ft.fc = nn.Linear(num_ftrs, 2)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=6)

ERROR:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-29-c677c5f760af> in <module>()
      1 model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
----> 2                        num_epochs=6)

<ipython-input-26-6f4f613da14c> in train_model(model, criterion, optimizer, scheduler, num_epochs)
     20 
     21             # Iterate over data.
---> 22             for inputs, labels in dataloaders[phase]:
     23                 inputs = inputs.to(device)
     24                 labels = labels.to(device)

~/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
    343 
    344     def __next__(self):
--> 345         data = self._next_data()
    346         self._num_yielded += 1
    347         if self._dataset_kind == _DatasetKind.Iterable and \

~/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _next_data(self)
    854             else:
    855                 del self._task_info[idx]
--> 856                 return self._process_data(data)
    857 
    858     def _try_put_index(self):

~/.local/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _process_data(self, data)
    879         self._try_put_index()
    880         if isinstance(data, ExceptionWrapper):
--> 881             data.reraise()
    882         return data
    883 

~/.local/lib/python3.6/site-packages/torch/_utils.py in reraise(self)
    392             # (https://bugs.python.org/issue2651), so we work around it.
    393             msg = KeyErrorMessage(msg)
--> 394         raise self.exc_type(msg)

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ec2-user/.local/lib/python3.6/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ec2-user/.local/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/ec2-user/.local/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/ec2-user/.local/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 137, in __getitem__
    path, target = self.samples[index]
IndexError: list index out of range

Can you try setting drop_last=True?

DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
           batch_sampler=None, num_workers=0, collate_fn=None,
           pin_memory=False, drop_last=False, timeout=0,
           worker_init_fn=None)

https://pytorch.org/docs/stable/data.html

  • drop_last (bool, optional) – set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of the dataset is not divisible by the batch size, then the last batch will be smaller. (default: False)
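
For reference, a minimal sketch of that change applied to your dict comprehension (everything else in your setup assumed unchanged):

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x],
                                               batch_size=batch_size,
                                               sampler=sampler,
                                               num_workers=4,
                                               drop_last=True)  # drop the last incomplete batch
               for x in ['train', 'val']}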

Hi, thanks for your reply! I encountered the same problem, and setting drop_last to True really did work. Could you please explain why this change fixes the error? I have always used drop_last=False, and this is the first time this error has occurred.