Obtain filenames from Dataloader ("test loader") in k-fold cross validation

Hi,

I was trying to extract filenames along with features from the last fully connected layer FCN of resnet50 i.e. ((2048,)).

But, when I use kfold validation, during testloader, the filenames are repeating after 4 rows, whereas the inputs, targets are unique and changing as expected.

Any idea of how to get the correct filenames for each case in testloader?

Below is the full script file:

Thanks in advance

import os
import torch
from torch import nn
from torch.utils.data import DataLoader, ConcatDataset
from torchvision import transforms
import torchvision
from sklearn.model_selection import KFold
from torchvision import datasets, transforms, models
from tqdm import tqdm
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

transforms = {
    'Data': transforms.Compose([
        transforms.Resize([224,224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    }

# data_dir = '/content/drive/MyDrive/Colab Notebooks/CBIR study/Dataset/map_data/Data'
data_dir = '/content/drive/MyDrive/Colab Notebooks//Dataset/'
dataset = datasets.ImageFolder(data_dir, transforms['Data'])


# data_set_data_dir = DataLoader(dataset=dataset, shuffle=True)
# dataset = torch.utils.data.DataLoader(dataset, shuffle=True)
dataset



if __name__ == '__main__':
  
  # Configuration options
  k_folds = 5
  num_epochs = 5
  loss_function = nn.CrossEntropyLoss()

  # List to save final features with Name, target
  results = []
  
  # Set fixed random number seed
  torch.manual_seed(42)
  
    # Define the K-fold Cross Validator
  kfold = KFold(n_splits=k_folds, shuffle=True)
    
  # Start print
  print('--------------------------------')

  # K-fold Cross Validation model evaluation
  for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    
    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')
    
    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    
    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=4, sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=1, sampler=test_subsampler)
    
    print('\nKfold: {%d}' %(fold+1))
    print('--------------------------------')

    
    # Init the neural network
    #network = SimpleConvNet()
    #network.apply(reset_weights)
    network = models.resnet50(pretrained=True)
    

    #network.apply(reset_weights)
    num_ftrs = network.fc.in_features
    # Here the size of each output sample is set to 3.
    # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
    network.fc = nn.Linear(num_ftrs, 2)

    network = network.to(device)

    
    # Initialize optimizer
    optimizer = torch.optim.Adam(network.parameters(), lr=1e-4)

    
    # Run the training loop for defined number of epochs
    for epoch in range(0, num_epochs):

      # Print epoch
      print(f'Starting epoch {epoch+1}')

      # Set current loss value
      current_loss = 0.0

      # Iterate over the DataLoader for training data
      for i, data in enumerate(trainloader):
        network.train()
        
        # Get inputs
        inputs, targets = data

        inputs = inputs.to(device)
        targets = targets.to(device)

        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform forward pass
        outputs = network(inputs)
        # outputs = outputs.to(device)

        # Compute loss
        loss = loss_function(outputs, targets)
        
        # Perform backward pass
        loss.backward()
        
        # Perform optimization
        optimizer.step()
        
        # Print statistics
        current_loss += loss.item()
        if i % 50 == 49:
            print('Loss after mini-batch %5d: %.3f' %
                  (i + 1, current_loss / 50))
            current_loss = 0.0
            
    # Process is complete.
    print('Training process has finished. Saving trained model.')

    # Print about testing
    print('Starting testing')
    
    # Saving the model

    base = '/content/drive/MyDrive/Colab Notebooks/CBIR study/save_models'
    save_path = base + f'/model-fold-{fold}.pth'
    torch.save(network.state_dict(), save_path)

    # Evaluation for this fold

    ### strip the last layer to use as feature_extractor
    feature_extractor = torch.nn.Sequential(*list(network.children())[:-1])
    feature_extractor = feature_extractor.to(device)

    with torch.no_grad():

      # Iterate over the test data and generate predictions
      for i, data in enumerate(testloader):
        feature_extractor.eval()

        # Get inputs
        inputs, targets = data

        inputs = inputs.to(device)
        targets = targets.to(device)


        sample_fname = testloader.dataset.samples[i][0]
        sample_fname = sample_fname.rsplit('/')[-1]
        # print(i, inputs.shape, targets, sample_fname)

        # Generate outputs
        # outputs = networks(inputs)

        feature_tensor = feature_extractor(inputs) # output now has the features corresponding to input x
        feature_arr = feature_tensor.cpu().detach().numpy().flatten()

        results.append([sample_fname, targets, feature_arr])


flat_list1 = []
flat_list2 = []
for sublist in results:

  flat_list1.append([sublist[0], sublist[1]])
  flat_list2.append(list(sublist[2].flatten()))

flat_list1



Output: 

[['case_0096_ARP0134_0901_LML.jpg', tensor([0], device='cuda:0')],
 ['case_0097_ARP0135_0901_LML.jpg', tensor([1], device='cuda:0')],
 ['case_0098_ARP0136_0901_RML.jpg', tensor([0], device='cuda:0')],
 ['case_0099_ARP0138_0901_RCC.jpg', tensor([0], device='cuda:0')],
 ['case_0096_ARP0134_0901_LML.jpg', tensor([0], device='cuda:0')],
 ['case_0097_ARP0135_0901_LML.jpg', tensor([0], device='cuda:0')],
 ['case_0098_ARP0136_0901_RML.jpg', tensor([0], device='cuda:0')],
 ['case_0099_ARP0138_0901_RCC.jpg', tensor([1], device='cuda:0')],
 ['case_0096_ARP0134_0901_LML.jpg', tensor([1], device='cuda:0')],
 ['case_0097_ARP0135_0901_LML.jpg', tensor([0], device='cuda:0')],
 ['case_0098_ARP0136_0901_RML.jpg', tensor([1], device='cuda:0')],
 ['case_0099_ARP0138_0901_RCC.jpg', tensor([1], device='cuda:0')],
 ['case_0096_ARP0134_0901_LML.jpg', tensor([0], device='cuda:0')],
 ['case_0097_ARP0135_0901_LML.jpg', tensor([1], device='cuda:0')],
 ['case_0098_ARP0136_0901_RML.jpg', tensor([1], device='cuda:0')],
 ['case_0099_ARP0138_0901_RCC.jpg', tensor([0], device='cuda:0')],
 ['case_0096_ARP0134_0901_LML.jpg', tensor([1], device='cuda:0')],
 ['case_0097_ARP0135_0901_LML.jpg', tensor([1], device='cuda:0')],
 ['case_0098_ARP0136_0901_RML.jpg', tensor([0], device='cuda:0')],
 ['case_0099_ARP0138_0901_RCC.jpg', tensor([1], device='cuda:0')]]

In the above output, we can see the filenames are repeating after every four, wheras the target tnesor is updating

Yeah, I’m also facing a similar issue. PyTorch representatives should fix this issue.

As it is, it is difficult to reproduce this output because we don’t know the directory structure that you are working with and there is a lot of code unrelated to the cross-validation/dataloading process. Can you give an example of the directory structure and a minimal example that presents the issue?

data_dir = ‘/content/drive/MyDrive/Colab Notebooks/CBIR study/Dataset/temp’

data_dir (temp) contains two folders, class 0 and class 1 containing `jpg’ images belonging to each class

Are the filenames/labels of the first four examples correct?

The file name representation is right but I do not have a way to verify if first four belong to first four target tensor outputs

The names exist in my original file names

I just verified the cases, the four names here are the first file names in my class 0.

They should all have a label 0.

But I think due to kfold, they are shuffling probably the target label are coming correctly. The file names however are always only this

What happens when you iterate through the dataset without using the k-fold indices to subsample?

I couldn’t reproduce this behavior with a minimal example:

import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold

inpts = torch.randn(32, 3, 224, 224)
tgts = torch.rand(32) > 0.5
dataset = TensorDataset(inpts, tgts)

kfolds = 8
kfold = KFold(n_splits=kfolds, shuffle=True)

for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print(train_ids, test_ids)
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    trainloader = DataLoader(dataset, batch_size=4, sampler=train_subsampler)
    for batch_id, batch in enumerate(trainloader):
        print(type(batch))
        print(batch_id, batch[0][:,0,0,0], batch[1])
    break
FOLD 0
[ 0  1  3  4  5  6  7  8 10 11 12 13 14 15 16 17 18 19 21 22 23 24 25 26
 27 28 29 31] [ 2  9 20 30]
<class 'list'>
0 tensor([-0.4450,  0.6853,  0.8742,  1.1024]) tensor([False,  True, False, False])
<class 'list'>
1 tensor([-0.2127,  0.4061,  1.4086, -0.1292]) tensor([ True, False,  True, False])
<class 'list'>
2 tensor([0.9943, 0.2960, 1.1243, 0.5274]) tensor([False, False,  True,  True])
<class 'list'>
3 tensor([ 1.5628, -1.0033,  1.9009, -1.5863]) tensor([ True, False, False, False])
<class 'list'>
4 tensor([-0.2431,  0.5888, -0.5383, -1.6847]) tensor([False,  True,  True,  True])
<class 'list'>
5 tensor([-0.8152,  1.0058, -0.8156, -1.0477]) tensor([ True, False, False,  True])
<class 'list'>
6 tensor([-0.7050,  1.3312,  1.1899, -0.1762]) tensor([ True, False,  True, False])

Actually, this part of the code looks suspicious:

        # Get inputs
        inputs, targets = data

        inputs = inputs.to(device)
        targets = targets.to(device)


        sample_fname = testloader.dataset.samples[i][0]

If the dataloader shuffles while the underlying dataset is not shuffled, then this explains why the dataset shows the first four filenames each time while the data is correctly shuffled.

If you need to use the filenames along with the image data, you might consider modifying the DatasetFolder class to return the filename along with the raw inputs and targets.
https://pytorch.org/vision/stable/_modules/torchvision/datasets/folder.html#DatasetFolder