Create custom dataset from multiple folders

I have some images organized in folders as shown in the following picture:

image

In order to create a PyTorch DataLoader I defined a custom Dataset in this way

class CustomDataset(Dataset):
  """Dataset over the files found below selected sub-directories of a root.

  The label of a file is the text after the last '_' in its file name, with
  the extension removed.
  """

  def __init__(self, root, dirs=None, transforms=None):
    """Store the configuration; nothing is scanned at construction time.

    Args:
      root: Base directory that each entry of ``dirs`` is joined onto.
      dirs: Iterable of entry names relative to ``root``, or None.
      transforms: Optional callable applied in ``__getitem__``.
    """
    self.root_dir = root
    self.sel_dirs = dirs
    self.transforms = transforms

  def __getFileNr(self, path):
    """Recursively count the regular files below ``path``."""
    count = 0

    for f in os.listdir(path):
      absPathIn = os.path.join(path, f)

      if os.path.isdir(absPathIn):
        # Descend into the sub-directory and add its file count.
        count += self.__getFileNr(absPathIn)
      elif os.path.isfile(absPathIn):
        count += 1

    return count

  def __len__(self):
    """Count the files reachable from the selected entries.

    The file system is walked again on every call.

    NOTE: when no directories were selected this returns None rather than
    an integer, so ``len(dataset)`` raises TypeError in that case.
    """
    if self.sel_dirs == None:
      return

    total_count = 0
    for dir in self.sel_dirs:
      path = os.path.join(self.root_dir, dir)

      # A selected entry may be a single file or a directory tree.
      if os.path.isfile(path):
        total_count += 1
      elif os.path.isdir(path):
        count = 0
        count += self.__getFileNr(path)
        total_count += count

    return total_count

  def __checkDir(self, dirPath):
    """Recursively collect file paths and their labels below ``dirPath``.

    Returns:
      A tuple ``(images, labels)`` of two parallel lists: the joined file
      paths and the label strings parsed from the file names.
    """
    images = []
    labels = []

    for f in os.listdir(dirPath):
      absDirPath = os.path.join(dirPath, f)

      if os.path.isdir(absDirPath):
        # Merge the results of the sub-directory into the flat lists.
        tmpImgs, tmpLabels = self.__checkDir(absDirPath)
        images.extend(tmpImgs)
        labels.extend(tmpLabels)

      elif os.path.isfile(absDirPath):
        images.append(absDirPath)
        # Label = part of the base name (no extension) after the last '_'.
        fileName = os.path.splitext(os.path.split(absDirPath)[1])[0]
        labels.append(fileName.split('_')[-1])

    return (images, labels)

  def __getitem__(self):
    """Collect the paths and labels of every file in the selected entries.

    NOTE: this method takes no index parameter, so ``dataset[i]`` cannot
    call it successfully (the index is passed as an extra argument), and it
    gathers the whole dataset instead of a single sample.

    Returns:
      ``(images, labels)``, or None as soon as a selected entry is not a
      directory.
    """
    images = []
    labels = []

    for dir in self.sel_dirs:
      dirPath = os.path.join(self.root_dir, dir)

      if os.path.isdir(dirPath):
        tmpImgs, tmpLabels = self.__checkDir(dirPath)
        images.extend(tmpImgs)
        labels.extend(tmpLabels)
      else:
        # Any non-directory entry aborts the whole lookup.
        return

    # ``images`` is a list of path strings at this point, not loaded image
    # data; the transform is called once on that whole list.
    if self.transforms and len(images) > 0:
      images = self.transforms(images)

    return images, labels

Everything seems to work, but I'm new to PyTorch and I'm not sure this is a good solution. However, when I try to create the DataLoader, I obtain the following error message:

I defined the function to create the custom Dataset as follows:

def get_dataset(path, dirs):
  """Build a CustomDataset for ``dirs`` under ``path`` with a fixed pipeline.

  The pipeline converts the input to a PIL image, resizes it to 224x224,
  applies a random horizontal flip and converts the result to a tensor.

  Args:
    path: Root directory handed to the dataset as ``root``.
    dirs: Selected sub-directories handed to the dataset as ``dirs``.

  Returns:
    The constructed CustomDataset instance.
  """
  pipeline = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
  ])

  return CustomDataset(root=path, dirs=dirs, transforms=pipeline)

Every image file is named using the format filename_label.jpg.

I understand the error, but I don’t know how to fix it.
Can you help me please?

Your __getitem__ method is used to load and process a single sample in the default use case using the passed index argument.
In your code snippet it seems as if you would like to load and process the entire dataset. Is this indeed the case?

The error is raised in:

images = self.transforms(images)

since you are passing a list of image paths while a tensor or PIL.Image is expected.

I probably didn’t understand how to create a custom dataset and how to use __getitem__.

With your hint I rewrote the code:

class CustomDataset(Dataset):
  """Map-style dataset of image files gathered from selected folders.

  All files below the selected sub-directories of ``root`` are indexed once
  at construction time. The label of a file is the text after the last '_'
  in its name (extension removed) and must be parseable as an integer, e.g.
  ``cat_3.jpg`` -> label 3.
  """

  def __init__(self, root, dirs=None, transform=None):
    """Index the files below every selected directory.

    Args:
      root: Base directory that each entry of ``dirs`` is joined onto.
      dirs: Iterable of directory names relative to ``root``. When None,
        the dataset is empty and ``getStatus()`` returns False.
      transform: Optional callable applied to each loaded image.
    """
    self.status = True

    self.root_dir = root
    self.sel_dirs = dirs
    self.transform = transform

    self.images = []
    self.labels = []
    self.__initItem()

  def getStatus(self):
    """Return True if the directory scan completed without a problem."""
    return self.status

  def __len__(self):
    """Return the number of indexed samples (0 if the index is unusable)."""
    if self.sel_dirs is None:
      return 0

    # Paths and labels must stay in lockstep; treat a mismatch as empty.
    if len(self.images) != len(self.labels):
      return 0

    return len(self.images)

  def getImagesLabels(self):
    """Print every indexed path together with its label."""
    for image, label in zip(self.images, self.labels):
      print(f'{image} - {label}')

  def __checkDir(self, dirPath):
    """Recursively collect file paths and labels below ``dirPath``.

    Returns:
      A tuple ``(images, labels)`` of two flat, parallel lists of strings.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # index -> sample mapping reproducible between runs.
    for entry in sorted(os.listdir(dirPath)):
      entryPath = os.path.join(dirPath, entry)

      if os.path.isdir(entryPath):
        subImages, subLabels = self.__checkDir(entryPath)
        # extend (not append) so that nested folders contribute their
        # items to the flat lists instead of inserting a nested list.
        images.extend(subImages)
        labels.extend(subLabels)
      elif os.path.isfile(entryPath):
        images.append(entryPath)
        # Label = part of the base name (no extension) after the last '_'.
        stem = os.path.splitext(entry)[0]
        labels.append(stem.split('_')[-1])

    return (images, labels)

  def __initItem(self):
    """Scan all selected directories and fill ``images`` / ``labels``.

    On any problem ``status`` is set to False and the dataset stays empty.
    """
    images = []
    labels = []

    if self.sel_dirs is None:
      print(f'No selected directories')
      self.status = False
      return

    for selDir in self.sel_dirs:
      dirPath = os.path.join(self.root_dir, selDir)
      print('path: ' + dirPath)

      if not os.path.isdir(dirPath):
        print(f'Data which is not a folder found into selected directories')
        self.status = False
        return

      tmpImgs, tmpLabels = self.__checkDir(dirPath)
      images.extend(tmpImgs)
      labels.extend(tmpLabels)

    if VERBOSE >= 1:
      print(f'found {len(images)} images and {len(labels)} labels')

    self.images = images
    self.labels = labels

  def __getitem__(self, index):
    """Load and return the sample at ``index``.

    Returns:
      A tuple ``(image, label)``: the image read from disk (transformed if
      a transform was given) and the label as an integer tensor.
    """
    image = io.imread(self.images[index])
    label = torch.tensor(int(self.labels[index]))

    if self.transform:
      image = self.transform(image)

    return (image, label)

Now everything seems to work well.