DataLoader and custom dataloader size mismatch

Hi, I have written a custom Dataset class to load my data, but when I check the sizes, the numbers differ once a batch_size is specified.

import os

import cv2
import pandas as pd
import torch
from torch.utils import data
from torch.utils.data import DataLoader
from torchvision import transforms

class FlowerDataset(data.Dataset):
    def __init__(self, root_dir, text_file, transform=None, target_transform=None):
        self.root_dir = root_dir
        self.text_file = text_file
        # Each line of the text file is a Windows-style path; keep the last two
        # components ("class\filename") as the name and the folder as the label.
        paths = pd.read_csv(self.root_dir + self.text_file).iloc[:, 0]
        self.name = paths.apply(lambda x: '\\'.join(x.split('\\')[-2:]))
        self.label = paths.apply(lambda x: x.split('\\')[-2])
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.name)

    def __getitem__(self, index):
        img_name = os.path.join(self.root_dir + self.text_file.split('.')[0],
                                self.name[index])
        label = self.label[index]
        img = cv2.imread(img_name)  # note: OpenCV loads images in BGR channel order

        if self.transform is not None:
            img = self.transform(img)

        label = torch.tensor(int(label))
        return img, label

    def __repr__(self):
        return "{}({} {})".format(self.__class__.__name__,
                                  self.root_dir,
                                  self.transform)

a = FlowerDataset(root_path, '\\train.txt', transform=transforms.Compose([
    transforms.ToPILImage(mode='RGB'),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
]))
# named `loader` so it does not shadow the `data` module imported above
loader = DataLoader(a, batch_size=64, shuffle=True)
print(len(loader) * 64, len(a))
>>> 6592 6551
loader = DataLoader(a, batch_size=1, shuffle=True)
print(len(loader), len(a))
>>> 6551 6551

Can anyone tell me what the problem could be?

The number of samples in your Dataset is not divisible by the batch size without a remainder, so your last batch will contain fewer samples than specified by batch_size. len(DataLoader) counts batches, i.e. ceil(len(dataset) / batch_size): here ceil(6551 / 64) = 103 batches, and 103 * 64 = 6592, with the last batch holding only 6551 - 102 * 64 = 23 samples.
If you want to get rid of this smaller batch, pass drop_last=True to your DataLoader.
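
A minimal sketch of both behaviors, reusing the dataset `a` from your code:

import math
from torch.utils.data import DataLoader

loader = DataLoader(a, batch_size=64, shuffle=True)
# len(loader) is the number of batches: ceil(len(dataset) / batch_size)
print(len(loader), math.ceil(len(a) / 64))   # 103 103

loader = DataLoader(a, batch_size=64, shuffle=True, drop_last=True)
# with drop_last=True the incomplete final batch is discarded
print(len(loader), len(loader) * 64)         # 102 6528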
