Iterating through a customized DatasetFolder does not work

I have a lot of images with .gif and .octet-stream extensions. Since ImageFolder ignores those files, I use DatasetFolder and provide my own image extensions and loader, as suggested in other threads on this forum. I create a dataloader and try to iterate through it. Unfortunately, it gets stuck somewhere forever.

The folder structure is as follows. I have attached all images except ‘01.octet-stream’, as this forum does not support that file type.

# debug/0/01.octet-stream
# debug/0/02.jpeg
# debug/0/03.gif

# debug/1/01.octet-stream
# debug/1/02.jpeg
# debug/1/03.gif

Here is my code; you can run it directly in a notebook.

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import torch.nn.functional as F
import time
import os
import copy
from torchvision.datasets import ImageFolder

plt.ion()
data_transforms = transforms.Compose([transforms.Resize(300),
        transforms.CenterCrop(299),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

img_extensions = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.gif', '.octet-stream']
def my_loader(path):
    from torchvision import get_image_backend
    from torchvision.datasets.folder import accimage_loader
    from PIL import Image

    def my_pil_loader(path):
        print("loading {}".format(path))
        with open(path, 'rb') as f:
            img = Image.open(f)
            return img.convert('RGB')

    if get_image_backend() == 'accimage':
        print('{} uses accimage'.format(path))
        try:
            return accimage_loader(path)
        except IOError:
            print('{} accimage loading failed, falling back to PIL'.format(path))
            return my_pil_loader(path)
    else:
        print('{} uses PIL'.format(path))
        return my_pil_loader(path)

my_loader('./debug/0/03.gif')

data_dir = './debug/'
batch_size = 32

image_datasets = datasets.DatasetFolder(data_dir, my_loader, img_extensions,
                                          data_transforms)
dataloaders = torch.utils.data.DataLoader(image_datasets, batch_size=batch_size,
                                             shuffle=True, num_workers=4)

dataset_sizes = len(image_datasets)
print(dataset_sizes)

Everything works fine so far. However, when I try to iterate through the dataloader with the following code, the program gets stuck forever! The code seems to run into a deadlock somewhere even before loading the images, as I do not see any of the loader's print output. What is wrong with my implementation?

If I replace DatasetFolder with ImageFolder and get rid of the customized loader and extensions, everything works fine. Very weird…

index = 0
for inputs, labels in dataloaders:
    print(index)
    print('inputs')
    print(inputs.size())
    print('labels')
    print(labels.size())
    index += 1


Does your Dataset work without the DataLoader, i.e. do you get a valid sample using this code:

data = image_dataset[0]

If so, could you set num_workers=0 in your DataLoader and try again?
I would like to narrow down the possible error source first.


Hi Patrick

image_datasets[0] gives

./debug/0\01.octet-stream uses PIL
loading ./debug/0\01.octet-stream
(tensor([[[ 0.2967,  0.0912,  0.1254,  ..., -0.3369, -0.2513, -0.2856],
          [ 0.2111,  0.2282,  0.1768,  ..., -0.4054, -0.3369, -0.3198],
          [ 0.1939,  0.2796,  0.2282,  ..., -0.5596, -0.4568, -0.2684],
          ...,
          [-1.5870, -1.5014, -1.5699,  ..., -1.6555, -1.8097, -1.8439],
          [-1.6727, -1.5699, -1.5699,  ..., -1.9809, -1.9295, -1.7754],
          [-1.6384, -1.5185, -1.5528,  ..., -1.7069, -1.6898, -1.6042]],
 
         [[ 0.5728,  0.6604,  0.7304,  ..., -0.0924, -0.1099, -0.0749],
          [ 0.7304,  0.6254,  0.6954,  ..., -0.1975, -0.1450, -0.1099],
          [ 0.7304,  0.7129,  0.7479,  ..., -0.3025, -0.1800, -0.0924],
          ...,
          [-1.1253, -1.1429, -1.1429,  ..., -1.6331, -1.5630, -1.5980],
          [-1.0728, -1.1779, -1.2654,  ..., -1.4755, -1.5455, -1.5980],
          [-1.1604, -1.1429, -1.1604,  ..., -1.5980, -1.6155, -1.6155]],
 
         [[ 0.7054,  0.6356,  0.6531,  ...,  0.0605,  0.0431,  0.0256],
          [ 0.7576,  0.7402,  0.7228,  ...,  0.0605,  0.0431, -0.0092],
          [ 0.7576,  0.7925,  0.7925,  ..., -0.0267, -0.0615, -0.0441],
          ...,
          [-0.7064, -0.6715, -0.6541,  ..., -1.2990, -1.2467, -1.2467],
          [-0.7936, -0.6541, -0.6193,  ..., -1.3513, -1.3164, -1.1073],
          [-0.8284, -0.7064, -0.7064,  ..., -1.2816, -1.2990, -1.2119]]]), 0)

The issue seems to be in the dataloader iterator.

dataloader_iter = dataloaders.__iter__()
dataloader_iter.pin_memory

gives

False

It gets stuck when I run:

dataloader_iter.data_queue.get()

If I set num_workers = 0, I get the following error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-26-34fa306d18e2> in <module>()
----> 1 dataloader_iter._get_batch()

~\AppData\Local\Continuum\anaconda3\lib\site-packages\torch\utils\data\dataloader.py in _get_batch(self)
    307                 raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
    308         else:
--> 309             return self.data_queue.get()
    310 
    311     def __next__(self):

AttributeError: '_DataLoaderIter' object has no attribute 'data_queue'

Does the code work if you just set num_workers=0 and fetch the samples in a loop?
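
For example, a minimal single-process check could look like this (just a sketch reusing image_datasets and batch_size from your snippet; debug_loader is only an illustrative name):

# Single-process check: with num_workers=0 no worker processes are spawned,
# every sample is loaded in the main process, and the loader's print
# statements should show up immediately.
debug_loader = torch.utils.data.DataLoader(image_datasets, batch_size=batch_size,
                                           shuffle=False, num_workers=0)
for inputs, labels in debug_loader:
    print(inputs.size(), labels.size())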


Yes, it works! That’s amazing. Why does setting num_workers = 0 resolve the problem, while the code deadlocks if num_workers != 0?

BTW, I am running this code in a Jupyter notebook on Windows with PyTorch 0.4.1 and Python 3.6.5.

Great to hear it’s working now!
Multiprocessing is implemented a bit differently on Windows; e.g., it uses spawn instead of fork.
That means you should guard your code with:

if __name__=='__main__':
    main()

You can read more about these differences in the Windows FAQ.

I’m not sure how Jupyter notebooks should be handled on Windows machines, but I guess it might be related to the multiprocessing part.
Could you try to export your notebook as a Python script, add the guard, and run it again using more workers?


Hi Patrick

Yes, it indeed works when I rewrite the code with the if guard as shown below. Thanks!

My question is why the issue occurs only when I use the customized loader with DatasetFolder. If I use ImageFolder with the default loader, there is no deadlock even without the if guard.


# coding: utf-8

# In[ ]:


from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import torch.nn.functional as F
import time
import os
import copy
import sys
from torchvision.datasets import ImageFolder

plt.ion()


# In[ ]:


def my_loader(path):
    from torchvision import get_image_backend
    from torchvision.datasets.folder import accimage_loader
    from PIL import Image

    def my_pil_loader(path):
        print("loading {}".format(path))
        try:
            with open(path, 'rb') as f:
                img = Image.open(f)
                return img.convert('RGB')
        except Exception:
            print('failed to load {} using PIL'.format(path))

    if get_image_backend() == 'accimage':
        print('loading {} uses accimage'.format(path))
        try:
            return accimage_loader(path)
        except IOError:
            print('failed to load {} using accimage, falling back to PIL'.format(path))
            return my_pil_loader(path)
    else:
        print('{} uses PIL'.format(path))
        return my_pil_loader(path)


# In[ ]:


def main():
    data_transforms = transforms.Compose([transforms.Resize(300),
            transforms.CenterCrop(299),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
    
    img_extensions = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.gif', '.octet-stream']
    
    data_dir = './debug/'
    batch_size = 32

    image_datasets = datasets.DatasetFolder(data_dir, my_loader, img_extensions,
                                              data_transforms)
    dataloaders = torch.utils.data.DataLoader(image_datasets, batch_size=batch_size, shuffle=True, num_workers=4)
    
    index = 0
    for inputs, labels in dataloaders:
        print(index)
        print('inputs')
        print(inputs.size())
        print('labels')
        print(labels.size())
        index += 1


# In[ ]:


if __name__ == '__main__':
    main()


I’m not really sure what makes the DataLoader hang, but it’s good to hear it’s working now!
Your custom loader looks more or less like the default pil_loader, apart from the try block.

However, since Windows doesn’t use fork to create new workers, it can be problematic if you don’t use a “main guard”, because all module-level code is executed again in every child process.
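
As an illustration of that last point, here is a minimal sketch (the file name and print messages are purely hypothetical) of how spawn re-runs module-level code in every worker, while the guarded part runs only in the main process:

# spawn_demo.py -- hypothetical file name, run as a script on Windows
import torch
from torch.utils.data import DataLoader, TensorDataset

# Module-level code: with the spawn start method every worker re-imports
# this module, so this line is printed once per worker as well.
print('module-level code is running')

dataset = TensorDataset(torch.arange(10).float())

if __name__ == '__main__':
    # Guarded code: only the main process creates the DataLoader and
    # iterates over it, so the workers never reach this block.
    loader = DataLoader(dataset, batch_size=2, num_workers=2)
    for (batch,) in loader:
        print(batch)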
