DataLoader always fails with “Interrupted system call”

Hi,

The “RuntimeError: Interrupted system call at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/libshm/core.cpp:125” always happens.

Could you please help me solve this problem? The details to reproduce the error are as follows:

DETAILS:

I have a dataset MC, which is defined as:

import os
import sys
import pickle
import random

import numpy as np
import torch.utils.data as data
from PIL import Image


class MC(data.Dataset):
   
    base_folder = 'mc'
    train_data = 'train'
    valid_data = 'valid'
    test_data = 'test'

    def __init__(self, root, mode=0, random_seed=1234, transform=None,
                 target_transform=None
                 ):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.mode = mode
        assert mode in [0, 1, 2]  # 0: train_set, 1: valid_set, 2:test_set
        if mode == 0:
            f = self.train_data
        elif mode == 1:
            f = self.valid_data
        else:
            f = self.test_data
        random.seed(random_seed)

        # now load the picked numpy arrays
        self.maps = {}  # the indices for each classes

        file = os.path.join(self.root, self.base_folder, f)
        fo = open(file, 'rb')
        if sys.version_info[0] == 2:
            entry = pickle.load(fo)
        else:
            entry = pickle.load(fo, encoding='latin1')
        self.data = entry['data']
        self.labels = entry['labels']
        fo.close()

        self.data = self.data.reshape((-1, 3, 32, 32))
        self.data = self.data.transpose((0, 2, 3, 1))

        class_ids = list(set(self.labels))
        labels = np.array(self.labels)
        for c_id in class_ids:
            ids = np.where(labels == c_id)[0].tolist()
            random.shuffle(ids)
            self.maps[c_id] = ids

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], self.labels[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.data)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        if self.mode == 0:
            tmp = 'train'
        elif self.mode == 1:
            tmp = 'valid'
        else:
            tmp = 'test'
        fmt_str += '    Split: {}\n'.format(tmp)
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str

And I have 3 queues, ‘train_queue’, ‘valid_queue’ and ‘test_queue’ defined as follows:

import dataset.mc as dset
import utils
import torch
import torchvision.transforms as transforms
import numpy as np

CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])
train_data = dset.MC(root="../data", mode=0, transform=train_transform)
valid_data = dset.MC(root="../data", mode=1, transform=train_transform)
test_data = dset.MC(root="../data", mode=2, transform=train_transform)

train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
        pin_memory=True, num_workers=1
    )
valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
        pin_memory=True, num_workers=1
    )

test_queue = torch.utils.data.DataLoader(
        test_data, batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
        pin_memory=True, num_workers=1
    )

Each time I load data from the queues, I specify the indices of the samples with set_task:

def set_task(n_way, candidate_set, train_data, valid_data, test_data, train_queue, valid_queue, test_queue):
    task = np.random.choice(candidate_set, n_way, replace=False).tolist()
    train_indices = sum([train_data.maps[c] for c in task], [])
    valid_indices = sum([valid_data.maps[c] for c in task], [])
    test_indices = sum([test_data.maps[c] for c in task], [])
    train_queue.sampler.indices = train_indices
    valid_queue.sampler.indices = valid_indices
    test_queue.sampler.indices = test_indices
    hashmap = dict([(v, k) for k, v in enumerate(task)])
    return task, hashmap

To reproduce the error, just run

i = 0
while True:
    set_task(3, [1,2,3,4,5], train_data, valid_data, test_data, train_queue, valid_queue, test_queue)
    for _ in range(100):
        input, target = next(iter(train_queue))
    i += 1
    print i

And after several epochs (not always the same epoch), the error happens:

1
2
3
...
120
121
122
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-15-02d188eb8764> in <module>()
      6     set_task(3, [1,2,3,4,5], train_data, valid_data, test_data, train_queue, valid_queue, test_queue)
      7     for _ in range(100):
----> 8         input, target = next(iter(train_queue))
      9     i += 1
     10     print i

/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in __next__(self)
    278         while True:
    279             assert (not self.shutdown and self.batches_outstanding > 0)
--> 280             idx, batch = self._get_batch()
    281             self.batches_outstanding -= 1
    282             if idx != self.rcvd_idx:

/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in _get_batch(self)
    257                 raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
    258         else:
--> 259             return self.data_queue.get()
    260 
    261     def __next__(self):

/Users/yin.zheng/anaconda/lib/python2.7/multiprocessing/queues.pyc in get()
    376             racquire()
    377             try:
--> 378                 return recv()
    379             finally:
    380                 rrelease()

/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/multiprocessing/queue.pyc in recv(self)
     20     def recv(self):
     21         buf = self.recv_bytes()
---> 22         return pickle.loads(buf)
     23 
     24     def __getattr__(self, name):

/Users/yin.zheng/anaconda/lib/python2.7/pickle.pyc in loads(str)
   1386 def loads(str):
   1387     file = StringIO(str)
-> 1388     return Unpickler(file).load()
   1389 
   1390 # Doctest

/Users/yin.zheng/anaconda/lib/python2.7/pickle.pyc in load(self)
    862             while 1:
    863                 key = read(1)
--> 864                 dispatch[key](self)
    865         except _Stop, stopinst:
    866             return stopinst.value

/Users/yin.zheng/anaconda/lib/python2.7/pickle.pyc in load_reduce(self)
   1137         args = stack.pop()
   1138         func = stack[-1]
-> 1139         value = func(*args)
   1140         stack[-1] = value
   1141     dispatch[REDUCE] = load_reduce

/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/multiprocessing/reductions.pyc in rebuild_storage_filename(cls, manager, handle, size)
     84     if storage is not None:
     85         return storage._shared_decref()
---> 86     storage = cls._new_shared_filename(manager, handle, size)
     87     shared_cache[handle] = storage._weak_ref(StorageRef)
     88     return storage._shared_decref()

RuntimeError: Interrupted system call at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/libshm/core.cpp:125

Can you help me solve this problem? Thanks very much!

Yin

Filed a bug on this at https://github.com/pytorch/pytorch/issues/14314. For now you can use num_workers=0 to work around it…
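
A minimal sketch of that workaround, assuming the same datasets and samplers as in your snippet: with num_workers=0 the batches are prepared in the main process, so the shared-memory path is never exercised.

# Workaround sketch: same loaders as above, but with num_workers=0 so that
# no worker processes (and therefore no libshm shared memory) are involved.
train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
        pin_memory=True, num_workers=0
    )
valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
        pin_memory=True, num_workers=0
    )
test_queue = torch.utils.data.DataLoader(
        test_data, batch_size=32,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
        pin_memory=True, num_workers=0
    )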

Thanks! So what is the cause of this problem? And why do you think num_workers=0 works around it?

It’s because libshm doesn’t retry on an interrupted system call, and libshm is only used when num_workers > 0.
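
To illustrate what “retry on interrupted syscall” means, here is a sketch of the general pattern (in Python, not the actual C++ libshm fix): a blocking system call interrupted by a signal fails with EINTR and should simply be reissued rather than turned into an error.

import errno
import os

def read_retrying_on_eintr(fd, n):
    # Reissue the read whenever it is interrupted by a signal (EINTR)
    # instead of propagating the error; this is the retry that libshm
    # currently lacks at core.cpp:125.
    while True:
        try:
            return os.read(fd, n)
        except OSError as e:
            if e.errno != errno.EINTR:
                raise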

Thanks and hope the bug will be fixed soon.