Hi,
I keep hitting "RuntimeError: Interrupted system call at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/libshm/core.cpp:125" while iterating over a DataLoader. Could you please help me solve this problem? The details to reproduce the error are as follows:
DETAILS:
I have a dataset class MC, defined as follows (imports included for completeness):
import os
import sys
import pickle
import random

import numpy as np
from PIL import Image
import torch.utils.data as data

class MC(data.Dataset):
    base_folder = 'mc'
    train_data = 'train'
    valid_data = 'valid'
    test_data = 'test'

    def __init__(self, root, mode=0, random_seed=1234, transform=None,
                 target_transform=None):
        self.root = os.path.expanduser(root)
        self.transform = transform
        self.target_transform = target_transform
        self.mode = mode
        assert mode in [0, 1, 2]  # 0: train_set, 1: valid_set, 2: test_set
        if mode == 0:
            f = self.train_data
        elif mode == 1:
            f = self.valid_data
        else:
            f = self.test_data
        random.seed(random_seed)

        # now load the pickled numpy arrays
        self.maps = {}  # the sample indices for each class
        file = os.path.join(self.root, self.base_folder, f)
        fo = open(file, 'rb')
        if sys.version_info[0] == 2:
            entry = pickle.load(fo)
        else:
            entry = pickle.load(fo, encoding='latin1')
        self.data = entry['data']
        self.labels = entry['labels']
        fo.close()
        self.data = self.data.reshape((-1, 3, 32, 32))
        self.data = self.data.transpose((0, 2, 3, 1))  # convert to HWC
        class_ids = list(set(self.labels))
        labels = np.array(self.labels)
        for c_id in class_ids:
            ids = np.where(labels == c_id)[0].tolist()
            random.shuffle(ids)
            self.maps[c_id] = ids

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], self.labels[index]
        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img)
        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, target

    def __len__(self):
        return len(self.data)

    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        if self.mode == 0:
            tmp = 'train'
        elif self.mode == 1:
            tmp = 'valid'
        else:
            tmp = 'test'
        fmt_str += '    Split: {}\n'.format(tmp)
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str
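For reference, self.maps lets me look up the shuffled sample indices of any class. A quick sanity check (hypothetical, assuming class ids 1-5 are present in the training split):

    ds = MC(root="../data", mode=0)
    print(ds)               # uses __repr__ above
    print(ds.maps[1][:10])  # first 10 shuffled training indices of class 1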
And I have 3 queues, ‘train_queue’, ‘valid_queue’ and ‘test_queue’ defined as follows:
import dataset.mc as dset
import utils
import torch
import torchvision.transforms as transforms
import numpy as np

CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])

train_data = dset.MC(root="../data", mode=0, transform=train_transform)
valid_data = dset.MC(root="../data", mode=1, transform=train_transform)
test_data = dset.MC(root="../data", mode=2, transform=train_transform)

# The samplers start with indices=None; set_task() below fills in the real
# index lists before the loaders are ever iterated.
train_queue = torch.utils.data.DataLoader(
    train_data, batch_size=32,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
    pin_memory=True, num_workers=1
)
valid_queue = torch.utils.data.DataLoader(
    valid_data, batch_size=32,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
    pin_memory=True, num_workers=1
)
test_queue = torch.utils.data.DataLoader(
    test_data, batch_size=32,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(None),
    pin_memory=True, num_workers=1
)
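This works because, as far as I understand, SubsetRandomSampler simply stores the index list and yields a fresh random permutation of it on every pass, so the list can be swapped out later. A minimal illustration (made-up indices):

    sampler = torch.utils.data.sampler.SubsetRandomSampler([10, 11, 12, 13])
    print(list(sampler))      # some permutation, e.g. [12, 10, 13, 11]
    sampler.indices = [0, 1]  # replace the subset in place, like set_task does below
    print(list(sampler))      # now a permutation of [0, 1]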
Each time, before loading data from the queues, I specify the indices of the samples by calling set_task:
def set_task(n_way, candidate_set, train_data, valid_data, test_data,
             train_queue, valid_queue, test_queue):
    # sample n_way classes, then point each sampler at the indices of those classes
    task = np.random.choice(candidate_set, n_way, replace=False).tolist()
    train_indices = sum([train_data.maps[c] for c in task], [])
    valid_indices = sum([valid_data.maps[c] for c in task], [])
    test_indices = sum([test_data.maps[c] for c in task], [])
    train_queue.sampler.indices = train_indices
    valid_queue.sampler.indices = valid_indices
    test_queue.sampler.indices = test_indices
    # map the sampled class ids to 0..n_way-1
    hashmap = dict([(v, k) for k, v in enumerate(task)])
    return task, hashmap
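For example, a single call picks a random 3-way task and builds the relabeling map (output values are illustrative):

    task, hashmap = set_task(3, [1, 2, 3, 4, 5], train_data, valid_data,
                             test_data, train_queue, valid_queue, test_queue)
    print(task)     # e.g. [4, 1, 3]
    print(hashmap)  # e.g. {4: 0, 1: 1, 3: 2}, original class id -> task label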
To reproduce the error, just run:
i = 0
while True:
    set_task(3, [1, 2, 3, 4, 5], train_data, valid_data, test_data, train_queue, valid_queue, test_queue)
    for _ in range(100):
        # note: next(iter(train_queue)) builds a brand-new DataLoader iterator
        # (and a new worker process) for every batch
        input, target = next(iter(train_queue))
    i += 1
    print i
After several epochs (not always the same one), the error occurs:
1
2
3
...
120
121
122
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-15-02d188eb8764> in <module>()
6 set_task(3, [1,2,3,4,5], train_data, valid_data, test_data, train_queue, valid_queue, test_queue)
7 for _ in range(100):
----> 8 input, target = next(iter(train_queue))
9 i += 1
10 print i
/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in __next__(self)
278 while True:
279 assert (not self.shutdown and self.batches_outstanding > 0)
--> 280 idx, batch = self._get_batch()
281 self.batches_outstanding -= 1
282 if idx != self.rcvd_idx:
/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in _get_batch(self)
257 raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
258 else:
--> 259 return self.data_queue.get()
260
261 def __next__(self):
/Users/yin.zheng/anaconda/lib/python2.7/multiprocessing/queues.pyc in get()
376 racquire()
377 try:
--> 378 return recv()
379 finally:
380 rrelease()
/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/multiprocessing/queue.pyc in recv(self)
20 def recv(self):
21 buf = self.recv_bytes()
---> 22 return pickle.loads(buf)
23
24 def __getattr__(self, name):
/Users/yin.zheng/anaconda/lib/python2.7/pickle.pyc in loads(str)
1386 def loads(str):
1387 file = StringIO(str)
-> 1388 return Unpickler(file).load()
1389
1390 # Doctest
/Users/yin.zheng/anaconda/lib/python2.7/pickle.pyc in load(self)
862 while 1:
863 key = read(1)
--> 864 dispatch[key](self)
865 except _Stop, stopinst:
866 return stopinst.value
/Users/yin.zheng/anaconda/lib/python2.7/pickle.pyc in load_reduce(self)
1137 args = stack.pop()
1138 func = stack[-1]
-> 1139 value = func(*args)
1140 stack[-1] = value
1141 dispatch[REDUCE] = load_reduce
/Users/yin.zheng/anaconda/lib/python2.7/site-packages/torch/multiprocessing/reductions.pyc in rebuild_storage_filename(cls, manager, handle, size)
84 if storage is not None:
85 return storage._shared_decref()
---> 86 storage = cls._new_shared_filename(manager, handle, size)
87 shared_cache[handle] = storage._weak_ref(StorageRef)
88 return storage._shared_decref()
RuntimeError: Interrupted system call at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/libshm/core.cpp:125
Can you help me solve this problem? Thanks very much!
Yin