I used PyTorch to train a CNN+CTC OCR model whose input images have increasing widths. Every time, at 95-99% of the first epoch, the program crashed with almost no information (Segmentation fault). I am sure the GPU and CPU memory were sufficient. I used gdb to debug, and the info is shown below. Has anyone had the same issue? I suspect the problem is in torch.utils.data.DataLoader. Weird thing: if I reduce the training data from 3,000,000 samples to 50,000 (with varying sizes), it works well, only sometimes outputting
connectionrefusederror: [errno 111] connection refused.
Update: I just tried running my dataloader alone, and it produces the same issue! If I reduce the number of samples from 3,000,000 to 20,000, it works well.
my pytorch version is
python 3.6
torch 0.4.1
torchvision 0.2.1
warpctc-pytorch 0.1
my debug code is :
class resizeNormalize(object):
    """Resize a PIL image to a fixed (w, h), convert it to a tensor,
    and normalize pixel values from [0, 1] into [-1, 1]."""

    def __init__(self, size, interpolation=Image.ANTIALIAS):
        self.size = size
        self.interpolation = interpolation
        self.toTensor = transforms.ToTensor()

    def __call__(self, img):
        resized = img.resize(self.size, self.interpolation)
        tensor = self.toTensor(resized)
        # (x - 0.5) / 0.5 maps [0, 1] -> [-1, 1], in place.
        return tensor.sub_(0.5).div_(0.5)
class resizePasteNormalize(object):
    """Resize a PIL image to height imgH preserving aspect ratio, paste it
    at the top-left of a fixed-size background (training mode only), then
    convert to a tensor normalized into [-1, 1]."""

    def __init__(self, imgH=32, img_bg=None, is_trainning=True, interpolation=Image.ANTIALIAS):
        self.imgH = imgH
        self.img_bg = img_bg
        self.is_trainning = is_trainning
        self.interpolation = interpolation
        self.toTensor = transforms.ToTensor()

    def __call__(self, img):
        width, height = img.size
        scaled_w = int(self.imgH * width / height)
        img = img.resize((scaled_w, self.imgH), self.interpolation)
        if self.is_trainning:
            # Paste onto a copy of the shared background so the original
            # template image is never mutated.
            canvas = self.img_bg.copy()
            canvas.paste(img, (0, 0))
            img = canvas
        tensor = self.toTensor(img)
        # (x - 0.5) / 0.5 maps [0, 1] -> [-1, 1], in place.
        return tensor.sub_(0.5).div_(0.5)
class alignCollate(object):
    """DataLoader collate_fn: choose one width for the whole batch, resize
    or pad every image to (imgH, imgW), and stack them into a single tensor.

    Returns (ratios, images, labels); ratios[i] is the fraction of the
    batch width actually covered by image i (1.0 when it was squeezed).
    """

    def __init__(self, imgH=32, imgW=128, keep_ratio=False, min_ratio=1):
        self.imgH = imgH
        self.imgW = imgW
        self.keep_ratio = keep_ratio
        self.min_ratio = min_ratio

    def __call__(self, batch):
        images, labels = zip(*batch)
        imgH, imgW = self.imgH, self.imgW
        if self.keep_ratio:
            # Take the 2/3-quantile aspect ratio of the batch as the target
            # width so most images fit without horizontal squeezing.
            aspects = sorted(im.size[0] / float(im.size[1]) for im in images)
            chosen = aspects[int(2 * len(aspects) / 3)]
            imgW = int(np.floor(chosen * imgH))
            imgW = max(imgH * self.min_ratio, imgW)  # ensure imgW >= imgH
        squeeze = resizeNormalize((imgW, imgH))
        background = Image.new('RGB', (imgW, imgH), (0, 0, 0))
        pad = resizePasteNormalize(imgH, background)
        ratios = []
        tensors = []
        for im in images:
            w = im.size[0]
            if w >= imgW:
                # Wider than the target: squeeze to exactly (imgW, imgH).
                ratios.append(1.0)
                tensors.append(squeeze(im))
            else:
                # Narrower: keep aspect, paste onto the black background.
                ratios.append(w / imgW)
                tensors.append(pad(im))
        images = torch.cat([t.unsqueeze(0) for t in tensors], 0)
        return ratios, images, labels
class randomSubsetSampler(sampler.Sampler):
    """Sampler that shuffles indices within each same-transcript-length
    group while keeping the groups themselves in order.

    data_source must expose ``word_len_list`` (per-group sample counts) and
    support ``len()``. Groups are assumed to be contiguous in the dataset,
    which holds when the dataset was built with sort_seq=True.
    """

    def __init__(self, data_source):
        self.word_len_list = data_source.word_len_list
        self.num_samples = len(data_source)

    def __iter__(self):
        indices = []
        offset = 0
        for group_size in self.word_len_list:
            group = list(range(offset, offset + group_size))
            random.shuffle(group)
            indices.extend(group)
            offset += group_size
        # Fix: yield plain Python ints instead of wrapping the indices in a
        # torch.IntTensor. With millions of samples the tensor version hands
        # the DataLoader millions of 0-dim tensor index objects per epoch;
        # in torch 0.4.x their chained PyObjectFinalizers are destroyed
        # recursively, which can overflow the C stack and segfault (matches
        # the gdb backtrace through torch::PyObjectFinalizer::~PyObjectFinalizer).
        return iter(indices)

    def __len__(self):
        return self.num_samples
class seqDataset(Dataset):
    """OCR text-line dataset read from index files under ``root``.

    Every ``*.txt`` file lists one sample per line as
    ``<image_path> <transcript>``. Transcripts shorter than ``min_chars``
    or longer than ``max_chars`` are skipped. With ``sort_seq=True`` the
    samples are sorted by transcript length so equal lengths are adjacent.
    """

    def __init__(self, root=None, sort_seq=False, min_chars=1, max_chars=60,
                 transform=None, target_transform=None):
        if not os.path.exists(root):
            raise Exception("root not exist")
        table_list = os.listdir(root)
        self.img_path = []
        self.text = []
        self.word_len_list = []
        for table_name in table_list:
            if 'txt' not in table_name:
                continue
            table_path = os.path.join(root, table_name)
            with open(table_path, 'r', encoding='utf8') as f:
                lines = list(f)
            for line in lines:
                # Split only on the first space: the transcript itself may
                # contain spaces.
                img_path, text = line.split(' ', 1)
                img_path = img_path.strip()
                text = text.strip()
                if len(text) > max_chars or len(text) < min_chars:
                    continue
                self.img_path.append(img_path)
                self.text.append(text)
        if sort_seq:
            combine = zip(self.img_path, self.text)
            self.img_path, self.text = zip(*sorted(combine, key=lambda x: len(x[1])))
        assert len(self.img_path) == len(self.text)
        self.nSamples = len(self.text)
        # Count samples per transcript length. NOTE(review): word_len_list
        # only describes contiguous groups when sort_seq=True (sorted input
        # makes equal lengths adjacent and the dict preserves insertion
        # order) — randomSubsetSampler relies on that.
        len_dict = {}
        for text in self.text:
            len_dict[len(text)] = len_dict.get(len(text), 0) + 1
        self.word_len_list = list(len_dict.values())
        assert self.nSamples == sum(self.word_len_list)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return self.nSamples

    def __getitem__(self, index):
        # Fix: the original checked `index <= len(self)`, which accepted the
        # out-of-range index == len(self).
        assert index < len(self), 'index range error'
        try:
            img = Image.open(self.img_path[index]).convert('RGB')
        except IOError:
            print('Corrupted image for ' + self.img_path[index])
            # Fix: wrap around so a corrupted sample at the end of the
            # dataset falls back to index 0 instead of running off the end.
            return self[(index + 1) % len(self)]
        if self.transform is not None:
            img = self.transform(img)
        label = self.text[index]
        if self.target_transform is not None:
            label = self.target_transform(label)
        return (img, label)
# NOTE(review): leftover snippet from elsewhere — `exp_dataset` and
# `train_dataset` are not defined in this file, so the second line raises
# NameError at import time; rebinding `sampler` here also shadows the
# torch.utils.data.sampler module that randomSubsetSampler's base class
# comes from. These two lines should almost certainly be removed.
shuffle = False
sampler = exp_dataset.randomSubsetSampler(train_dataset)
if __name__ == '__main__':
    root = './tables'
    reader = seqDataset(root, sort_seq=True)
    train_loader = torch.utils.data.DataLoader(
        reader, batch_size=128,
        shuffle=False, sampler=randomSubsetSampler(reader),
        num_workers=16, drop_last=True,
        collate_fn=alignCollate(imgH=32, keep_ratio=True))
    print('dataset size:', len(reader))

    # First pass over the loader, printing the per-batch ratios.
    # Fix: `train_iter.next()` is the Python 2 iterator protocol; use the
    # builtin next() (or, as here, a plain for-loop). With drop_last=True
    # the for-loop visits exactly len(train_loader) batches, matching the
    # original manual while-loop.
    print(len(train_loader))
    for i, (ratios, image, label) in enumerate(train_loader):
        print(i)
        print(ratios)

    # Second pass with a fresh iterator to reproduce the crash across epochs.
    print(len(train_loader))
    for i, (ratios, image, label) in enumerate(train_loader):
        print(i)
Output
c = Client(address, authkey=process.current_process().authkey)
File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 487, in Client
c = SocketClient(address)
File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 614, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 614, in SocketClient
s.connect(address)
ConnectionResetError: [Errno 104] Connection reset by peer
ConnectionRefusedError: [Errno 111] Connection refused
ConnectionRefusedError: [Errno 111] Connection refused
Segmentation fault
And the gdb debug info:
[0/1000][93154/93750] Train loss: 1.980 , accuray: 0.723 , cost time: 0.218
[0/1000][93248/93750] Train loss: 1.930 , accuray: 0.733 , cost time: 0.229
[0/1000][93342/93750] Train loss: 2.021 , accuray: 0.740 , cost time: 0.219
[0/1000][93436/93750] Train loss: 1.804 , accuray: 0.737 , cost time: 0.223
[0/1000][93530/93750] Train loss: 1.948 , accuray: 0.746 , cost time: 0.234
[0/1000][93624/93750] Train loss: 2.106 , accuray: 0.714 , cost time: 0.235
[0/1000][93718/93750] Train loss: 2.136 , accuray: 0.738 , cost time: 0.237
Program received signal SIGSEGV, Segmentation fault.
lookdict_unicode (mp=0x7ffff6384240, key=0x7ffff31725e0,
hash=-6490995071133369473, value_addr=0x7fffff7ff060, hashpos=0x0)
at Objects/dictobject.c:804
804 {
Missing separate debuginfos, use: debuginfo-install bzip2-libs-1.0.6-13.el7.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.13.2-12.el7_2.x86_64 libcom_err-1.42.9-7.el7.x86_64 libgcc-4.8.5-4.el7.x86_64 libselinux-2.2.2-6.el7.x86_64 libstdc++-4.8.5-4.el7.x86_64 libuuid-2.23.2-26.el7_2.3.x86_64 openssl-libs-1.0.1e-51.el7_2.7.x86_64 pcre-8.32-15.el7_2.1.x86_64 xz-libs-5.1.2-12alpha.el7.x86_64 zlib-1.2.7-15.el7.x86_64
#0 lookdict_unicode (mp=0x7ffff6384240, key=0x7ffff31725e0,
hash=-6490995071133369473, value_addr=0x7fffff7ff060, hashpos=0x0)
at Objects/dictobject.c:804
#1 0x000000000049be26 in PyDict_GetItem (op=op@entry=0x7ffff6384240,
key=key@entry=0x7ffff31725e0) at Objects/dictobject.c:1439
#2 0x00000000004ad303 in _PyObject_GenericGetAttrWithDict (
dict=0x7ffff6384240, name=0x7ffff31725e0,
name@entry=0x88bf00 <PyModule_Type>, obj=0x7ffff63859f8, obj@entry=0x0)
at Objects/object.c:1089
#3 PyObject_GenericGetAttr (obj=obj@entry=0x7ffff63859f8,
name=name@entry=0x7ffff31725e0) at Objects/object.c:1121
#4 0x00000000004a96bf in module_getattro (m=0x7ffff63859f8,
name=0x7ffff31725e0) at Objects/moduleobject.c:663
#5 0x0000000000544f37 in _PyEval_EvalFrameDefault (f=<optimized out>,
throwflag=<optimized out>) at Python/ceval.c:2832
#6 0x000000000053eef1 in PyEval_EvalFrameEx (throwflag=0, f=0x7fff03883708)
at Python/ceval.c:718
#7 _PyFunction_FastCall (co=co@entry=0x7fffe581bdb0, args=<optimized out>,
args@entry=0x7fffff7ff3c0, nargs=nargs@entry=1,
globals=globals@entry=0x7fffe5820bd0) at Python/ceval.c:4891
#8 0x0000000000548c96 in _PyFunction_FastCallDict (
func=func@entry=0x7fffe5822c80, args=args@entry=0x7fffff7ff3c0,
nargs=nargs@entry=1, kwargs=kwargs@entry=0x0) at Python/ceval.c:4993
#9 0x000000000045135f in _PyObject_FastCallDict (func=0x7fffe5822c80,
args=0x7fffff7ff3c0, nargs=1, kwargs=0x0) at Objects/abstract.c:2295
#10 0x000000000045145b in _PyObject_Call_Prepend (func=0x7fffe5822c80,
obj=0x7ffe95b3e3c8, args=0x7ffff7fa5048, kwargs=0x0)
at Objects/abstract.c:2358
#11 0x0000000000451212 in _PyObject_FastCallDict (func=0x7fffd90dcfc8,
args=0x0, nargs=nargs@entry=0, kwargs=kwargs@entry=0x0)
at Objects/abstract.c:2316
#12 0x0000000000540e6e in PyEval_CallObjectWithKeywords (
func=func@entry=0x7fffd90dcfc8, args=args@entry=0x0,
kwargs=kwargs@entry=0x0) at Python/ceval.c:4726
#13 0x00000000004c71e1 in slot_tp_finalize (self=0x7ffe95b3e3c8)
at Objects/typeobject.c:6450
#14 0x00000000004ab3a1 in PyObject_CallFinalizer (self=0x7ffe95b3e3c8)
at Objects/object.c:297
#15 PyObject_CallFinalizerFromDealloc (self=self@entry=0x7ffe95b3e3c8)
at Objects/object.c:314
#16 0x00000000004bae4a in subtype_dealloc (self=0x7ffe95b3e3c8)
at Objects/typeobject.c:1151
#17 0x00007fff73284f62 in operator= (new_ptr=0x0, this=0x7ffed4a2b0d8)
at /pytorch/torch/csrc/utils/object_ptr.h:17
#18 torch::PyObjectFinalizer::~PyObjectFinalizer (this=0x7ffed4a2b0d0,
__in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:26
#19 0x00007fff73285084 in ~PyObjectFinalizer (this=0x7ffed4a2b0d0,
__in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:27
#20 operator() (this=<optimized out>, __ptr=0x7ffed4a2b0d0)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#21 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#22 ~PyObjectFinalizer (this=0x7ffed4a2b0f0, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:22
#23 ~PyObjectFinalizer (this=0x7ffed4a2b0f0, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:27
#24 operator() (this=<optimized out>, __ptr=0x7ffed4a2b0f0)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#25 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#26 ~PyObjectFinalizer (this=0x7ffed4a2b110, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:22
#27 ~PyObjectFinalizer (this=0x7ffed4a2b110, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:27
#28 operator() (this=<optimized out>, __ptr=0x7ffed4a2b110)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#29 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#30 ~PyObjectFinalizer (this=0x7ffed4a2b130, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:22
#31 ~PyObjectFinalizer (this=0x7ffed4a2b130, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:27
#32 operator() (this=<optimized out>, __ptr=0x7ffed4a2b130)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#33 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#34 torch::PyObjectFinalizer::~PyObjectFinalizer (this=0x7ffed4a2b150,
__in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:22
#35 0x00007fff73285084 in ~PyObjectFinalizer (this=0x7ffed4a2b150,
__in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:27
#36 operator() (this=<optimized out>, __ptr=0x7ffed4a2b150)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#37 ~unique_ptr (this=0x7ffed4a2b180, __in_chrg=<optimized out>)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#38 ~PyObjectFinalizer (this=0x7ffed4a2b170, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:22
#39 ~PyObjectFinalizer (this=0x7ffed4a2b170, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:27
#40 operator() (this=<optimized out>, __ptr=0x7ffed4a2b170)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#41 ~unique_ptr (this=0x7ffed4a2b180, __in_chrg=<optimized out>)
at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#42 ~PyObjectFinalizer (this=0x7ffed4a2b190, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:22
#43 ~PyObjectFinalizer (this=0x7ffed4a2b190, __in_chrg=<optimized out>)
at /pytorch/torch/csrc/finalizer.h:27