Segmentation fault

perry_wu · August 20, 2018, 7:51am

I used pytorch to train a cnn+ctc OCR model whose input images with increasing width size. Every time at 95-99% of first epoch, the system crashed with little information (Segmentation fault). I am sure the GPU and CPU memory were enough. I used gdb to debug, and infos show below. Can anyone has the same issue? I always think it’s the problem with torch.utils.data.DataLoader. Wired things: If I reduce the size of training data from 3000000 to 50000 with changing size, it works well, only sometimes output
connectionrefusederror: [errno 111] connection refused.

newest: i just try my dataloader, it outputs the same issue!!! If i reduce the size of samples from 3000000 to 20000, it works well

my pytorch version is
python 3.6
torch 0.4.1
torchvision 0.2.1
warpctc-pytorch 0.1

my debug code is :

class resizeNormalize(object):

    def __init__(self, size, interpolation=Image.ANTIALIAS):
        self.size = size
        self.interpolation = interpolation
        self.toTensor = transforms.ToTensor()

    def __call__(self, img):
        img = img.resize(self.size, self.interpolation)
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        return img

class resizePasteNormalize(object):

    def __init__(self, imgH=32, img_bg=None, is_trainning=True, interpolation=Image.ANTIALIAS):
        self.imgH = imgH
        self.img_bg = img_bg
        self.is_trainning = is_trainning
        self.interpolation = interpolation
        self.toTensor = transforms.ToTensor()

    def __call__(self, img):
        w, h = img.size
        img = img.resize((int(self.imgH * w / h), self.imgH), self.interpolation)
        if self.is_trainning:
            new_img_bg = self.img_bg.copy()
            new_img_bg.paste(img, (0, 0))
            img = new_img_bg
        img = self.toTensor(img)
        img.sub_(0.5).div_(0.5)
        return img

class alignCollate(object):

    def __init__(self, imgH=32, imgW=128, keep_ratio=False, min_ratio=1):
        self.imgH = imgH
        self.imgW = imgW
        self.keep_ratio = keep_ratio
        self.min_ratio = min_ratio

    def __call__(self, batch):
        images, labels = zip(*batch)

        imgH = self.imgH
        imgW = self.imgW
        if self.keep_ratio:
            ratios = []
            for image in images:
                w, h = image.size
                ratios.append(w / float(h))
            ratios.sort()
            num_ratios = len(ratios)
            choose_ratio = ratios[int(2 * num_ratios / 3)]
            imgW = int(np.floor(choose_ratio * imgH))
            imgW = max(imgH * self.min_ratio, imgW)  # ensure imgW >= imgH

        transform_1 = resizeNormalize((imgW, imgH))
        new_img = Image.new('RGB', (imgW, imgH), (0, 0, 0))
        transform_2 = resizePasteNormalize(imgH, new_img)

        ratios = []
        new_images = []
        for image in images:
            w = image.size[0]
            if w >= imgW:
                ratios.append(1.0)
                new_images += [transform_1(image)]
            else:
                ratios.append(w / imgW)
                new_images += [transform_2(image)]

        images = torch.cat([t.unsqueeze(0) for t in new_images], 0)
        return ratios, images, labels


class randomSubsetSampler(sampler.Sampler):

    def __init__(self, data_source):
        self.word_len_list = data_source.word_len_list
        self.num_samples = len(data_source)

    def __iter__(self):
        np_idx = []
        prev_count = 0
        for word_len in self.word_len_list:
            subidx = list(range(word_len))
            random.shuffle(subidx)
            np_idx += [i + prev_count for i in subidx]
            prev_count += word_len
        index = torch.IntTensor(np_idx)

        return iter(index)

    def __len__(self):
        return self.num_samples

class seqDataset(Dataset):

    def __init__(self, root=None, sort_seq=False, min_chars=1, max_chars=60, transform=None, target_transform=None):

        if not os.path.exists(root):
            raise Exception("root not exist")
        table_list = os.listdir(root)

        self.img_path = []  # len(self.img_path)=0
        self.text = []
        self.word_len_list = []

        for table_name in table_list:
            if 'txt' not in table_name:
                continue
            table_path = os.path.join(root, table_name)
            with open(table_path, 'r', encoding='utf8') as f:
                lines = list(f)  # len(lines)=300
                for line in lines:
                    img_path, text = line.split(' ', 1)
                    img_path = img_path.strip()
                    text = text.strip()
                    if len(text) > max_chars or len(text) < min_chars:
                        continue
                    self.img_path.append(img_path)
                    self.text.append(text)

        if sort_seq == True:
            combine = zip(self.img_path, self.text)
            self.img_path, self.text = zip(*(sorted(combine, key=lambda x: len(x[1]))))

        assert len(self.img_path) == len(self.text)

        self.nSamples = len(self.text)

        len_dict = {}
        for text in self.text:
            len_dict[len(text)] = len_dict.get(len(text), 0) + 1
        self.word_len_list = list(len_dict.values())

        assert self.nSamples == sum(self.word_len_list)

        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return self.nSamples

    def __getitem__(self, index):
        assert index <= len(self), 'index range error'
        try:
            img = Image.open(self.img_path[index]).convert('RGB')
        except IOError:
            print('Corrupted image for' + self.img_path[index])
            return self[index + 1]

        if self.transform is not None:
            img = self.transform(img)

        label = self.text[index]

        if self.target_transform is not None:
            label = self.target_transform(label)

        return (img, label)


    shuffle = False
    sampler = exp_dataset.randomSubsetSampler(train_dataset)

if __name__ == '__main__':
    root = './tables'
    reader = seqDataset(root, sort_seq=True)
    train_loader = torch.utils.data.DataLoader(
        reader, batch_size=128,
        shuffle=False, sampler=randomSubsetSampler(reader),
        num_workers=16, drop_last=True,
        collate_fn=alignCollate(imgH=32, keep_ratio=True))
    print('dataset size:', len(reader))

    train_iter = iter(train_loader)
    i = 0
    print(len(train_loader))
    while i < len(train_loader):
        print(i)
        ratios, image, label = train_iter.next()
        print(ratios)
        i += 1

    train_iter = iter(train_loader)
    i = 0
    print(len(train_loader))
    while i < len(train_loader):
        print(i)
        ratios, image, label = train_iter.next()
        i += 1

Output

    c = Client(address, authkey=process.current_process().authkey)
  File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 487, in Client
    c = SocketClient(address)
  File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 614, in SocketClient
    s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
  File "/usr/local/python3/lib/python3.6/multiprocessing/connection.py", line 614, in SocketClient
    s.connect(address)
ConnectionResetError: [Errno 104] Connection reset by peer
ConnectionRefusedError: [Errno 111] Connection refused
ConnectionRefusedError: [Errno 111] Connection refused
Segmentation fault

and gdb debug infos:

[0/1000][93154/93750] Train loss: 1.980 , accuray: 0.723 , cost time: 0.218
[0/1000][93248/93750] Train loss: 1.930 , accuray: 0.733 , cost time: 0.229
[0/1000][93342/93750] Train loss: 2.021 , accuray: 0.740 , cost time: 0.219
[0/1000][93436/93750] Train loss: 1.804 , accuray: 0.737 , cost time: 0.223
[0/1000][93530/93750] Train loss: 1.948 , accuray: 0.746 , cost time: 0.234
[0/1000][93624/93750] Train loss: 2.106 , accuray: 0.714 , cost time: 0.235
[0/1000][93718/93750] Train loss: 2.136 , accuray: 0.738 , cost time: 0.237

Program received signal SIGSEGV, Segmentation fault.
lookdict_unicode (mp=0x7ffff6384240, key=0x7ffff31725e0,
    hash=-6490995071133369473, value_addr=0x7fffff7ff060, hashpos=0x0)
    at Objects/dictobject.c:804
804	{
Missing separate debuginfos, use: debuginfo-install bzip2-libs-1.0.6-13.el7.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.13.2-12.el7_2.x86_64 libcom_err-1.42.9-7.el7.x86_64 libgcc-4.8.5-4.el7.x86_64 libselinux-2.2.2-6.el7.x86_64 libstdc++-4.8.5-4.el7.x86_64 libuuid-2.23.2-26.el7_2.3.x86_64 openssl-libs-1.0.1e-51.el7_2.7.x86_64 pcre-8.32-15.el7_2.1.x86_64 xz-libs-5.1.2-12alpha.el7.x86_64 zlib-1.2.7-15.el7.x86_64

#0  lookdict_unicode (mp=0x7ffff6384240, key=0x7ffff31725e0,
    hash=-6490995071133369473, value_addr=0x7fffff7ff060, hashpos=0x0)
    at Objects/dictobject.c:804
#1  0x000000000049be26 in PyDict_GetItem (op=op@entry=0x7ffff6384240,
    key=key@entry=0x7ffff31725e0) at Objects/dictobject.c:1439
#2  0x00000000004ad303 in _PyObject_GenericGetAttrWithDict (
    dict=0x7ffff6384240, name=0x7ffff31725e0,
    name@entry=0x88bf00 <PyModule_Type>, obj=0x7ffff63859f8, obj@entry=0x0)
    at Objects/object.c:1089
#3  PyObject_GenericGetAttr (obj=obj@entry=0x7ffff63859f8,
    name=name@entry=0x7ffff31725e0) at Objects/object.c:1121
#4  0x00000000004a96bf in module_getattro (m=0x7ffff63859f8,
    name=0x7ffff31725e0) at Objects/moduleobject.c:663
#5  0x0000000000544f37 in _PyEval_EvalFrameDefault (f=<optimized out>,
    throwflag=<optimized out>) at Python/ceval.c:2832
#6  0x000000000053eef1 in PyEval_EvalFrameEx (throwflag=0, f=0x7fff03883708)
    at Python/ceval.c:718
#7  _PyFunction_FastCall (co=co@entry=0x7fffe581bdb0, args=<optimized out>,
    args@entry=0x7fffff7ff3c0, nargs=nargs@entry=1,
    globals=globals@entry=0x7fffe5820bd0) at Python/ceval.c:4891
#8  0x0000000000548c96 in _PyFunction_FastCallDict (
    func=func@entry=0x7fffe5822c80, args=args@entry=0x7fffff7ff3c0,
    nargs=nargs@entry=1, kwargs=kwargs@entry=0x0) at Python/ceval.c:4993
#9  0x000000000045135f in _PyObject_FastCallDict (func=0x7fffe5822c80,
   args=0x7fffff7ff3c0, nargs=1, kwargs=0x0) at Objects/abstract.c:2295
#10 0x000000000045145b in _PyObject_Call_Prepend (func=0x7fffe5822c80,
    obj=0x7ffe95b3e3c8, args=0x7ffff7fa5048, kwargs=0x0)
    at Objects/abstract.c:2358
#11 0x0000000000451212 in _PyObject_FastCallDict (func=0x7fffd90dcfc8,
    args=0x0, nargs=nargs@entry=0, kwargs=kwargs@entry=0x0)
    at Objects/abstract.c:2316
#12 0x0000000000540e6e in PyEval_CallObjectWithKeywords (
    func=func@entry=0x7fffd90dcfc8, args=args@entry=0x0,
    kwargs=kwargs@entry=0x0) at Python/ceval.c:4726
#13 0x00000000004c71e1 in slot_tp_finalize (self=0x7ffe95b3e3c8)
    at Objects/typeobject.c:6450
#14 0x00000000004ab3a1 in PyObject_CallFinalizer (self=0x7ffe95b3e3c8)
    at Objects/object.c:297
#15 PyObject_CallFinalizerFromDealloc (self=self@entry=0x7ffe95b3e3c8)
    at Objects/object.c:314
#16 0x00000000004bae4a in subtype_dealloc (self=0x7ffe95b3e3c8)
    at Objects/typeobject.c:1151
#17 0x00007fff73284f62 in operator= (new_ptr=0x0, this=0x7ffed4a2b0d8)
    at /pytorch/torch/csrc/utils/object_ptr.h:17
#18 torch::PyObjectFinalizer::~PyObjectFinalizer (this=0x7ffed4a2b0d0,
    __in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:26
#19 0x00007fff73285084 in ~PyObjectFinalizer (this=0x7ffed4a2b0d0,
    __in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:27
#20 operator() (this=<optimized out>, __ptr=0x7ffed4a2b0d0)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#21 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#22 ~PyObjectFinalizer (this=0x7ffed4a2b0f0, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:22
#23 ~PyObjectFinalizer (this=0x7ffed4a2b0f0, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:27
#24 operator() (this=<optimized out>, __ptr=0x7ffed4a2b0f0)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#25 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#26 ~PyObjectFinalizer (this=0x7ffed4a2b110, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:22
#27 ~PyObjectFinalizer (this=0x7ffed4a2b110, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:27
#28 operator() (this=<optimized out>, __ptr=0x7ffed4a2b110)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#29 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#30 ~PyObjectFinalizer (this=0x7ffed4a2b130, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:22
#31 ~PyObjectFinalizer (this=0x7ffed4a2b130, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:27
#32 operator() (this=<optimized out>, __ptr=0x7ffed4a2b130)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#33 ~unique_ptr (this=0x7ffed4a2b100, __in_chrg=<optimized out>)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#34 torch::PyObjectFinalizer::~PyObjectFinalizer (this=0x7ffed4a2b150,
    __in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:22
#35 0x00007fff73285084 in ~PyObjectFinalizer (this=0x7ffed4a2b150,
    __in_chrg=<optimized out>) at /pytorch/torch/csrc/finalizer.h:27
#36 operator() (this=<optimized out>, __ptr=0x7ffed4a2b150)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#37 ~unique_ptr (this=0x7ffed4a2b180, __in_chrg=<optimized out>)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#38 ~PyObjectFinalizer (this=0x7ffed4a2b170, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:22
#39 ~PyObjectFinalizer (this=0x7ffed4a2b170, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:27
#40 operator() (this=<optimized out>, __ptr=0x7ffed4a2b170)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:76
#41 ~unique_ptr (this=0x7ffed4a2b180, __in_chrg=<optimized out>)
    at /opt/rh/devtoolset-3/root/usr/include/c++/4.9.2/bits/unique_ptr.h:236
#42 ~PyObjectFinalizer (this=0x7ffed4a2b190, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:22
#43 ~PyObjectFinalizer (this=0x7ffed4a2b190, __in_chrg=<optimized out>)
    at /pytorch/torch/csrc/finalizer.h:27

y91 · August 23, 2018, 9:39pm

Hi perry_wu, I am also getting similar errors. ‘Segmentation fault’ and ‘connection reset by peer’. The errors originate from the pytorch Dataloader. In my case however, it happens when I am getting the very first minibatch. Just so that I understand, if you reduce the size of your training set, this error disappears?

ptrblck · August 23, 2018, 9:46pm

Could you check the size of your shared memory and increase it if possible?

@y91 Your issue sounds a bit different as the error is immediately thrown as far as I understand.

y91 · August 23, 2018, 9:58pm

Hi @ptrblck, yes, you got it correct. Another detail I forgot to mention is that this only happens in multi GPU training. Single GPU runs never fail.

Regarding shared memory, thank you for the suggestion. I am on it.

perry_wu · August 26, 2018, 6:23am

Hi,ptrblck, I solved this problem.
look into this code, and it is a function of a sampler of Dataloader. def iter(self) returns iter(torch.IntTensor(np_idx)), if I change this to iter(numpy(list)), it’s correct. old version shows returning iter(torch.tensor) is correct, but version 0.4.1shows segment fault when the size is huge.

class randomSubsetSampler(sampler.Sampler):

    def __init__(self, data_source):
        self.word_len_list = data_source.word_len_list
        self.num_samples = len(data_source)

    def __iter__(self):
        np_idx = []
        prev_count = 0
        for word_len in self.word_len_list:
            subidx = list(range(word_len))
            random.shuffle(subidx)
            np_idx += [i + prev_count for i in subidx]
            prev_count += word_len
        index = torch.IntTensor(np_idx)

        return iter(index)

    def __len__(self):
        return self.num_samples

bear_min · February 23, 2019, 10:56am

hey，I have met this problem recently. And I only use one GPU to train my model. When I make sampler as None in DataLoder, it works well. However, after I add WeightedRandomSampler, it always shows the ConnectionRefusedError when it was before the first epoch.
I have checked my dataset and divide it into two parts. Each part can go through the code.
But, when I set the num_worker as 0 , the whole dataset can also go through the code. I have doubt it was caused by the wrong setting of num_workers.

sirius · October 8, 2019, 12:43am

have you fixed the issue?

thejonan · February 8, 2020, 11:07pm

I’m using PyTorch 1.4 now, I still got the same error when using DataLoader. The crash is quite random - sometimes on the first epoch, sometimes after couple of them.

DeepakSaini119 · July 31, 2020, 2:20am

I face the same issue in Multi GPU training, which seems to work fine on single GPU or for smaller datasets on multiple GPUs but randomly just gives me Segmentation fault on larger datasets when I try to train on Multi GPUs. @y91, I know its been a long time but were you able to pin point it. Thanks.

hoangcuong2011 · August 16, 2020, 11:07pm

Hi, I faced the same issue with nn.DataParallel (Segmentation fault (core dumped)). I supposed there is no fix to this issue yet but please let me know if you have a workaround.

Aiman_Mutasem-bellh · November 16, 2020, 11:47pm

This issue is related to torch version 0.4.0, you need to update to the upper version like 0.5.0

Brando_Miranda · December 2, 2020, 5:40pm

I am also having this issue…my gpu test is very simple just passing data through resnet pytorch has:

#!/home/miranda9/.conda/envs/automl-meta-learning/bin/python3.7
#SBATCH --job-name="miranda9job"
#SBATCH --output="experiment_output_job.%j.%N.out"
#SBATCH --error="experiment_output_job.%j.%N.err"
#SBATCH --export=ALL
#SBATCH --gres=gpu:1
#SBATCH --mail-user=brando.science@gmail.com
#SBATCH --mail-type=ALL
#SBATCH --time=04:00:00
#SBATCH --partition=secondary-Eth

import torch


out_features = 5

# resnet 18 (for single gpu test)
net = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=False)
net.fc = torch.nn.Linear(in_features=512, out_features=out_features, bias=True)

# resnet 152 (for multi gpu test)
# net = torch.hub.load('pytorch/vision:v0.6.0', 'resnet152', pretrained=True)
# net.fc = torch.nn.Linear(in_features=2048, out_features=out_features, bias=True)
if torch.cuda.is_available():
    net = net.cuda()

print(type(net))

print(torch.cuda.device_count())
print(list(range(torch.cuda.device_count())))
if torch.cuda.device_count() > 1:
    # args.base_model = torch.nn.parallel.DistributedDataParallel(args.base_model, device_ids=list(range(torch.cuda.device_count())))
    net = torch.nn.DataParallel(net, device_ids=list(range(torch.cuda.device_count()))).cuda()

print(type(net))

batch_size = 1024
x = torch.randn(batch_size, 3, 84, 84).cuda()
y_pred = net(x)
print(y_pred.size())
y = torch.randn(batch_size, out_features).cuda()

print(y_pred.sum())

criterion = torch.nn.MSELoss()

loss = criterion(y_pred, y)
print(loss)

print('DONE')

the script says done but it also says segmentation error…:

(automl-meta-learning) miranda9~/automl-meta-learning $ python test_multiple_gpus.py 
Using cache found in /home/miranda9/.cache/torch/hub/pytorch_vision_v0.6.0
<class 'torchvision.models.resnet.ResNet'>
1
[0]
<class 'torchvision.models.resnet.ResNet'>
torch.Size([1024, 5])
tensor(-183.6620, device='cuda:0', grad_fn=<SumBackward0>)
tensor(1.1541, device='cuda:0', grad_fn=<MseLossBackward>)
DONE
Segmentation fault

weird hu?

Brando_Miranda · December 2, 2020, 5:42pm

hmmm I don’t see anything weird with my cuda versions of torch versions. Is the most recent one broken for you?

(automl-meta-learning) miranda9~/automl-meta-learning $ conda list
# packages in environment at /home/miranda9/miniconda3/envs/automl-meta-learning:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
absl-py                   0.11.0           py38h06a4308_0  
aiohttp                   3.7.2            py38h27cfd23_1  
anatome                   0.0.1                    pypi_0    pypi
async-timeout             3.0.1                    py38_0  
attrs                     20.3.0             pyhd3eb1b0_0  
beautifulsoup4            4.9.3              pyhb0f4dca_0  
blas                      1.0                         mkl  
blinker                   1.4                      py38_0  
brotlipy                  0.7.0           py38h27cfd23_1003  
bzip2                     1.0.8                h7b6447c_0  
c-ares                    1.17.1               h27cfd23_0  
ca-certificates           2020.10.14                    0  
cachetools                4.1.1                      py_0  
cairo                     1.14.12              h8948797_3  
certifi                   2020.11.8        py38h06a4308_0  
cffi                      1.14.0           py38h2e261b9_0  
chardet                   3.0.4           py38h06a4308_1003  
click                     7.1.2                      py_0  
conda                     4.9.2            py38h06a4308_0  
conda-build               3.20.5                   py38_1  
conda-package-handling    1.7.2            py38h03888b9_0  
cryptography              3.2.1            py38h3c74f83_1  
cudatoolkit               11.0.221             h6bb024c_0  
cycler                    0.10.0                   py38_0  
dataclasses               0.6                      pypi_0    pypi
dbus                      1.13.18              hb2f20db_0  
dill                      0.3.3              pyhd3eb1b0_0  
expat                     2.2.10               he6710b0_2  
filelock                  3.0.12                     py_0  
fontconfig                2.13.0               h9420a91_0  
freetype                  2.10.4               h5ab3b9f_0  
fribidi                   1.0.10               h7b6447c_0  
future                    0.18.2                   pypi_0    pypi
glib                      2.63.1               h5a9c865_0  
glob2                     0.7                        py_0  
google-auth               1.23.0             pyhd3eb1b0_0  
google-auth-oauthlib      0.4.2              pyhd3eb1b0_2  
graphite2                 1.3.14               h23475e2_0  
graphviz                  2.40.1               h21bd128_2  
grpcio                    1.31.0           py38hf8bcb03_0  
gst-plugins-base          1.14.0               hbbd80ab_1  
gstreamer                 1.14.0               hb31296c_0  
h5py                      3.1.0                    pypi_0    pypi
harfbuzz                  1.8.8                hffaf4a1_0  
higher                    0.2.1                    pypi_0    pypi
icu                       58.2                 he6710b0_3  
idna                      2.10                       py_0  
importlib-metadata        2.0.0                      py_1  
intel-openmp              2020.2                      254  
jinja2                    2.11.2                     py_0  
joblib                    0.17.0                     py_0  
jpeg                      9b                   h024ee3a_2  
kiwisolver                1.3.0            py38h2531618_0  
lcms2                     2.11                 h396b838_0  
ld_impl_linux-64          2.33.1               h53a641e_7  
libarchive                3.4.2                h62408e4_0  
libedit                   3.1.20191231         h14c3975_1  
libffi                    3.2.1             hf484d3e_1007  
libgcc-ng                 9.1.0                hdf63c60_0  
libgfortran-ng            7.3.0                hdf63c60_0  
liblief                   0.10.1               he6710b0_0  
libpng                    1.6.37               hbc83047_0  
libprotobuf               3.13.0.1             hd408876_0  
libstdcxx-ng              9.1.0                hdf63c60_0  
libtiff                   4.1.0                h2733197_1  
libuuid                   1.0.3                h1bed415_2  
libuv                     1.40.0               h7b6447c_0  
libxcb                    1.14                 h7b6447c_0  
libxml2                   2.9.10               hb55368b_3  
lz4-c                     1.9.2                heb0550a_3  
markdown                  3.3.3            py38h06a4308_0  
markupsafe                1.1.1            py38h7b6447c_0  
matplotlib                3.3.2                         0  
matplotlib-base           3.3.2            py38h817c723_0  
mkl                       2020.2                      256  
mkl-service               2.3.0            py38he904b0f_0  
mkl_fft                   1.2.0            py38h23d657b_0  
mkl_random                1.1.1            py38h0573a6f_0  
multidict                 4.7.6            py38h7b6447c_1  
ncurses                   6.2                  he6710b0_1  
ninja                     1.10.2           py38hff7bd54_0  
numpy                     1.19.2           py38h54aff64_0  
numpy-base                1.19.2           py38hfa32c7d_0  
oauthlib                  3.1.0                      py_0  
olefile                   0.46                       py_0  
openssl                   1.1.1h               h7b6447c_0  
ordered-set               4.0.2                    pypi_0    pypi
pandas                    1.1.3            py38he6710b0_0  
pango                     1.42.4               h049681c_0  
patchelf                  0.12                 he6710b0_0  
pcre                      8.44                 he6710b0_0  
pillow                    8.0.1            py38he98fc37_0  
pip                       20.3             py38h06a4308_0  
pixman                    0.40.0               h7b6447c_0  
pkginfo                   1.6.1            py38h06a4308_0  
protobuf                  3.13.0.1         py38he6710b0_1  
psutil                    5.7.2            py38h7b6447c_0  
py-lief                   0.10.1           py38h403a769_0  
pyasn1                    0.4.8                      py_0  
pyasn1-modules            0.2.8                      py_0  
pycosat                   0.6.3            py38h7b6447c_1  
pycparser                 2.20                       py_2  
pyjwt                     1.7.1                    py38_0  
pyopenssl                 20.0.0             pyhd3eb1b0_1  
pyparsing                 2.4.7                      py_0  
pyqt                      5.9.2            py38h05f1152_4  
pysocks                   1.7.1            py38h06a4308_0  
python                    3.8.2                hcf32534_0  
python-dateutil           2.8.1                      py_0  
python-libarchive-c       2.9                        py_0  
pytorch                   1.7.0           py3.8_cuda11.0.221_cudnn8.0.3_0    pytorch
pytz                      2020.4             pyhd3eb1b0_0  
pyyaml                    5.3.1            py38h7b6447c_1  
qt                        5.9.7                h5867ecd_1  
readline                  8.0                  h7b6447c_0  
requests                  2.25.0             pyhd3eb1b0_0  
requests-oauthlib         1.3.0                      py_0  
ripgrep                   12.1.1                        0  
rsa                       4.6                        py_0  
ruamel_yaml               0.15.87          py38h7b6447c_1  
scikit-learn              0.23.2           py38h0573a6f_0  
scipy                     1.5.2            py38h0b6359f_0  
setuptools                50.3.1           py38h06a4308_1  
sip                       4.19.13          py38he6710b0_0  
six                       1.15.0           py38h06a4308_0  
soupsieve                 2.0.1                      py_0  
sqlite                    3.33.0               h62c20be_0  
tensorboard               2.3.0              pyh4dce500_0  
tensorboard-plugin-wit    1.6.0                      py_0  
threadpoolctl             2.1.0              pyh5ca1d4c_0  
tk                        8.6.10               hbc83047_0  
torchaudio                0.7.0                      py38    pytorch
torchmeta                 1.6.1                    pypi_0    pypi
torchvision               0.8.1                py38_cu110    pytorch
tornado                   6.0.4            py38h7b6447c_1  
tqdm                      4.54.0                   pypi_0    pypi
typing-extensions         3.7.4.3                       0  
typing_extensions         3.7.4.3                    py_0  
urllib3                   1.26.2                   pypi_0    pypi
werkzeug                  1.0.1                      py_0  
wheel                     0.35.1             pyhd3eb1b0_0  
xz                        5.2.5                h7b6447c_0  
yaml                      0.2.5                h7b6447c_0  
yarl                      1.5.1            py38h7b6447c_0  
zipp                      3.4.0              pyhd3eb1b0_0  
zlib                      1.2.11               h7b6447c_3  
zstd                      1.4.5                h9ceee32_0

Aiman_Mutasem-bellh · December 3, 2020, 2:20am

I have fixed this issue by upgrading the Pytorch to the recent version.

ptrblck · December 3, 2020, 4:36am

What kind of GPUs are you using?

Brando_Miranda · January 7, 2021, 10:02pm

@ptrblck

What kind of GPUs are you using?

I am using a titan xp I believe.

---- running ----
is cuda available: True
device: cuda
/home/miranda9/data/dataset_LS_fully_connected_NN_with_BN_nb_tasks200_data_per_task1000_l_4_nb_h_layes3_out1_H15/meta_set_fully_connected_NN_with_BN_std1_0.01_std2_1.0_noise_std0.1nb_h_layes3_out1_H15
['fc1_l1', 'fc2_l1', 'fc3_l1']
number of workers = 0
--> args.meta_batch_size = 200
--> args.iters = 5
--> args.nb_inner_train_steps = 1
  0%|                                                                                                                                                                                                                                                                                                                                                                                                     | 0/5 [00:00<?, ?it/s]it = 0
Segmentation fault
(automl-meta-learning) miranda9~/automl-meta-learning/results_plots $ nvidia-smi
Thu Jan  7 15:59:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  TITAN Xp            Off  | 00000000:02:00.0 Off |                  N/A |
| 53%   84C    P2   253W / 250W |   9209MiB / 12196MiB |     84%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:03:00.0 Off |                  N/A |
| 26%   43C    P0    63W / 250W |      0MiB / 12196MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            Off  | 00000000:82:00.0 Off |                  N/A |
| 51%   82C    P2   182W / 250W |  11931MiB / 12196MiB |     71%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  TITAN Xp            Off  | 00000000:83:00.0 Off |                  N/A |
| 26%   38C    P0    62W / 250W |      0MiB / 12196MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     10994      C   python                           9207MiB |
|    2   N/A  N/A     30964      C   python                          11929MiB |
+-----------------------------------------------------------------------------+
(automl-meta-learning) miranda9~/automl-meta-learning/results_plots $

I don’t get it…what is causing the issue in the first place? I know segmentation faults are due to accessing or doing something illegal/not allowed with the memory. So is the error due to GPU memory or normal RAM or what is going on?

related link I found (not sure if it will help): ubuntu - Error in `python': free(): invalid pointer: 0x00007fc3c90dc98e - Stack Overflow

fyi this is the library causing my error:

github.com/moskomule/anatome

cca code for GPU code not working

opened 05:52PM - 02 Dec 20 UTC

closed 07:12AM - 28 Dec 20 UTC

brando90

small example: ``` import torch import torch.nn as nn from anatome import Si…milarityHook from collections import OrderedDict # Din, Dout = 1, 1 mdl1 = nn.Sequential(OrderedDict([ ('fc1_l1', nn.Linear(Din, Dout)), ('out', nn.SELU()), ('fc2_l2', nn.Linear(Din, Dout)), ])) mdl2 = nn.Sequential(OrderedDict([ ('fc1_l1', nn.Linear(Din, Dout)), ('out', nn.SELU()), ('fc2_l2', nn.Linear(Din, Dout)), ])) print(f'is cuda available: {torch.cuda.is_available()}') with torch.no_grad(): mu = torch.zeros(Din) # std = 1.25e-2 std = 10 noise = torch.distributions.normal.Normal(loc=mu, scale=std).sample() # mdl2.fc1_l1.weight.fill_(50.0) # mdl2.fc1_l1.bias.fill_(50.0) mdl2.fc1_l1.weight += noise mdl2.fc1_l1.bias += noise if torch.cuda.is_available(): mdl1 = mdl1.cuda() mdl2 = mdl2.cuda() hook1 = SimilarityHook(mdl1, "fc1_l1") hook2 = SimilarityHook(mdl2, "fc1_l1") mdl1.eval() mdl2.eval() # params for doing "good" CCA iters = 10 num_samples_per_task = 500 size = 8 # start CCA comparision lb, ub = -1, 1 for _ in range(iters): x = torch.torch.distributions.Uniform(low=-1, high=1).sample((num_samples_per_task, 1)) if torch.cuda.is_available(): x = x.cuda() y1 = mdl1(x) y2 = mdl2(x) print(f'y1 - y2 = {(y1-y2).norm(2)}') print('about to do cca') dist = hook1.distance(hook2, size=size) print('cca done') print(f'cca dist = {dist}') print('--> Done!\a') ``` but it always has a segmentation error: ``` (automl-meta-learning) miranda9~/automl-meta-learning $ python test_cca_gpu.py is cuda available: True y1 - y2 = 4.561897277832031 y1 - y2 = 3.7458858489990234 y1 - y2 = 3.8464999198913574 y1 - y2 = 4.947702407836914 y1 - y2 = 5.404015064239502 y1 - y2 = 4.85843563079834 y1 - y2 = 4.000360488891602 y1 - y2 = 4.194643020629883 y1 - y2 = 4.894904613494873 y1 - y2 = 4.7721710205078125 about to do cca Segmentation fault ``` why? how is this fixed?

when I try to compute CCA with gpu it seems.

ptrblck · January 7, 2021, 10:53pm

In your posted code snippet you are using a ResNet18 from torch.hub, so how is the liked repository used?

Brando_Miranda · January 8, 2021, 4:32pm

I have to apologies. The resnet18 was a different example I used at a different time (which I am not currently debugging as of now). Sorry for the confusion.

Right now I am loading a checkpoint (a simple fully connected net) and using the CCA library to compute CCA values and I am getting segmentation faults. I opened a gitissue here to track the bug source: cca code for GPU code not working · Issue #4 · moskomule/anatome · GitHub. I am unsure where to start looking for the bug if its in that library or in pytorch. It might be best to try to make a tiny reproducible example to figure out where the bug might be. Do you think its a bug with the anatome library for CCA or a pytorch bug?

ptrblck · January 8, 2021, 8:19pm

It’s hard to tell where this bug is coming from and a minimal code snippet would be helpful.
Since you are already running into this issue, you could also use gdb to get the stack trace and post it here, so that we could have a look:

$ gdb --args python my_script.py
...
Reading symbols from python...done.
(gdb) run
...
(gdb) backtrace
...

Brando_Miranda · March 2, 2021, 8:32pm

@ptrblck this is a slightly different issue than our previous conversations…I am running a simple training script with DDP. I tried what you suggested but the output of gdb is empty…literally that’s what gdb says. I am a bit lost

----> about to cleanup worker with rank 0
clean up done successfully! 0
Traceback (most recent call last):
  File "ml4coq-proj/embeddings_zoo/tree_nns/main_brando.py", line 338, in <module>
    main_distributed()
  File "ml4coq-proj/embeddings_zoo/tree_nns/main_brando.py", line 230, in main_distributed
    spawn_return = mp.spawn(fn=train, args=(opts,), nprocs=opts.world_size)
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
    raise Exception(
Exception: process 0 terminated with signal SIGSEGV
[Thread 0x2aaaaaaf0ec0 (LWP 25285) exited]
[Inferior 1 (process 25285) exited with code 01]
Missing separate debuginfos, use: debuginfo-install glibc-2.17-307.el7.1.x86_64 nvidia-driver-latest-cuda-libs-450.36.06-1.el7.x86_64
(gdb) backtrace
No stack.

related: How to fix a SIGSEGV in pytorch when using distributed training (e.g. DDP)? and more details there of what I’ve done.

ps: I also noticed this happens when rank 0 ends before rank 1 (that helped me reproduce it, otherwise a sigabort SIGABRT happens).

    # clean up distributed code
    torch.distributed.barrier()
    if rank == 1:
        time.sleep(1)
    print(f'\n----> about to cleanup worker with rank {rank}')
    # cleanup(rank)
    torch.distributed.destroy_process_group()
    print(f'clean up done successfully! {rank}'