I am getting this error while using PyTorch Lightning with PyTorch 1.9.1, running this exact script on 2x 2080 GPUs.
Any help would be appreciated.
Thanks!
Could you check the backtrace via:

```
gdb --args python script.py args
...
run
...
bt
```

and post it here, please?
Thanks a lot for helping out!
Here’s the backtrace:

```
#0 0x00007f39d502bea1 in google::protobuf::internal::ReflectionOps::FindInitializationErrors(google::protobuf::Message const&, std::string const&, std::vector<std::string, std::allocator<std::string> >*) ()
from /opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so
#1 0x00007f39d5024868 in google::protobuf::Message::FindInitializationErrors(std::vector<std::string, std::allocator<std::string> >*) const ()
from /opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so
#2 0x00007f39d50248f9 in google::protobuf::Message::InitializationErrorString() const ()
from /opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so
#3 0x00007f39ddd920c5 in bool google::protobuf::internal::MergeFromImpl<false>(google::protobuf::StringPiece, google::protobuf::MessageLite*, google::protobuf::MessageLite::ParseFlags) ()
from /opt/conda/lib/python3.7/site-packages/torchtext/_torchtext.so
#4 0x00007f39ddd90559 in google::protobuf::MessageLite::ParseFromArray(void const*, int) ()
from /opt/conda/lib/python3.7/site-packages/torchtext/_torchtext.so
#5 0x00007f39d4ff7889 in google::protobuf::EncodedDescriptorDatabase::Add(void const*, int) ()
from /opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so
#6 0x00007f39d4fa0170 in google::protobuf::DescriptorPool::InternalAddGeneratedFile(void const*, int) ()
from /opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so
#7 0x00007f39d50032a7 in ?? ()
from /opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so
#8 0x00007f3a2f1458d3 in call_init (env=0x555b14020cc0, argv=0x7ffd1432d178, argc=6, l=<optimized out>) at dl-init.c:72
#9 _dl_init (main_map=main_map@entry=0x555b1875bc10, argc=6, argv=0x7ffd1432d178, env=0x555b14020cc0) at dl-init.c:119
#10 0x00007f3a2f14a39f in dl_open_worker (a=a@entry=0x7ffd1431c6f0) at dl-open.c:522
#11 0x00007f3a2e4e71ef in __GI__dl_catch_exception (exception=0x7ffd1431c6d0, operate=0x7f3a2f149f60 <dl_open_worker>,
args=0x7ffd1431c6f0) at dl-error-skeleton.c:196
#12 0x00007f3a2f14996a in _dl_open (
file=0x7f39dd19ab90 "/opt/conda/lib/python3.7/site-packages/google/protobuf/pyext/_message.cpython-37m-x86_64-linux-gnu.so", mode=-2147483390, caller_dlopen=0x555b139337bc <_PyImport_FindSharedFuncptr+140>, nsid=<optimized out>, argc=6,
argv=<optimized out>, env=0x555b14020cc0) at dl-open.c:605
#13 0x00007f3a2ef31f96 in dlopen_doit (a=a@entry=0x7ffd1431c920) at dlopen.c:66
#14 0x00007f3a2e4e71ef in __GI__dl_catch_exception (exception=exception@entry=0x7ffd1431c8c0,
operate=0x7f3a2ef31f40 <dlopen_doit>, args=0x7ffd1431c920) at dl-error-skeleton.c:196
#15 0x00007f3a2e4e727f in __GI__dl_catch_error (objname=0x555b13fb3a50, errstring=0x555b13fb3a58,
mallocedp=0x555b13fb3a48, operate=<optimized out>, args=<optimized out>) at dl-error-skeleton.c:215
#16 0x00007f3a2ef32745 in _dlerror_run (operate=operate@entry=0x7f3a2ef31f40 <dlopen_doit>,
args=args@entry=0x7ffd1431c920) at dlerror.c:162
#17 0x00007f3a2ef32051 in __dlopen (file=<optimized out>, mode=<optimized out>) at dlopen.c:87
#18 0x0000555b139337bc in _PyImport_FindSharedFuncptr ()
at /tmp/build/80754af9/python_1627392990942/work/Python/dynload_shlib.c:96
#19 0x0000555b13952c28 in _PyImport_LoadDynamicModuleWithSpec ()
at /tmp/build/80754af9/python_1627392990942/work/Python/importdl.c:129
#20 0x0000555b13952e73 in _imp_create_dynamic_impl.isra.15 (file=0x0, spec=0x7f39dd19c810)
at /tmp/build/80754af9/python_1627392990942/work/Python/import.c:2174
#21 _imp_create_dynamic () at /tmp/build/80754af9/python_1627392990942/work/Python/clinic/import.c.h:289
```
It seems that protobuf is failing inside torchtext. Do you have a minimal code snippet which would reproduce this issue?
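As a quick check (just a sketch, assuming the crash is triggered at import time by the protobuf symbols bundled into `_torchtext.so`, which is what frames #0-#7 of the backtrace suggest), you could try whether importing the two extensions on their own already segfaults:

```python
# Minimal import-order check (assumption: the segfault happens while
# dlopen-ing protobuf's C++ extension once _torchtext.so is already
# loaded, matching the backtrace above).
import torchtext                             # loads _torchtext.so first
from google.protobuf.pyext import _message   # the extension crashing above
print("both extensions imported without crashing")
```

If this snippet alone crashes, the issue is the import order / duplicated protobuf symbols rather than anything in your training script.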
```python
import multiprocessing

import torch
from torch.utils.data import DataLoader
from torchvision import models
from byol_pytorch import BYOL
import pytorch_lightning as pl

# test model, a resnet 50
resnet = models.resnet50(pretrained=True)

# constants
BATCH_SIZE = 1024
EPOCHS = 5
LR = 3e-4
NUM_GPUS = 2
IMAGE_SIZE = 64
NUM_WORKERS = multiprocessing.cpu_count()

# pytorch lightning module
class SelfSupervisedLearner(pl.LightningModule):
    def __init__(self, net, **kwargs):
        super().__init__()
        self.learner = BYOL(net, **kwargs)

    def forward(self, images):
        return self.learner(images)

    def training_step(self, images, _):
        loss = self.forward(images)
        return {'loss': loss}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=LR)

    def on_before_zero_grad(self, _):
        if self.learner.use_momentum:
            self.learner.update_moving_average()

# main
if __name__ == '__main__':
    model = SelfSupervisedLearner(
        resnet,
        image_size=IMAGE_SIZE,
        hidden_layer='avgpool',
        projection_size=256,
        projection_hidden_size=4096,
        moving_average_decay=0.99
    )

    trainer = pl.Trainer(
        gpus=NUM_GPUS,
        max_epochs=EPOCHS,
        accumulate_grad_batches=1,
        sync_batchnorm=True
    )

    # dummy data: indexing a tensor yields per-sample tensors, so a plain
    # random tensor can back a DataLoader directly (the original line used
    # the undefined name IMG_SIZE and passed the raw tensor to fit())
    train_loader = DataLoader(
        torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
    )
    trainer.fit(model, train_loader)
```
@ptrblck Would this work?