Free(): invalid next size (normal) with ported cpp code

I ported a CPP code into my PyTorch code. Every time I start training I randomly see this error at random batches during training. I couldn’t debug it and I don’t know what can cause the problem.

free(): invalid next size (normal)

I use the below script to compile and port the cpp extension

from setuptools import setup, Extension
from torch.utils import cpp_extension
import os

setup(name='span_prune_cpp',
      ext_modules=[cpp_extension.CppExtension('span_prune_cpp', ['span_prune.cpp'])],
      cmdclass={'build_ext': cpp_extension.BuildExtension})

Cpp code if it helps in anyway

#include <torch/extension.h>

#include <cstdio>
#include <time.h>
#include <stdlib.h>

#include <iostream>
#include <pybind11/pybind11.h>

using namespace std;


torch::Tensor extract_spans(
    torch::Tensor span_scores,
    torch::Tensor candidate_starts,
    torch::Tensor candidate_ends,
    torch::Tensor num_output_spans,
    int max_sentence_length,
    bool _sort_spans,
    bool _suppress_crossing
) {

   time_t givemetime = time(NULL);

    int num_sentences = span_scores.size(0);
    int num_input_spans = span_scores.size(1);
    int max_num_output_spans = 0;


    for (int i = 0; i < num_sentences; i++) {

      if (num_output_spans[i].item<int64_t>() > max_num_output_spans) {
        max_num_output_spans = num_output_spans[i].item<int64_t>();
      }
    }

    std::vector<std::vector<int>> sorted_input_span_indices(num_sentences,
                                                            std::vector<int>(num_input_spans));

    torch::Tensor output_span_indices = torch::ones({num_sentences, max_num_output_spans});

    for (int i = 0; i < num_sentences; i++) {
      std::iota(sorted_input_span_indices[i].begin(), sorted_input_span_indices[i].end(), 0);
      std::sort(sorted_input_span_indices[i].begin(), sorted_input_span_indices[i].end(),
                [&span_scores, &i](int j1, int j2) {
                 if (j1 >= span_scores.size(1) || j1 < 0 || j2 >= span_scores.size(1) || j2 < 0) {
                    return false;
                 }

                  return span_scores[i][j2].item<int64_t>() < span_scores[i][j1].item<int64_t>();
                });
    }


    for (int l = 0; l < num_sentences; l++) {

      std::vector<int> top_span_indices;
      std::unordered_map<int, int> end_to_earliest_start;
      std::unordered_map<int, int> start_to_latest_end;
      int current_span_index = 0, num_selected_spans = 0;

      while (num_selected_spans < num_output_spans[l].item<int64_t>() && current_span_index < num_input_spans) {
        int i = sorted_input_span_indices[l][current_span_index];
        bool any_crossing = false;
        if (_suppress_crossing) {
          const int& start = candidate_starts[l][i].item<int64_t>();
          const int& end = candidate_ends[l][i].item<int64_t>();

          for (int j = start; j <= end; ++j) {
            if (j > start) {
              auto latest_end_iter = start_to_latest_end.find(j);
              if (latest_end_iter != start_to_latest_end.end() && latest_end_iter->second > end) {
                // Given (), exists [], such that ( [ ) ]
                any_crossing = true;
                break;
              }
            }
            if (j < end) {
              auto earliest_start_iter = end_to_earliest_start.find(j);
              if (earliest_start_iter != end_to_earliest_start.end() && earliest_start_iter->second < start) {
                // Given (), exists [], such that [ ( ] )
                any_crossing = true;
                break;
              }
            }
          }
        }
        if (!any_crossing) {
          if (_sort_spans) {
            top_span_indices.push_back(i);
          } else {
            output_span_indices[l][num_selected_spans] = i;
          }
          ++num_selected_spans;
          if (_suppress_crossing) {
            // Update data struct.
            const int& start = candidate_starts[l][i].item<int64_t>();
            const int& end = candidate_ends[l][i].item<int64_t>();
            auto latest_end_iter = start_to_latest_end.find(start);
            if (latest_end_iter == start_to_latest_end.end() || end > latest_end_iter->second) {
              start_to_latest_end[start] = end;
            }
            auto earliest_start_iter = end_to_earliest_start.find(end);
            if (earliest_start_iter == end_to_earliest_start.end() || start < earliest_start_iter->second) {
              end_to_earliest_start[end] = start;
            }
          }
        }
        ++current_span_index;
      }


      // Sort and produce span indices.
      if (_sort_spans) {

        std::sort(top_span_indices.begin(), top_span_indices.end(),
                [&candidate_starts, &candidate_ends, &l] (int i1, int i2) {
                 if (i1 >= candidate_starts.size(1) || i1 < 0 || i2 >= candidate_starts.size(1) || i2 < 0) {
                    return false;
                 }
                  if (candidate_starts[l][i1].item<int64_t>() < candidate_starts[l][i2].item<int64_t>()) {
                    return true;
                  } else if (candidate_starts[l][i1].item<int64_t>() > candidate_starts[l][i1].item<int64_t>()) {
                    return false;
                  } else if (candidate_ends[l][i1].item<int64_t>() < candidate_ends[l][i1].item<int64_t>()) {
                    return true;
                  } else if (candidate_ends[l][i1].item<int64_t>() > candidate_ends[l][i1].item<int64_t>()) {
                    return false;
                  } else {
                    return i1 < i2;
                  }
                });



        for (int i = 0; i < num_output_spans[l].item<int64_t>(); ++i) {
          output_span_indices[l][i] = top_span_indices[i];
        }
      }

      // Pad with the last selected span index to ensure monotonicity.
      int last_selected = num_selected_spans - 1;
      if (last_selected >= 0) {
        for (int i = num_selected_spans; i < max_num_output_spans; ++i) {
          output_span_indices[l][i]= output_span_indices[l][last_selected].item<int64_t>();
        }
      }


    }


    return output_span_indices;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("extract_spans", &extract_spans, "extract_spans");
}

Please let me know if I can share any additional information. I debugged all tensors fed to the extract_spans function and it holds valid numerical values.

Hi,

Did you run your program in gdb to know where the error comes from by any chance?

I didn’t run it in gdb because it sounds complicated, I think I will have to compile pytorch again, is that right? Plus this is totally random and it occurs after many batches (which takes hours) so I am not quite sure how to approach this. I tried logging progress to a file but still cannot infer the root cause of the issue

Hi,

No it is quite simple.
If you run your script with python your_script.py --args foo usually. You can do:

gdb python
# Wait to be in gdb cli
r your_script.py --args foo
# Wait for your script to crash
# Then you can use `bt` to get a stack trace
bt

This will give us quite a bit of information even without a debug build :slight_smile:

Cool I don’t have gdb installed on my server but will reach out to the server admin. Thanks a lot!

1 Like

I got this stack trace today.

#0  0x00007ffff7803f47 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007ffff78058b1 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007ffff784e907 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#3  0x00007ffff785597a in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4  0x00007ffff785cef4 in free () from /lib/x86_64-linux-gnu/libc.so.6
#5  0x00007fff5506a425 in __gnu_cxx::new_allocator<int>::deallocate (this=0x7fffffffcfd0, __p=<optimized out>)
    at /usr/include/c++/7/ext/new_allocator.h:125
#6  std::allocator_traits<std::allocator<int> >::deallocate (__a=..., __n=<optimized out>, __p=<optimized out>)
    at /usr/include/c++/7/bits/alloc_traits.h:462
#7  std::_Vector_base<int, std::allocator<int> >::_M_deallocate (this=0x7fffffffcfd0, __n=<optimized out>, __p=<optimized out>)
    at /usr/include/c++/7/bits/stl_vector.h:180
#8  std::_Vector_base<int, std::allocator<int> >::~_Vector_base (this=0x7fffffffcfd0, __in_chrg=<optimized out>)
    at /usr/include/c++/7/bits/stl_vector.h:162
#9  std::vector<int, std::allocator<int> >::~vector (this=0x7fffffffcfd0, __in_chrg=<optimized out>) at /usr/include/c++/7/bits/stl_vector.h:435
#10 extract_spans (span_scores=..., candidate_starts=..., candidate_ends=..., num_output_spans=..., max_sentence_length=<optimized out>,
    _sort_spans=<optimized out>, _suppress_crossing=<optimized out>) at span_prune.cpp:75
#11 0x00007fff5507c0c6 in pybind11::detail::argument_loader<at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool>::call_impl<at::Tensor, at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), 0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, pybind11::detail::void_type> (
    f=<optimized out>, this=0x7fffffffd460)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1931
#12 pybind11::detail::argument_loader<at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool>::call<at::Tensor, pybind11::detail::void_type, at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool)>(at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool)) && (f=<optimized out>, this=<optimized out>)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1908
#13 pybind11::cpp_function::initialize<at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool, pybind11::name, pybind11::scope, pybind11::sibling, char [14]>(at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), at::Tensor (*)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), pybind11::name const&, pybind11::scope const&, pybind11::sibling const&, char const (&) [14])::{lambda(pybind11::detail::function_call&)#3}::operator()(pybind11::detail::function_call&) const (__closure=<optimized out>, call=...)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:155
#14 0x00007fff55078690 in pybind11::cpp_function::dispatcher (self=<optimized out>, args_in=0x7ffec673e4b0, kwargs_in=0x0)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:620
#15 0x00005555556b9c94 in _PyMethodDef_RawFastCallKeywords (method=0x5555574dfb40, self=0x7ffff5f24750, args=0x555625428c58,
    nargs=<optimized out>, kwnames=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:694
#16 0x00005555556b9db1 in _PyCFunction_FastCallKeywords (func=0x7fff5511ebe0, args=<optimized out>, nargs=<optimized out>, kwnames=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:734
#17 0x00005555557255be in call_function (kwnames=0x0, oparg=7, pp_stack=<synthetic pointer>)
    at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:4568
#18 _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:3093
#19 0x000055555566a31b in function_code_fastcall (globals=<optimized out>, nargs=2, args=<optimized out>, co=0x7ffef5921a50)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:283
#20 _PyFunction_FastCallDict (func=<optimized out>, args=0x7fffffffd960, nargs=2, kwargs=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:322
#21 0x0000555555688b93 in _PyObject_Call_Prepend (callable=0x7ffef5928320, obj=<optimized out>, args=0x7ffec68de050, kwargs=0x7ffec6a20410)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:908
#22 0x000055555567b95e in PyObject_Call (callable=0x7ffec6abbaa0, args=<optimized out>, kwargs=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:245
#23 0x000055555572251a in do_call_core (kwdict=0x7ffec6a20410, callargs=0x7ffec68de050, func=0x7ffec6abbaa0)
    at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:4645
#24 _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:3191
#25 0x00005555556692b9 in _PyEval_EvalCodeWithName (_co=0x7fff556e6030, globals=<optimized out>, locals=<optimized out>, args=<optimized out>,
    argcount=<optimized out>, kwnames=0x0, kwargs=0x0, kwcount=<optimized out>, kwstep=2, defs=0x0, defcount=0, kwdefs=0x0, closure=0x0,
    name=0x7ffff6c651b0, qualname=0x7fff556e5230) at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:3930
#26 0x000055555566a3e5 in _PyFunction_FastCallDict (func=<optimized out>, args=0x7fffffffdcc0, nargs=2, kwargs=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:376

Any idea what is going on here?

The errors points at the destructor of a std::vector<int>.
Are you doing anything funky with one of these? Like stealing it’s data? Or building it with stolen data? :smiley:

I don’t think I am doing any illegal operation to it.

I first initialised it (Any btw this is the only std::vector<int> in my code)

std::vector<int> top_span_indices;

Push some data into it

top_span_indices.push_back(i);

Sort all elements with a custom comparator

std::sort(top_span_indices.begin(), top_span_indices.end(),
                [&candidate_starts, &candidate_ends, &l] (int i1, int i2) {
                 if (i1 >= candidate_starts.size(1) || i1 < 0 || i2 >= candidate_starts.size(1) || i2 < 0) {
                    return false;
                 }
                  if (candidate_starts[l][i1].item<int64_t>() < candidate_starts[l][i2].item<int64_t>()) {
                    return true;
                  } else if (candidate_starts[l][i1].item<int64_t>() > candidate_starts[l][i2].item<int64_t>()) {
                    return false;
                  } else if (candidate_ends[l][i1].item<int64_t>() < candidate_ends[l][i2].item<int64_t>()) {
                    return true;
                  } else if (candidate_ends[l][i1].item<int64_t>() > candidate_ends[l][i2].item<int64_t>()) {
                    return false;
                  } else {
                    return i1 < i2;
                  }
                });

And finally copied its data (which I believe is copying by value here)

for (int i = 0; i < num_output_spans[l].item<int64_t>(); ++i) {
          output_span_indices[l][i] = top_span_indices[i];
        }

Do you think am doing anything wrong here?

Could it be that vector too?
std::vector<std::vector<int>> sorted_input_span_indices(num_sentences, std::vector<int>(num_input_spans));

It’s hard to say.
The way I would debug this is remove pieces of the code until it works fine. And then add them back one by one. Until you find the one thing that makes it fail :slight_smile:

Ok wil give it a try. Thanks!