Free(): invalid next size (normal) with ported cpp code

Ahmed_Abdelaziz · August 25, 2020, 4:00pm

I ported a CPP code into my PyTorch code. Every time I start training I randomly see this error at random batches during training. I couldn’t debug it and I don’t know what can cause the problem.

free(): invalid next size (normal)

I use the below script to compile and port the cpp extension

from setuptools import setup, Extension
from torch.utils import cpp_extension
import os

setup(name='span_prune_cpp',
      ext_modules=[cpp_extension.CppExtension('span_prune_cpp', ['span_prune.cpp'])],
      cmdclass={'build_ext': cpp_extension.BuildExtension})

Cpp code if it helps in anyway

#include <torch/extension.h>

#include <cstdio>
#include <time.h>
#include <stdlib.h>

#include <iostream>
#include <pybind11/pybind11.h>

using namespace std;


torch::Tensor extract_spans(
    torch::Tensor span_scores,
    torch::Tensor candidate_starts,
    torch::Tensor candidate_ends,
    torch::Tensor num_output_spans,
    int max_sentence_length,
    bool _sort_spans,
    bool _suppress_crossing
) {

   time_t givemetime = time(NULL);

    int num_sentences = span_scores.size(0);
    int num_input_spans = span_scores.size(1);
    int max_num_output_spans = 0;


    for (int i = 0; i < num_sentences; i++) {

      if (num_output_spans[i].item<int64_t>() > max_num_output_spans) {
        max_num_output_spans = num_output_spans[i].item<int64_t>();
      }
    }

    std::vector<std::vector<int>> sorted_input_span_indices(num_sentences,
                                                            std::vector<int>(num_input_spans));

    torch::Tensor output_span_indices = torch::ones({num_sentences, max_num_output_spans});

    for (int i = 0; i < num_sentences; i++) {
      std::iota(sorted_input_span_indices[i].begin(), sorted_input_span_indices[i].end(), 0);
      std::sort(sorted_input_span_indices[i].begin(), sorted_input_span_indices[i].end(),
                [&span_scores, &i](int j1, int j2) {
                 if (j1 >= span_scores.size(1) || j1 < 0 || j2 >= span_scores.size(1) || j2 < 0) {
                    return false;
                 }

                  return span_scores[i][j2].item<int64_t>() < span_scores[i][j1].item<int64_t>();
                });
    }


    for (int l = 0; l < num_sentences; l++) {

      std::vector<int> top_span_indices;
      std::unordered_map<int, int> end_to_earliest_start;
      std::unordered_map<int, int> start_to_latest_end;
      int current_span_index = 0, num_selected_spans = 0;

      while (num_selected_spans < num_output_spans[l].item<int64_t>() && current_span_index < num_input_spans) {
        int i = sorted_input_span_indices[l][current_span_index];
        bool any_crossing = false;
        if (_suppress_crossing) {
          const int& start = candidate_starts[l][i].item<int64_t>();
          const int& end = candidate_ends[l][i].item<int64_t>();

          for (int j = start; j <= end; ++j) {
            if (j > start) {
              auto latest_end_iter = start_to_latest_end.find(j);
              if (latest_end_iter != start_to_latest_end.end() && latest_end_iter->second > end) {
                // Given (), exists [], such that ( [ ) ]
                any_crossing = true;
                break;
              }
            }
            if (j < end) {
              auto earliest_start_iter = end_to_earliest_start.find(j);
              if (earliest_start_iter != end_to_earliest_start.end() && earliest_start_iter->second < start) {
                // Given (), exists [], such that [ ( ] )
                any_crossing = true;
                break;
              }
            }
          }
        }
        if (!any_crossing) {
          if (_sort_spans) {
            top_span_indices.push_back(i);
          } else {
            output_span_indices[l][num_selected_spans] = i;
          }
          ++num_selected_spans;
          if (_suppress_crossing) {
            // Update data struct.
            const int& start = candidate_starts[l][i].item<int64_t>();
            const int& end = candidate_ends[l][i].item<int64_t>();
            auto latest_end_iter = start_to_latest_end.find(start);
            if (latest_end_iter == start_to_latest_end.end() || end > latest_end_iter->second) {
              start_to_latest_end[start] = end;
            }
            auto earliest_start_iter = end_to_earliest_start.find(end);
            if (earliest_start_iter == end_to_earliest_start.end() || start < earliest_start_iter->second) {
              end_to_earliest_start[end] = start;
            }
          }
        }
        ++current_span_index;
      }


      // Sort and produce span indices.
      if (_sort_spans) {

        std::sort(top_span_indices.begin(), top_span_indices.end(),
                [&candidate_starts, &candidate_ends, &l] (int i1, int i2) {
                 if (i1 >= candidate_starts.size(1) || i1 < 0 || i2 >= candidate_starts.size(1) || i2 < 0) {
                    return false;
                 }
                  if (candidate_starts[l][i1].item<int64_t>() < candidate_starts[l][i2].item<int64_t>()) {
                    return true;
                  } else if (candidate_starts[l][i1].item<int64_t>() > candidate_starts[l][i1].item<int64_t>()) {
                    return false;
                  } else if (candidate_ends[l][i1].item<int64_t>() < candidate_ends[l][i1].item<int64_t>()) {
                    return true;
                  } else if (candidate_ends[l][i1].item<int64_t>() > candidate_ends[l][i1].item<int64_t>()) {
                    return false;
                  } else {
                    return i1 < i2;
                  }
                });



        for (int i = 0; i < num_output_spans[l].item<int64_t>(); ++i) {
          output_span_indices[l][i] = top_span_indices[i];
        }
      }

      // Pad with the last selected span index to ensure monotonicity.
      int last_selected = num_selected_spans - 1;
      if (last_selected >= 0) {
        for (int i = num_selected_spans; i < max_num_output_spans; ++i) {
          output_span_indices[l][i]= output_span_indices[l][last_selected].item<int64_t>();
        }
      }


    }


    return output_span_indices;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("extract_spans", &extract_spans, "extract_spans");
}

Please let me know if I can share any additional information. I debugged all tensors fed to the extract_spans function and it holds valid numerical values.

albanD · August 25, 2020, 5:10pm

Hi,

Did you run your program in gdb to know where the error comes from by any chance?

Ahmed_Abdelaziz · August 25, 2020, 5:20pm

I didn’t run it in gdb because it sounds complicated, I think I will have to compile pytorch again, is that right? Plus this is totally random and it occurs after many batches (which takes hours) so I am not quite sure how to approach this. I tried logging progress to a file but still cannot infer the root cause of the issue

albanD · August 25, 2020, 5:28pm

Hi,

No it is quite simple.
If you run your script with python your_script.py --args foo usually. You can do:

gdb python
# Wait to be in gdb cli
r your_script.py --args foo
# Wait for your script to crash
# Then you can use `bt` to get a stack trace
bt

This will give us quite a bit of information even without a debug build

Ahmed_Abdelaziz · August 25, 2020, 5:30pm

Cool I don’t have gdb installed on my server but will reach out to the server admin. Thanks a lot!

Ahmed_Abdelaziz · August 31, 2020, 6:14pm

I got this stack trace today.

#0  0x00007ffff7803f47 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007ffff78058b1 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007ffff784e907 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#3  0x00007ffff785597a in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#4  0x00007ffff785cef4 in free () from /lib/x86_64-linux-gnu/libc.so.6
#5  0x00007fff5506a425 in __gnu_cxx::new_allocator<int>::deallocate (this=0x7fffffffcfd0, __p=<optimized out>)
    at /usr/include/c++/7/ext/new_allocator.h:125
#6  std::allocator_traits<std::allocator<int> >::deallocate (__a=..., __n=<optimized out>, __p=<optimized out>)
    at /usr/include/c++/7/bits/alloc_traits.h:462
#7  std::_Vector_base<int, std::allocator<int> >::_M_deallocate (this=0x7fffffffcfd0, __n=<optimized out>, __p=<optimized out>)
    at /usr/include/c++/7/bits/stl_vector.h:180
#8  std::_Vector_base<int, std::allocator<int> >::~_Vector_base (this=0x7fffffffcfd0, __in_chrg=<optimized out>)
    at /usr/include/c++/7/bits/stl_vector.h:162
#9  std::vector<int, std::allocator<int> >::~vector (this=0x7fffffffcfd0, __in_chrg=<optimized out>) at /usr/include/c++/7/bits/stl_vector.h:435
#10 extract_spans (span_scores=..., candidate_starts=..., candidate_ends=..., num_output_spans=..., max_sentence_length=<optimized out>,
    _sort_spans=<optimized out>, _suppress_crossing=<optimized out>) at span_prune.cpp:75
#11 0x00007fff5507c0c6 in pybind11::detail::argument_loader<at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool>::call_impl<at::Tensor, at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), 0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, pybind11::detail::void_type> (
    f=<optimized out>, this=0x7fffffffd460)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1931
#12 pybind11::detail::argument_loader<at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool>::call<at::Tensor, pybind11::detail::void_type, at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool)>(at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool)) && (f=<optimized out>, this=<optimized out>)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/cast.h:1908
#13 pybind11::cpp_function::initialize<at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool, pybind11::name, pybind11::scope, pybind11::sibling, char [14]>(at::Tensor (*&)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), at::Tensor (*)(at::Tensor, at::Tensor, at::Tensor, at::Tensor, int, bool, bool), pybind11::name const&, pybind11::scope const&, pybind11::sibling const&, char const (&) [14])::{lambda(pybind11::detail::function_call&)#3}::operator()(pybind11::detail::function_call&) const (__closure=<optimized out>, call=...)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:155
#14 0x00007fff55078690 in pybind11::cpp_function::dispatcher (self=<optimized out>, args_in=0x7ffec673e4b0, kwargs_in=0x0)
    at /mnt/data2/hussein/miniconda3/envs/torch-sci/lib/python3.7/site-packages/torch/include/pybind11/pybind11.h:620
#15 0x00005555556b9c94 in _PyMethodDef_RawFastCallKeywords (method=0x5555574dfb40, self=0x7ffff5f24750, args=0x555625428c58,
    nargs=<optimized out>, kwnames=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:694
#16 0x00005555556b9db1 in _PyCFunction_FastCallKeywords (func=0x7fff5511ebe0, args=<optimized out>, nargs=<optimized out>, kwnames=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:734
#17 0x00005555557255be in call_function (kwnames=0x0, oparg=7, pp_stack=<synthetic pointer>)
    at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:4568
#18 _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:3093
#19 0x000055555566a31b in function_code_fastcall (globals=<optimized out>, nargs=2, args=<optimized out>, co=0x7ffef5921a50)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:283
#20 _PyFunction_FastCallDict (func=<optimized out>, args=0x7fffffffd960, nargs=2, kwargs=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:322
#21 0x0000555555688b93 in _PyObject_Call_Prepend (callable=0x7ffef5928320, obj=<optimized out>, args=0x7ffec68de050, kwargs=0x7ffec6a20410)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:908
#22 0x000055555567b95e in PyObject_Call (callable=0x7ffec6abbaa0, args=<optimized out>, kwargs=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:245
#23 0x000055555572251a in do_call_core (kwdict=0x7ffec6a20410, callargs=0x7ffec68de050, func=0x7ffec6abbaa0)
    at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:4645
#24 _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:3191
#25 0x00005555556692b9 in _PyEval_EvalCodeWithName (_co=0x7fff556e6030, globals=<optimized out>, locals=<optimized out>, args=<optimized out>,
    argcount=<optimized out>, kwnames=0x0, kwargs=0x0, kwcount=<optimized out>, kwstep=2, defs=0x0, defcount=0, kwdefs=0x0, closure=0x0,
    name=0x7ffff6c651b0, qualname=0x7fff556e5230) at /tmp/build/80754af9/python_1588882889832/work/Python/ceval.c:3930
#26 0x000055555566a3e5 in _PyFunction_FastCallDict (func=<optimized out>, args=0x7fffffffdcc0, nargs=2, kwargs=<optimized out>)
    at /tmp/build/80754af9/python_1588882889832/work/Objects/call.c:376

Any idea what is going on here?

albanD · August 31, 2020, 6:40pm

The errors points at the destructor of a std::vector<int>.
Are you doing anything funky with one of these? Like stealing it’s data? Or building it with stolen data?

Ahmed_Abdelaziz · September 1, 2020, 9:16am

I don’t think I am doing any illegal operation to it.

I first initialised it (Any btw this is the only std::vector<int> in my code)

std::vector<int> top_span_indices;

Push some data into it

top_span_indices.push_back(i);

Sort all elements with a custom comparator

std::sort(top_span_indices.begin(), top_span_indices.end(),
                [&candidate_starts, &candidate_ends, &l] (int i1, int i2) {
                 if (i1 >= candidate_starts.size(1) || i1 < 0 || i2 >= candidate_starts.size(1) || i2 < 0) {
                    return false;
                 }
                  if (candidate_starts[l][i1].item<int64_t>() < candidate_starts[l][i2].item<int64_t>()) {
                    return true;
                  } else if (candidate_starts[l][i1].item<int64_t>() > candidate_starts[l][i2].item<int64_t>()) {
                    return false;
                  } else if (candidate_ends[l][i1].item<int64_t>() < candidate_ends[l][i2].item<int64_t>()) {
                    return true;
                  } else if (candidate_ends[l][i1].item<int64_t>() > candidate_ends[l][i2].item<int64_t>()) {
                    return false;
                  } else {
                    return i1 < i2;
                  }
                });

And finally copied its data (which I believe is copying by value here)

for (int i = 0; i < num_output_spans[l].item<int64_t>(); ++i) {
          output_span_indices[l][i] = top_span_indices[i];
        }

Do you think am doing anything wrong here?

Ahmed_Abdelaziz · September 1, 2020, 9:27am

Could it be that vector too?
std::vector<std::vector<int>> sorted_input_span_indices(num_sentences, std::vector<int>(num_input_spans));

albanD · September 1, 2020, 2:11pm

It’s hard to say.
The way I would debug this is remove pieces of the code until it works fine. And then add them back one by one. Until you find the one thing that makes it fail

Ahmed_Abdelaziz · September 1, 2020, 5:27pm

Ok wil give it a try. Thanks!