I would use to(device)
, since it makes it easier to write device-agnostic code.
In fact, I removed all cuda()
calls from your code on my machine and replaced them with:
device = 'cpu'
#device = 'cuda:0'
model = model.to(device)
to easily switch between CPU and GPU runs.
Here is the gdb backtrace
:
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
#1 0x00007ffff7805801 in __GI_abort () at abort.c:79
#2 0x00007ffff784e897 in __libc_message (action=action@entry=do_abort,
fmt=fmt@entry=0x7ffff797bb9a "%s\n") at ../sysdeps/posix/libc_fatal.c:181
#3 0x00007ffff785590a in malloc_printerr (
str=str@entry=0x7ffff797d8b8 "free(): invalid next size (normal)") at malloc.c:5350
#4 0x00007ffff785d0ad in _int_free (have_lock=0, p=0x7fff58005310, av=0x7fff58000020)
at malloc.c:4286
#5 __GI___libc_free (mem=0x7fff58005320) at malloc.c:3124
#6 0x00007fffa9cd5b72 in cudnn::maxwell::gemm::conv2d(cudnnContext*, void const*, cudnnTensor4dStruct*, void const*, cudnnFilter4dStruct*, void const*, cudnnConvolutionStruct*, cudnnConvWorkingStruct const*, void*, unsigned long, void const*, cudnnTensor4dStruct*, void*, cudnn::maxwell::gemm::Conv2dType_t, cudnn::maxwell::gemm::Conv2dConfig&, bool, void const*, cudnnActivationStruct*, void*) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#7 0x00007fffa9d4ad0b in cudnn::wgrad2d::invokeFfmaKernel(cudnnContext*, void const*, cudnnTensor4dStruct*, void const*, cudnnTensor4dStruct*, void const*, cudnnConvolutionStruct*, cudnnConvWorkingStruct const*, cudnnConvolutionBwdFilterAlgo_t, void*, unsigned long, void const*, cudnnFilter4dStruct*, void*, cudnnStatus_t*) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#8 0x00007fffa9d4f96e in cudnnConvolution4dBackwardFilter(cudnnContext*, void const*, cudnnTensor4dStruct*, void const*, cudnnTensor4dStruct*, void const*, cudnnConvolutionStruct*, cudnnConvWorkingStruct const*, cudnnConvolutionBwdFilterAlgo_t, void*, unsigned long, void const*, cudnnFilter4dStruct*, void*) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#9 0x00007fffa9a2435c in cudnnConvolutionBackwardFilterInternal(cudnnContext*, void const*, cudnnTensorStruct*, void const*, cudnnTensorStruct*, void const*, cudnnConvolutionStruct*, cudnnConvolutionBwdFilterAlgo_t, void*, unsigned long, void const*, cudnnFilterStruct*, void*) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#10 0x00007fffa9a24b08 in cudnnConvolutionBackwardFilter ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#11 0x00007fffa7306da4 in at::native::raw_cudnn_convolution_backward_weight_out(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#12 0x00007fffa73074f7 in at::native::cudnn_convolution_backward_weight(char const*, c10::ArrayRef<long>, at::TensorArg const&, at::TensorArg const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#13 0x00007fffa7307827 in at::native::cudnn_convolution_backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#14 0x00007fffa73dc66b in at::CUDAFloatType::cudnn_convolution_backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) const ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_g---Type <return> to continue, or q <return> to quit---
pu.so
#15 0x00007fffa73039f2 in at::native::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 3ul>) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#16 0x00007fffa73dc832 in at::CUDAFloatType::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 3ul>) const ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libcaffe2_gpu.so
#17 0x00007fffa0e1966a in torch::autograd::VariableType::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 3ul>) const ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libtorch.so.1
#18 0x00007fffa0c52b66 in torch::autograd::generated::CudnnConvolutionBackward::apply(std::vector<torch::autograd::Variable, std::allocator<torch::autograd::Variable> >&&) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libtorch.so.1
#19 0x00007fffa0c2869e in torch::autograd::Engine::evaluate_function(torch::autograd::FunctionTask&)
()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libtorch.so.1
#20 0x00007fffa0c2a770 in torch::autograd::Engine::thread_main(torch::autograd::GraphTask*) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libtorch.so.1
#21 0x00007fffa0c27222 in torch::autograd::Engine::thread_init(int) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libtorch.so.1
#22 0x00007fffe67c54ca in torch::autograd::python::PythonEngine::thread_init(int) ()
from /home/ptrblck/anaconda3/envs/pytorch_latest/lib/python3.7/site-packages/torch/lib/libtorch_python.so
#23 0x00007fffe7534678 in std::execute_native_thread_routine_compat (__p=<optimized out>)
at /opt/conda/conda-bld/compilers_linux-64_1534514838838/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:94
#24 0x00007ffff7bbd6db in start_thread (arg=0x7fff90ffd700) at pthread_create.c:463
#25 0x00007ffff78e688f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
I’m not sure what’s going on and would like to invite some experts on this topic.
CC @colesbury, @albanD
Any ideas how to debug this issue further?
EDIT:
It looks like this issue is related to cuDNN.
@Dan_Erez
You could set torch.backends.cudnn.enabled = False
at the beginning of your script for now to avoid this problem.