This is a weird cuda error that I get. If someone could just explain what it means or what is causing it, that would be awesome!
Traceback (most recent call last):
  File "main.py", line 40, in <module>
    train_correspondence_block(root_dir)
  File "/home/jovyan/work/correspondence_block.py", line 82, in train_correspondence_block
    loss.backward()
  File "/opt/venv/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/opt/venv/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED (createCuDNNHandle at /pytorch/aten/src/ATen/cudnn/Handle.cpp:9)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x46 (0x7f0c87e01536 in /opt/venv/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: + 0x10b29d8 (0x7f0c8930f9d8 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #2: at::native::getCudnnHandle() + 0xe54 (0x7f0c893111b4 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #3: + 0xf2bcfc (0x7f0c89188cfc in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #4: + 0xf2cd91 (0x7f0c89189d91 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #5: + 0xf30dcb (0x7f0c8918ddcb in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #6: at::native::cudnn_convolution_backward_input(c10::ArrayRef, at::Tensor const&, at::Tensor const&, c10::ArrayRef, c10::ArrayRef, c10::ArrayRef, long, bool, bool) + 0xb2 (0x7f0c8918e322 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #7: + 0xf97e40 (0x7f0c891f4e40 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #8: + 0xfdc6d8 (0x7f0c892396d8 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #9: at::native::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef, c10::ArrayRef, c10::ArrayRef, long, bool, bool, std::array<bool, 2ul>) + 0x4fa (0x7f0c8918f9ba in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #10: + 0xf9816b (0x7f0c891f516b in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #11: + 0xfdc734 (0x7f0c89239734 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #12: + 0x2c809b6 (0x7f0cc278f9b6 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #13: + 0x2cd0444 (0x7f0cc27df444 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::generated::CudnnConvolutionBackward::apply(std::vector<at::Tensor, std::allocatorat::Tensor >&&) + 0x378 (0x7f0cc23a7918 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #15: + 0x2d89c05 (0x7f0cc2898c05 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::evaluate_function(std::shared_ptrtorch::autograd::GraphTask&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7f0cc2895f03 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::thread_main(std::shared_ptrtorch::autograd::GraphTask const&, bool) + 0x3d2 (0x7f0cc2896ce2 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::Engine::thread_init(int) + 0x39 (0x7f0cc288f359 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7f0ccefce828 in /opt/venv/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #20: + 0xb9e6f (0x7f0cf1310e6f in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #21: + 0x74a4 (0x7f0cf8b804a4 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #22: clone + 0x3f (0x7f0cf81b7d0f in /lib/x86_64-linux-gnu/libc.so.6)
This is my train_correspondence_block function:
def train_correspondence_block(root_dir, epochs=20):
    """Train the UNet correspondence block on the occlusion dataset.

    Splits the dataset into train/validation subsets, trains for `epochs`
    epochs with three cross-entropy losses (class-id mask, U mask, V mask),
    and checkpoints the model to 'correspondence_block.pt' whenever the
    validation loss improves.

    Parameters
    ----------
    root_dir : str
        Root directory of the dataset (passed to OcclusionDataset).
    epochs : int, optional
        Number of training epochs (default 20).
    """
    # Dataset for the correspondence block.
    # NOTE(review): `classes`, `OcclusionDataset`, `UNET`, and `transforms`
    # are assumed to be defined/imported at module level — confirm.
    train_data_CB = OcclusionDataset(
        root_dir, classes=classes,
        transform=transforms.Compose([transforms.ToTensor()]))

    batch_size = 4
    num_workers = 0
    valid_size = 0.2  # fraction of samples held out for validation

    # Obtain training indices that will be used for validation.
    num_train = len(train_data_CB)
    indices = list(range(num_train))
    np.random.shuffle(indices)
    split = int(np.floor(valid_size * num_train))
    train_idx, valid_idx = indices[split:], indices[:split]

    # Samplers for obtaining training and validation batches.
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    # Data loaders (combine dataset and sampler).
    train_loader = torch.utils.data.DataLoader(
        train_data_CB, batch_size=batch_size,
        sampler=train_sampler, num_workers=num_workers)
    valid_loader = torch.utils.data.DataLoader(
        train_data_CB, batch_size=batch_size,
        sampler=valid_sampler, num_workers=num_workers)

    correspondence_block = UNET.UNet(
        n_channels=3, out_channels_id=9, out_channels_uv=256, bilinear=True)
    correspondence_block.cuda()

    # One cross-entropy loss per predicted mask.
    criterion_id = nn.CrossEntropyLoss()
    criterion_u = nn.CrossEntropyLoss()
    criterion_v = nn.CrossEntropyLoss()
    optimizer = optim.Adam(correspondence_block.parameters(),
                           lr=3e-4, weight_decay=3e-5)

    n_epochs = epochs
    valid_loss_min = np.Inf  # track change in validation loss

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        correspondence_block.train()
        for image, idmask, umask, vmask in train_loader:
            # Move tensors to GPU.
            image, idmask, umask, vmask = (
                image.cuda(), idmask.cuda(), umask.cuda(), vmask.cuda())
            optimizer.zero_grad()
            idmask_pred, umask_pred, vmask_pred = correspondence_block(image)
            loss_id = criterion_id(idmask_pred, idmask)
            loss_u = criterion_u(umask_pred, umask)
            loss_v = criterion_v(vmask_pred, vmask)
            loss = loss_id + loss_u + loss_v
            loss.backward()
            optimizer.step()
            # BUG FIX: loss.item() is a per-batch *mean*; weight it by the
            # batch size so dividing by len(sampler) below yields the true
            # per-sample average even when the last batch is partial.
            train_loss += loss.item() * image.size(0)

        ######################
        # validate the model #
        ######################
        correspondence_block.eval()
        # BUG FIX: disable autograd during validation. Without this the
        # graph is retained for every validation batch, steadily eating GPU
        # memory — exhausted GPU memory is the usual cause of the
        # "CUDNN_STATUS_NOT_INITIALIZED" error seen during backward().
        with torch.no_grad():
            for image, idmask, umask, vmask in valid_loader:
                image, idmask, umask, vmask = (
                    image.cuda(), idmask.cuda(), umask.cuda(), vmask.cuda())
                idmask_pred, umask_pred, vmask_pred = correspondence_block(image)
                loss_id = criterion_id(idmask_pred, idmask)
                loss_u = criterion_u(umask_pred, umask)
                loss_v = criterion_v(vmask_pred, vmask)
                loss = loss_id + loss_u + loss_v
                valid_loss += loss.item() * image.size(0)

        # Average losses over the number of samples actually seen.
        train_loss = train_loss / len(train_loader.sampler)
        valid_loss = valid_loss / len(valid_loader.sampler)

        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))

        # Save model if validation loss has decreased.
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
                valid_loss_min, valid_loss))
            torch.save(correspondence_block.state_dict(), 'correspondence_block.pt')
            valid_loss_min = valid_loss