`CUDNN_STATUS_NOT_SUPPORTED` with NVIDIA profiler

I am running into a strange cuDNN error when trying to profile a network with nvprof.

The model runs fine without the profiler, but when I run it under the profiler with `nvprof --metrics flop_count_sp,flop_count_hp --log-file mylog.log python main.py`, I get the following error:

Traceback (most recent call last):
  File "main.py", line 285, in <module>
    main()
  File "main.py", line 272, in main
    loss = train(epoch, train_loader, optimizer, criterion, train_size, args)
  File "main.py", line 100, in train
    out_subsite, out_laterality, out_behavior, out_histology, out_grade = model(sentence)
  File "/home/ygx/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ygx/support/mtcnn/hongjunsynth/model.py", line 138, in forward
    conv_results.append(self.convblock1(x).view(-1, self.num_filters1))
  File "/home/ygx/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ygx/anaconda3/lib/python3.6/site-packages/torch/nn/modules/container.py", line 91, in forward
    input = module(input)
  File "/home/ygx/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ygx/anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 176, in forward
    self.padding, self.dilation, self.groups)
RuntimeError: CUDNN_STATUS_NOT_SUPPORTED. This error may appear if you passed in a non-contiguous input.

Here is a quick sanity check for the model:

import torch
import torch.nn as nn


class MTCNN(nn.Module):
    def __init__(self, kernel1=3, kernel2=4, kernel3=5, kernel4=6, num_filters1=50,
                 num_filters2=50, num_filters3=50, num_filters4=50, dropout1=0.5, dropout2=0.5, dropout3=0.5,
                 dropout4=0.0, max_sent_len=3000, word_dim=256, vocab_size=35095, subsite_size=34, #35093
                 laterality_size=4, behavior_size=3, grade_size=5, histology_size=44, alt_model_type=None):
        """Multi-task CNN"""
        super(MTCNN, self).__init__()
        self.kernel1 = kernel1
        self.kernel2 = kernel2
        self.kernel3 = kernel3
        self.kernel4 = kernel4
        self.num_filters1 = num_filters1
        self.num_filters2 = num_filters2
        self.num_filters3 = num_filters3
        self.num_filters4 = num_filters4
        self.max_sent_len = max_sent_len
        self.dropout1 = dropout1
        self.dropout2 = dropout2
        self.dropout3 = dropout3
        self.dropout4 = dropout4
        self.word_dim = word_dim
        self.vocab_size = vocab_size
        self.subsite_size = subsite_size
        self.laterality_size = laterality_size
        self.histology_size = histology_size
        self.behavior_size = behavior_size
        self.grade_size = grade_size
        self.alt_model_type = alt_model_type
        self._filter_sum = None
        self._sum_filters()

        self.embedding = nn.Embedding(self.vocab_size + 2, self.word_dim, padding_idx=0)
        #self.embedding.weight.data.copy_(torch.from_numpy(self.wv_matrix))

        if self.alt_model_type == 'static':
            self.embedding.weight.requires_grad = False
        elif self.alt_model_type == 'multichannel':
            self.embedding2 = nn.Embedding(self.vocab_size + 100, self.word_dim, padding_idx=self.vocab_size + 1)
            # NOTE: self.wv_matrix (a pretrained embedding matrix) is not defined in
            # this snippet, so the 'multichannel' branch is not exercised by the
            # sanity check below
            self.embedding2.weight.data.copy_(torch.from_numpy(self.wv_matrix))
            self.embedding2.weight.requires_grad = False
            self.IN_CHANNEL = 2

        self.convblock1 = nn.Sequential(
            nn.Conv1d(1, self.num_filters1, self.kernel1),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Dropout(p=self.dropout1)
        )

        self.convblock2 = nn.Sequential(
            nn.Conv1d(1, self.num_filters2, self.kernel2),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Dropout(p=self.dropout2)
        )

        self.convblock3 = nn.Sequential(
            nn.Conv1d(1, self.num_filters3, self.kernel3),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Dropout(p=self.dropout3)
        )

        self.convblock4 = nn.Sequential(
            nn.Conv1d(1, self.num_filters4, self.kernel4),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Dropout(p=self.dropout4)
        )

        self.fc1 = nn.Linear(self._filter_sum, self.subsite_size)
        self.fc2 = nn.Linear(self._filter_sum, self.laterality_size)
        self.fc3 = nn.Linear(self._filter_sum, self.behavior_size)
        self.fc4 = nn.Linear(self._filter_sum, self.histology_size)
        self.fc5 = nn.Linear(self._filter_sum, self.grade_size)

    def _sum_filters(self):
        """Get the total number of convolutional filters."""
        self._filter_sum = self.num_filters1 + self.num_filters2 + self.num_filters3 + self.num_filters4

    def forward(self, x):
        tokens = x  # keep the raw token ids for the optional second channel
        x = self.embedding(x).view(-1, 1, self.word_dim * self.max_sent_len)
        if self.alt_model_type == "multichannel":
            # embed the original token ids, not the already-embedded x
            x2 = self.embedding2(tokens).view(-1, 1, self.word_dim * self.max_sent_len)
            x = torch.cat((x, x2), 1)

        conv_results = []
        conv_results.append(self.convblock1(x).view(-1, self.num_filters1))
        conv_results.append(self.convblock2(x).view(-1, self.num_filters2))
        conv_results.append(self.convblock3(x).view(-1, self.num_filters3))
        conv_results.append(self.convblock4(x).view(-1, self.num_filters4))
        x = torch.cat(conv_results, 1)

        out_subsite = self.fc1(x)
        out_laterality = self.fc2(x)
        out_behavior = self.fc3(x)
        out_histology = self.fc4(x)
        out_grade = self.fc5(x)
        return out_subsite, out_laterality, out_behavior, out_histology, out_grade

model = MTCNN()
model = model.cuda().half()  # fp16 weights, so cuDNN runs the convolutions in half precision

x = torch.arange(0, 3000, dtype=torch.long).cuda()  # dummy token ids
model(x)

Has anyone run into cuDNN errors when running NVIDIA's profiler?

Here is my setup:

PyTorch version: 0.4.0
cuDNN version: 7102
CUDA version: 9.0
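
For reference, these numbers can also be printed from Python (assuming a standard PyTorch build):

import torch

print(torch.__version__)               # PyTorch version
print(torch.backends.cudnn.version())  # cuDNN version
print(torch.version.cuda)              # CUDA version PyTorch was built against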

Not sure if this will solve the problem (I haven't used the profiler yet), but based on the error message you got, maybe it only works with contiguous tensors? In that case, try appending a .contiguous() to all the places where you use .view(). E.g.,

self.convblock1(x).view(-1, self.num_filters1).contiguous()

instead of just

self.convblock1(x).view(-1, self.num_filters1)

Or first check which tensors are actually non-contiguous using .is_contiguous() and apply .contiguous() only where it's necessary, as in the sketch below.
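
A minimal, self-contained sketch of that check (ensure_contiguous is just a hypothetical helper name, not part of your model):

import torch

def ensure_contiguous(t):
    # copy only when the tensor is actually non-contiguous
    return t if t.is_contiguous() else t.contiguous()

x = torch.randn(2, 3, 4).transpose(1, 2)  # the transpose makes x non-contiguous
print(x.is_contiguous())                  # False
x = ensure_contiguous(x)
print(x.is_contiguous())                  # True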