RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED in embedding layer

dhecloud · February 13, 2019, 4:29am

Hello, I have this module where I try to add a positional embedding to the Word Embeddings. However, there is no error in the below code. However, when I use the commented line in the forward pass below, it results in a RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED error.

Code:

class QuestionEmbeddingLayer(nn.Module):

    def __init__(self, vocab, args):
        super().__init__()
        
        self.word_embedding = nn.Embedding(num_embeddings=vocab.shape[0],embedding_dim=vocab.shape[1]) # dim=300
        torch.nn.init.xavier_uniform(self.word_embedding.weight)
        self.word_embedding.weight.requires_grad = False
        self.word_embedding.weight.data.copy_(vocab)
        
        self.position_enc = nn.Embedding(num_embeddings=150,embedding_dim=300)
        tmp = get_sinusoid_encoding_table(150, 300, padding_idx=0)
        self.position_enc.weight.data.copy_(tmp)
        self.position_enc.weight.requires_grad = False

        self.bilstm = nn.LSTM(input_size=300, hidden_size=int(args.d_model/2), num_layers=2, dropout=0.1, bidirectional=True, batch_first=False)

    def forward(self, q):
        
        word_emb = self.word_embedding(q.transpose(0,1))
        # word_emb = self.word_embedding(q.transpose(0,1)) + self.position_enc(q.transpose(0,1) <- this gives error!!!!!!
        output, (hidden, cell) = self.bilstm(word_emb)
        q = torch.cat((hidden[-1],hidden[-2]),1)
        
        # (batchsize, hiddensize*2)
        
        return q

def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table '''

    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    if padding_idx is not None:
        # zero vector for padding dimension
        sinusoid_table[padding_idx] = 0
    
    print(sinusoid_table.shape)
    return torch.Tensor(sinusoid_table).cuda()

Any idea why this is acting this way? I’ve checked the shapes, they are all correct. Im using Cuda 9, Pytorch 1.0.0

pramod.srinivasan · February 13, 2019, 7:05am

Might be because the self.word_embedding or q is not in GPU. Since the rest of the code is not available, have you checked if these tensors are in GPU by explicitly setting them to your_tensor.to(torch.device("gpu"))?

mdehouck · September 26, 2019, 11:31am

I just got the same problem on a piece of code that was otherwise running flawlessly.
The difference is that I was concatenating rather than summing the embedding.

I figured out that the error appeared when the size of the embedding is too small.
I mean, not the dimension but the number of item to embed.
In my case, embedding 4 items was problematic, in fact it was up to 13 but 14 worked fine.
So I just added 10 dummy tokens.

What is weird indeed is that the error only appears when you start to use those embedding in cat or sim.

Maybe someone knows why small embeddings are a problem.

Best

ptrblck · September 30, 2019, 8:04am

That’s kind of a weird issue. Do you have a small code example, so that we can have a look?

mdehouck · October 1, 2019, 10:06am

Here it is.

It is a bit longish, but it is self contained

import torch
import torch.nn as nn
from torch import tensor, cat
from tqdm import tqdm, trange
from random import seed

torch.manual_seed(0)
seed(0)

c = ['(', ')', '+', '-', '*', '/',
 '0', '1', '2', '3', '4', '5',
 '6', '7', '8', '9', ' ']
ioc = {c[i]:i for i in range(len(c))}


toc = {'(':'PAR', ')':'PAR',
   '+':'OP', '-':'OP' ,'*':'OP' ,'/':'OP' ,
   '0':'NUM', '1':'NUM', '2':'NUM', '3':'NUM', '4':'NUM', '5':'NUM', '6':'NUM', '7':'NUM', '8':'NUM', '9':'NUM',
   ' ':'SPC'}

types = list(toc.values())
types.sort()

iop = {types[i]:i for i in range(len(types))}




class Typed_LSTM(nn.Module):

    def __init__(self, input_dim, hidden_dim, num_layer, device):
        nn.Module.__init__(self)

        self.hidden_dim = hidden_dim
        self.device = device

        self.LSTM = {'_':nn.LSTM(input_dim, hidden_dim, num_layer)}
        for t, l in self.LSTM.items():
            self.add_module('LSTM'+t, l)


    def forward(self, xts):
        b = xts.shape[0]

        states = []

        hc = None
        for xt in xts:
            xt = xt.reshape((1, xt.shape[0], xt.shape[1]))
            s, hc = self.LSTM['_'](xt, hc)
        
            states.append(s)

        states = cat(states)
        
        return states



class PSolver(nn.Module):

    def __init__(self, ioc, iop, emb_dim, pos_dim, hidden_dim, num_layer, types, device):
        nn.Module.__init__(self)
        self.device = device
    
        self.ioc = ioc
        self.iop = iop
        #self.P = nn.Embedding(len(self.iop)+20, pos_dim) # add some dummy embeddings to avoid the cudnn bug
        self.P = nn.Embedding(len(self.iop), pos_dim) 
        self.E = nn.Embedding(len(self.ioc), emb_dim) 

        self.LSTM = Typed_LSTM(emb_dim+pos_dim, hidden_dim, num_layer, device)
        #self.LSTM = nn.LSTM(emb_dim+pos_dim, hidden_dim)

        self.H = nn.Linear(hidden_dim, hidden_dim)
        self.O = nn.Linear(hidden_dim, 1, bias=True)

        self.loss = nn.L1Loss(reduction='sum')
    
        self.trainer = torch.optim.Adadelta(self.parameters())

        self.to(device)


    def encode(self, exp, tags):
        emb = self.E(tensor([[self.ioc[c] for c in exp]], device=self.device)).transpose(0,1)
        pemb = self.P(tensor([[self.iop[p] for p in tags]], device=self.device)).transpose(0,1)

        emb = cat([emb, pemb], dim=2)
        state = self.LSTM(emb)[-1][-1]
        value = self.H(state)        
        value = self.O(value)
    
        return value


    def test(self, exp, tags):

        value = self.encode(exp, tags)
    
        return value.item()



device = torch.device('cuda:0')

solver = PSolver(ioc, iop, 10, 10, 50, 1, types, device)

exp = '1+2-3+4-5+6-7'
tags = ['NUM', 'OP', 'NUM', 'OP', 'NUM', 'OP', 'NUM', 'OP', 'NUM', 'OP', 'NUM', 'OP', 'NUM']

v = solver.test(exp, tags)

The problems is around line 69/70 where adding a few dummy embeddings resolves the cuDNN problem.

The problem otherwise appears with both the “house” lstm and torch native lstm.

At times, on top of the cuDNN error, it also throws a bunch of :
/pytorch/aten/src/THC/THCTensorIndex.cu:308: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = 2, SrcDim = 2, IdxDim = -2]: block: [0,0,0], thread: [0,0,0] Assertion srcIndex < srcSelectDimSize failed.

But it is not alway easy to reproduce this one.

If it helps I’ll be glad to know.

Best