CUDA error: device-side assert triggered while calculating loss for LSTM

Dylan_Yung · March 1, 2020, 10:43am

I have this code

def lstmTrain(rnn: RNN, input_line_tensor: torch.Tensor, target_line_tensor: torch.Tensor):

    rnn.train()

    criterion = nn.NLLLoss()

    hidden = rnn.initHidden()

    hidden = (hidden[0].to(DEVICE), hidden[1].to(DEVICE))

    rnn.zero_grad()

    loss = 0

    for i in range(input_line_tensor.size(0)):

        # input_line_tensor[i] is [1, input categories]

        output, hidden = rnn(input_line_tensor[i].unsqueeze(1), hidden)

        # output is [1,1,output categories]

        # target_line_tensor[i] is [1,30]

        l = criterion(output, target_line_tensor[i])

        loss += l

    loss.backward()

    for p in rnn.parameters():

        p.data.add_(-LR, p.grad.data)

    return output, loss.item() / input_line_tensor.size(0)

at loss += l it’s throwing the error in the title. Here is my rnn code as well

import torch

import torch.nn as nn

class LSTM(nn.Module):

    def __init__(self, input_size: int, num_layers: int, hidden_sz: int, output_size: int):

        super(LSTM, self).__init__()

        self.hidden_size = hidden_sz

        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size= input_size, hidden_size=hidden_sz, num_layers=num_layers)

        self.fc1 = nn.Linear(hidden_sz, output_size)

        self.dropout = nn.Dropout(0.1)

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input: torch.Tensor, hidden: torch.Tensor):

        output, hidden = self.lstm(input, hidden)

        output = self.fc1(output)

        output = self.dropout(output)

        output = self.softmax(output)

        return output, hidden

    def initHidden(self, batch_size=1):

        return (torch.zeros(self.num_layers, batch_size, self.hidden_size),

                torch.zeros(self.num_layers, batch_size, self.hidden_size))

ptrblck · March 2, 2020, 1:16am

Could you run the code with CUDA_LAUNCH_BLOCKING=1 python script.py args and post the stack trace?

Dylan_Yung · March 2, 2020, 7:48am

I got

Exception has occurred: RuntimeError
cuda runtime error (710) : device-side assert triggered at C:/w/1/s/windows/pytorch/aten/src\THCUNN/generic/SpatialClassNLLCriterion.cu:127

ptrblck · March 2, 2020, 8:04am

This error is most likely thrown, if you pass an invalid index.
nn.CrossEntropyLoss (and thus also nn.NLLLoss, which will be called internally) expect a target tensor as a LongTensor containing the class indices in the range [0, nb_classes-1].
This error will be thrown, if you pass indices outside of the mentioned range.