Getting "RuntimeError: CUDA error: device-side assert triggered" for an LSTM network

I’m trying to replicate a tutorial posted on Medium that demonstrates the use of an LSTM network for sentiment analysis. When I run it with PyTorch, I get this error: RuntimeError: CUDA error: device-side assert triggered. Could you please explain the reason for this error?

Full error stacktrace:


RuntimeError Traceback (most recent call last)

<ipython-input-16-40911770614d> in <module>() 197 198 if __name__ == '__main__': --> 199 main()

4 frames

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in convert(t) 439 if convert_to_format is not None and t.dim() == 4: 440 return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format) --> 441 return t.to(device, dtype if t.is_floating_point() else None, non_blocking) 442 443 return self._apply(convert)

RuntimeError: CUDA error: device-side assert triggered

Here is my script:

import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext.datasets import IMDB


def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp, in seconds.
        end_time: End timestamp, in seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints; both parts are truncated
        with ``int()`` rather than rounded.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

class LSTM_IMDB(nn.Module):
    """LSTM classifier: embedding -> (multi-layer) LSTM -> linear head.

    The final hidden state of the top LSTM layer is fed to a linear layer
    that produces ``output_dim`` raw scores (logits) per sequence.

    Args:
        input_dim: Vocabulary size (number of rows in the embedding table).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state (per direction).
        output_dim: Number of scores produced per sequence.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between LSTM layers.
        padding_idx: Index of the padding token; its embedding row is kept
            at zero and receives no gradient. ``None`` disables this.
        is_bidirectional: Whether the LSTM runs in both directions.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout=0.0, padding_idx=None, is_bidirectional=False):
        # nn.Module must be initialised before any sub-module is assigned.
        super().__init__()
        self.hidden_dim = hidden_dim
        # Honour the constructor argument instead of hard-coding False.
        self.is_bidirectional = is_bidirectional
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=is_bidirectional, dropout=dropout)
        # A bidirectional LSTM yields one final hidden state per direction;
        # forward() concatenates them, so the head sees twice the width.
        fc_input_dim = hidden_dim * 2 if is_bidirectional else hidden_dim
        self.fc = nn.Linear(fc_input_dim, output_dim)

    def forward(self, text, sequence_length):
        """Score a padded batch of token-index sequences.

        Args:
            text: Integer tensor of token indices, laid out
                ``(seq_len, batch)`` because the LSTM is created without
                ``batch_first``.
            sequence_length: True (unpadded) length of every sequence in the
                batch, sorted in decreasing order as required by
                ``pack_padded_sequence`` with its default settings.

        Returns:
            Tensor of shape ``(batch, output_dim)`` holding raw scores.
        """
        embeddings = self.embedding(text)
        # pack_padded_sequence needs the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        lengths = torch.as_tensor(sequence_length).cpu()
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(embeddings, lengths)
        _, (hidden_state, _) = self.lstm(packed_embeddings)
        if self.is_bidirectional:
            # Last two entries are the top layer's forward and backward states.
            output = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)
        else:
            output = hidden_state[-1, :, :]
        scores = self.fc(output)
        return scores

def binary_accuracy(preds, y):
    """Return accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8.

    Args:
        preds: Tensor of raw scores (logits); a sigmoid is applied here.
        y: Tensor of 0/1 targets, compared element-wise against the rounded
            predictions.

    Returns:
        A scalar tensor: the fraction of predictions that match ``y``.
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean loss and accuracy.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Iterable of batches exposing ``batch.text`` (a
            ``(tokens, lengths)`` pair) and ``batch.label``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    # Enable training-time behaviour (e.g. dropout between LSTM layers).
    model.train()

    for batch in iterator:
        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()

        text, text_lengths = batch.text

        predictions = model(text, text_lengths).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        # Without these two steps the parameters are never updated.
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    """Evaluate the model over an iterator without updating it.

    Args:
        model: Module called as ``model(text, text_lengths)``.
        iterator: Iterable of batches exposing ``batch.text`` (a
            ``(tokens, lengths)`` pair) and ``batch.label``.
        criterion: Loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    # Switch off training-only behaviour such as dropout.
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            # batch.text carries both tokens and lengths; the model's
            # forward() needs them as two separate arguments.
            text, text_lengths = batch.text

            predictions = model(text, text_lengths).squeeze(1)

            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def main():
    """Train and validate the LSTM sentiment classifier on IMDB.

    Loads the IMDB dataset through torchtext, builds the vocabulary and
    iterators, trains for ``N_EPOCHS`` epochs and saves the weights whenever
    the validation loss improves.

    NOTE(review): several statements of the original script were truncated.
    The vocabulary-building calls, the pre-trained vector name and the
    checkpoint filename below are reconstructions -- confirm them against
    the tutorial this script was taken from.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    SEED = 407
    TEXT = data.Field(tokenize = 'spacy', lower=True, include_lengths = True)
    LABEL = data.LabelField(dtype = torch.float)

    train_data, test_data = IMDB.splits(TEXT, LABEL)
    # random.seed() returns None, so passing its result as random_state left
    # the split unseeded. Seed first, then hand over the generator state.
    random.seed(SEED)
    train_data, valid_data = train_data.split(random_state = random.getstate())

    print(f'Number of training examples: {len(train_data)}')
    print(f'Number of validation examples: {len(valid_data)}')
    print(f'Number of testing examples: {len(test_data)}')

    MAX_VOCAB_SIZE = 10_000

    # TEXT.vocab / TEXT.vocab.vectors are read below, so the vocabularies
    # must be built first.
    # NOTE(review): vector name assumed from EMBEDDING_DIM = 300 -- confirm.
    TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.300d")
    LABEL.build_vocab(train_data)

    BATCH_SIZE = 64
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = BATCH_SIZE,
        # The model packs its input with pack_padded_sequence, which expects
        # each batch ordered by decreasing length.
        sort_within_batch = True,
        device = device)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 300 # This needs to match the size of the pre-trained embeddings!
    HIDDEN_DIM = 256
    OUTPUT_DIM = 1
    num_layers = 3
    dropout = 0.5
    pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

    model = LSTM_IMDB(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, n_layers=num_layers,  dropout=dropout, padding_idx=pad_idx)

    # Initialize word embeddings
    glove_vectors = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(glove_vectors)
    # Zero out <unk> and <pad> tokens
    unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[unk_idx] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[pad_idx] = torch.zeros(EMBEDDING_DIM)

    lr = 1e-2
    criterion = nn.BCEWithLogitsLoss()
    # Model and loss must live on the same device as the batches.
    model = model.to(device)
    criterion = criterion.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    N_EPOCHS = 5
    # NOTE(review): the original checkpoint filename was lost; placeholder.
    best_model_path = 'lstm_imdb_best.pt'

    best_valid_loss = float('inf')

    for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep only the weights that performed best on the validation set.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), best_model_path)

        print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

# Run the training script only when executed directly, not when imported.
if __name__ == '__main__':
    main()

Does your code work fine on the CPU?
If so, could you rerun it on the GPU with `CUDA_LAUNCH_BLOCKING=1 python script.py args` and post the stack trace here, please?

Thank you very much, it now works like a charm! :clap:

My suggestion is not a fix, but a debugging step. :wink:
If I understand the issue correctly, your code is now working on the CPU as well as with CUDA_LAUNCH_BLOCKING=1 on the GPU, but raises the assert statement on a plain GPU run?