How to get rid of a "cuDNN unspecified launch failure" error that happens after a few epochs?

I am getting a THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=719 : unspecified launch failure error after a few epochs while training my LSTM model. The stack trace points to the line out, hidden = self.rnn(x, hidden) in the forward function as the source of the error.

Here is my network model:

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import numpy as np
import pandas as pd
from time import time


class SignalNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, num_layers):
        super(SignalNet, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.rnn = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)

        hidden = self.init_hidden(batch_size)

        out, hidden = self.rnn(x, hidden)

        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device))


def main():
    global device  # set here so SignalNet.init_hidden can see it
    is_cuda = torch.cuda.is_available()

    if is_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    input_dim = 8
    batch_size = 1024  # was 32
    output_dim = 1
    num_layers = 5
    hidden_dim = 10
    learning_rate = 0.1
    num_epochs = 5

    model = SignalNet(input_size=input_dim, output_size=output_dim, hidden_dim=hidden_dim, num_layers=num_layers)
    model.to(device)

    loss_fn = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    df = pd.read_csv('data\\emg2.csv', sep=',')

    split_frac = 0.8  # 80% train, 20% test
    split_id = int(split_frac * len(df))

    train_data, train_labels = df.iloc[:split_id, :-1], df.iloc[:split_id, -1]
    test_data, test_labels = df.iloc[split_id:len(df) * 9 // 10, :-1], df.iloc[split_id:len(df) * 9 // 10, -1]
    val_data, val_labels = df.iloc[len(df) * 9 // 10:, :-1], df.iloc[len(df) * 9 // 10:, -1]

    # LSTM starts HERE
    train_dataset = TensorDataset(torch.from_numpy(train_data.values).float(),
                                  torch.from_numpy(train_labels.values).float())
    val_dataset = TensorDataset(torch.from_numpy(val_data.values).float(),
                                torch.from_numpy(val_labels.values).float())
    test_dataset = TensorDataset(torch.from_numpy(test_data.values).float(),
                                 torch.from_numpy(test_labels.values).float())

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=6)
    val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size, num_workers=6)
    test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=6)

    # Training Run
    model.train()
    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        index = 0
        for batch_idx, (X_train, y_train) in enumerate(train_loader):
            optimizer.zero_grad()  # Clears existing gradients from previous epoch
            X_train, y_train = X_train.to(device), y_train.to(device)
            # model.init_hidden(batch_size)
            output, hidden = model(X_train[np.newaxis, ...])  # prepend a dim of 1: the LSTM sees one sequence of length batch_size
            # loss = loss_fn(output, y_train.view(-1).long())
            loss = loss_fn(output.view(-1), y_train.view(-1))
            epoch_loss += loss.item()
            loss.backward()  # Does backpropagation and calculates gradients
            optimizer.step()  # Updates the weights accordingly

            if batch_idx % 50 == 0:
                print(f'Epoch: {epoch}/{num_epochs} Batch #{batch_idx + 1}/{len(train_loader)}.............', end=' ')
                print("Loss: {:.4f}".format(loss.item()))
            index = batch_idx

        print(f'Epoch #{epoch}: Avg. loss: {epoch_loss / (index + 1)}')

    # TEST
    test_losses = []
    num_correct = 0
    h = model.init_hidden(batch_size)  # note: forward() re-creates the hidden state itself, so h is never fed to the model

    model.eval()
    for inputs, labels in test_loader:
        h = tuple([each.data for each in h])  # detach the hidden state from the graph
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs[np.newaxis, ...])
        test_loss = loss_fn(output.squeeze(), labels.float())
        test_losses.append(test_loss.item())
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)

    print("Test loss: {:.3f}".format(np.mean(test_losses)))
    test_acc = num_correct / len(test_loader.dataset)
    print("Test accuracy: {:.3f}%".format(test_acc * 100))


if __name__ == '__main__':
    main()

Could you run your code with CUDA_LAUNCH_BLOCKING=1 python script.py args and post the stack trace here again, please?
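(Side note: since the data path in the script uses backslashes, this is presumably Windows, where the inline VAR=value form does not work in cmd; the variable can instead be set from Python before CUDA is initialized. A minimal sketch, assuming the script is otherwise run as-is:)

import os

# Must be set before the first CUDA call (safest: before importing torch),
# because the CUDA runtime reads it once at initialization.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch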


Thank you very much, now I do not get any errors. :+1: Could you please briefly explain what the issue was and what effect this parameter has? I’d like to understand what is going on under the hood to learn more about PyTorch.

Oh, that’s not the solution to the issue.
Setting that variable makes each CUDA call blocking (by default kernels are launched asynchronously), so it is only useful for debugging: with blocking launches, the stack trace points at the operation that actually failed.
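(To illustrate the asynchronous behavior, a sketch that is not from the original thread: a kernel launch only queues work and returns immediately, so an error can surface several Python lines later; an explicit synchronize drains the queue and pins the error to the op before it.)

import torch

x = torch.randn(4, 4, device='cuda')
y = torch.mm(x, x)        # the kernel is queued; this line returns immediately
torch.cuda.synchronize()  # wait for the queue to drain; an asynchronous
                          # error from the mm kernel would be raised here

CUDA_LAUNCH_BLOCKING=1 effectively performs such a synchronization after every launch, which is why the reported stack trace then becomes trustworthy.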

Could you give me the shapes of train_data and train_labels so that I can run the code on my machine?
Also, which setup are you using (PyTorch version, CUDA, cudnn, how did you install PyTorch)?
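(The setup can be read back from Python directly; this just prints what the installed build reports:)

import torch

print(torch.__version__)                # PyTorch version, e.g. 1.3.1
print(torch.version.cuda)               # CUDA version the binary was built with
print(torch.backends.cudnn.version())   # cuDNN version as an integer, e.g. 7603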

Sorry for being late, @ptrblck, but I have been extremely busy. The shapes of train_data and train_labels are (240000, 8) and (240000,), respectively. I installed PyTorch through pip; my CUDA and cudnn versions are both 10.1, and the PyTorch version is 1.3.1. Feel free to ask for any further information. Thanks.
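(For anyone trying to reproduce this without the original CSV: random data with the shapes above can stand in for emg2.csv. A minimal sketch; the value distribution and the 0/1 labels are assumptions, not the real signal.)

import os

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# 8 feature columns plus 1 label column; 300000 rows make the 80% train
# split come out to the reported shapes (240000, 8) and (240000,).
features = rng.standard_normal((300000, 8)).astype(np.float32)
labels = rng.integers(0, 2, size=(300000, 1)).astype(np.float32)

os.makedirs('data', exist_ok=True)
pd.DataFrame(np.hstack([features, labels])).to_csv('data\\emg2.csv', sep=',', index=False)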