Cublas runtime error when running on GPU, but works on CPU

First, this is my environment:

  • one RTX 2060 card;
  • Debian buster;
  • CUDA 9.2 driver installed from Debian’s apt repository;
  • Python 3.6 and PyTorch installed via Anaconda;
  • I have run a tiny GPU test in PyTorch and it works.

The problem is: the network works on CPU, but when I try to put it on the GPU, it fails with:

Traceback (most recent call last):
  File "./do_training.py", line 95, in <module>
    batch_output = model(batch_input)
  File "/opt/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/large/yangxi/MonotoneLearn/train_for_pitch/MyModel.py", line 32, in forward
    x = self.full_part.forward(x)
  File "/opt/anaconda3/lib/python3.6/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/opt/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/opt/anaconda3/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 92, in forward
    return F.linear(input, self.weight, self.bias)
  File "/opt/anaconda3/lib/python3.6/site-packages/torch/nn/functional.py", line 1406, in linear
    ret = torch.addmm(bias, input, weight.t())
RuntimeError: cublas runtime error : the GPU program failed to execute at /opt/conda/conda-bld/pytorch_1556653183467/work/aten/src/THC/THCBlas.cu:259

This error can appear when tensor dimensions don’t match. But the whole thing works on CPU, so the tensor sizes shouldn’t be mismatched.

This is my source code:

do_training.py:

#!/opt/anaconda3/bin/python3

import pysndfile
import optparse
import sys
import os
import re

import math
import random
import numpy
import fnmatch
import h5py
import torch
import MyModel

# parse options
opt_parser = optparse.OptionParser()
opt_parser.add_option("-d", "--data", dest="f_data", help="HDF5 file contains training data, with \"input\" and \"refout\" dataset.", metavar="FILE")
opt_parser.add_option("-i", "--in", dest="f_in", help="Existing network file to continue training.", metavar="FILE")
opt_parser.add_option("-c", "--conv-layers", dest="n_conv_layers", help="Number of 3-length convolution layers for creating new network.", type="int", metavar="NUM_LAYER")
opt_parser.add_option("-f", "--full-layers", dest="n_full_layers", help="Number of fully connected layers for creating new network.", type="int", metavar="NUM_LAYER")
opt_parser.add_option("-o", "--out", dest="f_out", help="Output file to store trained network.", metavar="FILE")
opt_parser.add_option("-t", "--iter", dest="n_iter", help="Number of training iterations to perform.", type="int", metavar="NUM_ITER", default=1000)
opt_parser.add_option("-b", "--batch", dest="batch_sz", help="Size of each training iteration.", type="int", metavar="BATCH_SIZE", default=10)
opt_parser.add_option("-D", "--device", dest="dev_type", help="Device. Default to CUDA when available, or CPU when no GPU is available.")
options,_ = opt_parser.parse_args()

if options.f_data is None:
    print("data file is not specified")
    exit(-1)
if options.f_in is None and options.n_full_layers is None:
    print("neither input file nor layer number is specified")
    exit(-1)
if options.f_out is None:
    print("output file is not specified")
    exit(-1)

# determine device
if options.dev_type is None:
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
else:
    device = torch.device(options.dev_type)

# read data properties
fh_data = h5py.File(options.f_data, 'r')
if "input" not in fh_data:
    print("no input in data file %s" % options.f_data)
    exit(-1)
if "refout" not in fh_data:
    print("no refout in data file %s" % options.f_data)
    exit(-1)

if len(fh_data["input"]) != len(fh_data["refout"]):
    print("inequal number of data: input %d, ref output %d" % (len(fh_data["input"]), len(fh_data["refout"])))
    exit(-1)

num_data = len(fh_data["input"])
input_size = len(fh_data["input"][0])

whole_input = torch.from_numpy(fh_data["input"].__array__()).to(device)
whole_refout = torch.from_numpy(fh_data["refout"].__array__()).to(device)
print(whole_input)
print(whole_refout)

# create model
model = None
if options.f_in is None:
    model = MyModel.MyModel(input_size, options.n_conv_layers, options.n_full_layers).to(device)
else:
    model = torch.load(options.f_in).to(device)
print(model)
print(model.parameters())

# do training
#fcri = torch.nn.NLLLoss()
cri = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

batch_input = torch.empty( (options.batch_sz, input_size), dtype=torch.float32, device=device )
batch_refout = torch.empty( options.batch_sz, dtype=torch.long, device=device )

for i in range(options.n_iter):
    try:
        optimizer.zero_grad()
        
        for j in range(options.batch_sz):
            i_data = random.randint(0, num_data-1)
            batch_input[j] = whole_input[i_data]
            batch_refout[j] = whole_refout[i_data]
            
        batch_output = model(batch_input)
        loss = cri(batch_output, batch_refout)
        if i % 100 == 0:
            print( "iter %d: loss %f" % (i, float(loss)) )
        loss.backward()
        optimizer.step()
    except KeyboardInterrupt:
        print("early out by int at iter %d" % i)
        break

# save result
torch.save(model, options.f_out)

MyModel.py:

import torch
import collections

class MyModel(torch.nn.Module):
    def __init__(self, input_sz, n_conv_layer, n_full_layer):
        super(MyModel, self).__init__()
        self.input_size = input_sz
        conv_layers = collections.OrderedDict()
        for i in range(n_conv_layer):
            conv_name = "conv_%d" % i
            conv_layers[conv_name] = torch.nn.Conv1d(1,1,3,padding=1)
        self.conv_part = torch.nn.modules.Sequential(conv_layers)

        full_layers = collections.OrderedDict()
        for i in range(n_full_layer-1):
            full_conn_name = "full_conn_%d" % i
            non_linear_name = "acti_%d" % i
            full_layers[full_conn_name] = torch.nn.Linear(input_sz,input_sz)
            full_layers[non_linear_name] = torch.nn.Tanh()
        full_layers["last_full_conn"] = torch.nn.Linear(input_sz, 128)
        #full_layers["last_acti"] = torch.nn.LogSoftmax()
        self.full_part = torch.nn.modules.Sequential(full_layers)

    def forward(self, x):
        print("input shape ",x.size())
        batch_sz = len(x)
        x = x.view(batch_sz, 1, self.input_size)
        x = self.conv_part.forward(x)
        print("convolution part output shape ",x.size())
        x = x.view(batch_sz, self.input_size)
        print("full connect part input shape ",x.size())
        x = self.full_part.forward(x)
        return x

I have put some logs on forward function. Before it is dead, I can see outputs:

input shape  torch.Size([10, 1025])
convolution part output shape  torch.Size([10, 1, 1025])
full connect part input shape  torch.Size([10, 1025])

Which is expected.
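
For reference, a minimal isolated check of just the failing Linear call on the GPU might look like this (a sketch using the same 10×1025 batch shape as in the log above; if this also raises a cublas error, the problem is the CUDA setup rather than the tensor sizes):

import torch

device = torch.device("cuda:0")
lin = torch.nn.Linear(1025, 128).to(device)   # same shape as the last full_conn layer
x = torch.randn(10, 1025, device=device)      # dummy batch matching the logged input shape
print(lin(x).shape)                           # expected: torch.Size([10, 128])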

Did you install the PyTorch binaries or did you build from source?
In the former case the binaries will already ship with CUDA.
Could you check the CUDA version you are using by printing torch.version.cuda?
Since you are using a Turing GPU, CUDA10 is recommended.
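
Something like the following would be enough to check (a minimal snippet):

import torch

print(torch.version.cuda)             # CUDA version the binaries were built with
print(torch.cuda.is_available())      # whether PyTorch can see the GPU
print(torch.cuda.get_device_name(0))  # should report the RTX 2060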

I installed PyTorch from Anaconda, which is the officially suggested way, and I have successfully run a tiny GPU test in PyTorch.

My CUDA version is 9.0.176.

Could you install CUDA10, as this is the recommended version for Turing GPUs:

conda install pytorch torchvision cudatoolkit=10.0 -c pytorch

I will try to install CUDA 10. It may be difficult because Debian only provides CUDA 9 in its official apt repository, and the Nvidia website doesn’t offer a Debian CUDA package (only Ubuntu packages).

Moreover, I think CUDA 9 should at least work? Why doesn’t my code work at all?

You don’t need to install CUDA on your system, as the PyTorch binaries will already ship with the specified CUDA libs. Could you try to install PyTorch using the posted command?

It works! Thanks a lot!

Hi, I think I have the same issue, but what is strange is that when I ran my code (a BERT classifier for sentiment analysis) with a given dataset it worked, but when I changed the dataset (bigger than the first), it showed me this error: runtime error : the GPU program failed to execute at /pytorch/aten/src/THC/THCBlas.cu:23

@ptrblck Sir, what can I do to solve this problem?

I work on Google Colab.

Could you try to reduce the batch size and rerun the code to eliminate the possibility that you are running out of memory and cublas is just reporting this unhelpful error?
I’m not sure if there is a proper way to see the GPU memory in Colab, but this might be another way to check for the potential OOM issue.
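
For example, something like this in a Colab cell gives a rough picture (a sketch; the numbers only cover memory managed by PyTorch’s caching allocator):

import torch

print(torch.cuda.memory_allocated() / 1024**2, "MB currently allocated by tensors")
print(torch.cuda.max_memory_allocated() / 1024**2, "MB peak allocation so far")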

It didn’t work (reducing the batch size).
The first dataset I used (when the code worked) is not as large as the current one (the train dataset is a 7.5 MB .txt file with a lot of data).

Thanks for the update.
Could you post the shapes of all tensors used in the label_loss = torch.matmul(...) line of code, so that we could try to reproduce it?
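For example, a print right before that line would be enough (a sketch, where a and b stand for whatever two tensors are passed to torch.matmul):

print(a.shape, b.shape)            # shapes of the two matmul operands
label_loss = torch.matmul(a, b)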

PS: you can post code snippets by wrapping them into three backticks ```, which makes debugging easier. :wink:

def one_fold(num_fold, train_index, dev_index):
        print("Training on fold:", num_fold)
        X_train, X_dev = [X[i] for i in train_index], [X[i] for i in dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # construct data loader
        train_data_set = DataSet(X_train, y_train, SENT_PAD_LEN)
        train_data_loader = DataLoader(train_data_set, batch_size=BATCH_SIZE, shuffle=True)

        dev_data_set = DataSet(X_dev, y_dev, SENT_PAD_LEN)
        dev_data_loader = DataLoader(dev_data_set, batch_size=BATCH_SIZE, shuffle=False)
        gradient_accumulation_steps = 1
        num_train_steps = int(
            len(train_data_set) / BATCH_SIZE / gradient_accumulation_steps * MAX_EPOCH)

        pred_list_test_best = None
        final_pred_best = None
        # This is to prevent model diverge, once happen, retrain
        while True:
            is_diverged = False
            model = BERT_classifer.from_pretrained(BERT_MODEL)
            model.add_output_layer(BERT_MODEL, NUM_EMO)
            model = nn.DataParallel(model)
            model.cuda()

            # BERT optimizer
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'gamma', 'beta']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay_rate': 0.01},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
            ]

            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=learning_rate,
                                 warmup=0.1,
                                 t_total=num_train_steps)

            if w == 1:
                weight_list = [0.3, 0.3, 0.3, 1.7]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]
            elif w == 2:
                weight_list = [0.3198680179, 0.246494733, 0.2484349259, 1.74527696]
                weight_list_binary = [2 - weight_list[-1], weight_list[-1]]

            weight_list = [x**FLAT for x in weight_list]
            weight_label = torch.Tensor(weight_list).cuda()

            weight_list_binary = [x**FLAT for x in weight_list_binary]
            weight_binary = torch.Tensor(weight_list_binary).cuda()
            print('binary loss reweight = weight_list_binary', weight_list_binary)
            # loss_criterion_binary = nn.CrossEntropyLoss(weight=weight_list_binary)  #
            loss_criterion = nn.CrossEntropyLoss(reduce=False)
            loss_criterion_binary = nn.CrossEntropyLoss(reduce=False)  #

            loss_criterion_emo_only = nn.MSELoss()

            # es = EarlyStopping(min_delta=0.005, patience=EARLY_STOP_PATIENCE)
            es = EarlyStopping(patience=EARLY_STOP_PATIENCE)
            final_pred_best = None
            final_pred_list_test = None
            pred_list_test = None
            for num_epoch in range(MAX_EPOCH):
                print('Begin training epoch:', num_epoch)
                sys.stdout.flush()
                train_loss = 0
                model.train()
                for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in tqdm(enumerate(train_data_loader),
                                                              total=len(train_data_set)/BATCH_SIZE):
                    optimizer.zero_grad()

                    if USE_TOKEN_TYPE:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda())
                    else:
                        pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

                    loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                    loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                 e_c.view(-1).shape[0]
                    loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                    loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                               loss_binary) / e_c.view(-1).shape[0]

                    loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                    loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                    # training trilogy
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                    optimizer.step()

                    train_loss += loss.data.cpu().numpy() * tokens.shape[0]

                    del loss, pred

                # Evaluate
                model.eval()
                dev_loss = 0
                # pred_list = []
                # gold_list = []
                for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, pred2, pred3 = model(tokens.cuda(), masks.cuda(), segments.cuda())
                        else:
                            pred, pred2, pred3 = model(tokens.cuda(), masks.cuda())

                        loss_label = loss_criterion(pred, e_c.view(-1).cuda()).cuda()
                        loss_label = torch.matmul(torch.gather(weight_label, 0, e_c.view(-1).cuda()), loss_label) / \
                                     e_c.view(-1).shape[0]

                        loss_binary = loss_criterion_binary(pred2, e_c_binary.view(-1).cuda()).cuda()
                        loss_binary = torch.matmul(torch.gather(weight_binary, 0, e_c_binary.view(-1).cuda()),
                                                   loss_binary) / e_c.view(-1).shape[0]

                        loss_emo = loss_criterion_emo_only(pred3, e_c_emo.cuda())

                        loss = (loss_label + LAMBDA1 * loss_binary + LAMBDA2 * loss_emo) / float(1 + LAMBDA1 + LAMBDA2)

                        dev_loss += loss.data.cpu().numpy() * tokens.shape[0]

                        # pred_list.append(pred.data.cpu().numpy())
                        # gold_list.append(e_c.numpy())
                        del pred, loss

                # pred_list = np.argmax(np.concatenate(pred_list, axis=0), axis=1)
                # gold_list = np.concatenate(gold_list, axis=0)
                print('Training loss:', train_loss / len(train_data_set), end='\t')
                print('Dev loss:', dev_loss / len(dev_data_set))
                # print(classification_report(gold_list, pred_list, target_names=EMOS))
                # get_metrics(pred_list, gold_list)
                # checking diverge
                if dev_loss/len(dev_data_set) > 1.3 and num_epoch > 4:
                    print("Model diverged, retry")
                    is_diverged = True
                    break

                if es.step(dev_loss):  # overfitting
                    print('overfitting, loading best model ...')
                    if num_epoch == 1:
                        is_diverged = True
                        final_pred_best = deepcopy(final_pred_list_test)
                        pred_list_test_best = deepcopy(pred_list_test)
                    break
                else:
                    if es.is_best():
                        print('saving best model ...')
                        if final_pred_best is not None:
                            del final_pred_best
                        final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is not None:
                            del pred_list_test_best
                        pred_list_test_best = deepcopy(pred_list_test)
                    else:
                        print('not best model, ignoring ...')
                        if final_pred_best is None:
                            final_pred_best = deepcopy(final_pred_list_test)
                        if pred_list_test_best is None:
                            pred_list_test_best = deepcopy(pred_list_test)

                print('Gold Dev ...')
                pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_dev_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        pred_list_test.append(pred.data.cpu().numpy())

                pred_list_test = np.argmax(np.concatenate(pred_list_test, axis=0), axis=1)
                # get_metrics(load_dev_labels('data/dev.txt'), pred_list_test)

                print('Gold Test ...')
                final_pred_list_test = []
                model.eval()
                for i, (tokens, masks, segments, e_c, e_c_binary, e_c_emo) in enumerate(gold_test_data_loader):
                    with torch.no_grad():
                        if USE_TOKEN_TYPE:
                            pred, _, _ = model(tokens.cuda(), masks.cuda(), segments.cuda())
                        else:
                            pred, _, _ = model(tokens.cuda(), masks.cuda())
                        final_pred_list_test.append(pred.data.cpu().numpy())

                final_pred_list_test = np.argmax(np.concatenate(final_pred_list_test, axis=0), axis=1)
                # get_metrics(load_dev_labels('data/test.txt'), final_pred_list_test)

            if is_diverged:
                print("Reinitialize model ...")
                del model
                continue
            all_fold_results.append(pred_list_test_best)
            real_test_results.append(final_pred_best)

            del model
            break

I found this debugging tip: CUDA_LAUNCH_BLOCKING = 1, but this command stops CUDA from working, and my code depends on CUDA (the GPU), as you can see.
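
For reference, CUDA_LAUNCH_BLOCKING is an environment variable that makes kernel launches synchronous so the real failing line is reported; a minimal way to set it in a notebook, before any GPU work is done, might be:

import os

# Must be set before the CUDA context is created, e.g. at the very top of the notebook.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"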

Thanks for the code.
Unfortunately it’s not executable, as no variables are defined.
Could you post a standalone code snippet, which we could use for debugging?

https://colab.research.google.com/drive/1JqNOY_TOd0Yk2RJZmHeyWZ9DAF644FHe?usp=sharing

It seems to work now, but the training process is taking a lot of time; it’s very slow. When I searched, I found that BERT is in general slow, but in this case it may take hours or maybe days. Any idea?

@ptrblck Sir, what do you think?