RuntimeError: cuda runtime error (77) : an illegal memory access was encountered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20

ashmeet13 · December 9, 2018, 9:01am

Hi,
I have just started to work with NLP and was working to create a model for Sentiment Analysis.

When I try to build my model on Google Colab I get the error that has been mentioned in the title.

# Build the classifier; keyword arguments make the size-to-parameter mapping explicit.
model = SentiModel(
    vocab_size=vocab_size,
    embedding_size=embedding_dim,
    hidden_size=hidden_size,
    output_size=output_size,
)
# Move the parameters to the GPU only when CUDA reports one as available.
if torch.cuda.is_available():
    model.cuda()
    print("Check Done")

The error occurs specifically on the line where I call model.cuda().

This is the complete error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-14-311fd88a1a30> in <module>()
    166 model = SentiModel(vocab_size, embedding_dim, hidden_size, output_size)
    167 if torch.cuda.is_available():
--> 168     model.cuda()
    169     print("Check Done")
    170 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in cuda(self, device)
    256             Module: self
    257         """
--> 258         return self._apply(lambda t: t.cuda(device))
    259 
    260     def cpu(self):

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
    183     def _apply(self, fn):
    184         for module in self.children():
--> 185             module._apply(fn)
    186 
    187         for param in self._parameters.values():

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _apply(self, fn)
    189                 # Tensors stored in modules are graph leaves, and we don't
    190                 # want to create copy nodes, so we have to unpack the data.
--> 191                 param.data = fn(param.data)
    192                 if param._grad is not None:
    193                     param._grad.data = fn(param._grad.data)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in <lambda>(t)
    256             Module: self
    257         """
--> 258         return self._apply(lambda t: t.cuda(device))
    259 
    260     def cpu(self):

RuntimeError: cuda runtime error (77) : an illegal memory access was encountered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20

This is the model class I am using:

class SentiModel(nn.Module):
    """Embedding -> single-layer RNN -> linear head for padded token batches.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_size: Width of each embedding vector (the RNN input size).
        hidden_size: Width of the RNN hidden state.
        output_size: Number of values produced by the final linear layer.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        super(SentiModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lens):
        """Run a batch of padded sequences through the model.

        Args:
            x: Token indices. The batch size is read from ``x.size(1)``,
                i.e. the layout is (seq_len, batch).
            lens: Length of each sequence, passed unchanged to
                ``pack_padded_sequence`` (which by default expects them in
                decreasing order).

        Returns:
            ``self.fc`` applied to the final RNN hidden state, with shape
            (1, batch, output_size). The hidden state is also kept on
            ``self.hidden_out``.
        """
        batch_size = x.size(1)
        self.hidden_out = self.init_hidden(batch_size)
        embeddings = self.embed(x)
        embeddings_packed = pack_padded_sequence(embeddings, lens)
        # Only the final hidden state feeds the linear layer, so the packed
        # per-step outputs are discarded instead of being re-padded.
        _, self.hidden_out = self.rnn(embeddings_packed, self.hidden_out)
        return self.fc(self.hidden_out)

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (1, batch_size, hidden_size).

        The tensor is allocated from the RNN's own weight so that it always
        shares the model's device and dtype. Keying the device on
        ``torch.cuda.is_available()`` breaks whenever a GPU exists but the
        model was left on the CPU.
        """
        return self.rnn.weight_hh_l0.new_zeros((1, batch_size, self.hidden_size))

To reproduce the error, the following code, when run on Google Colab, produces a similar error:

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn as nn
import torch


class SentiModel(nn.Module):
    """Embedding -> single-layer RNN -> linear head for padded token batches.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_size: Width of each embedding vector (the RNN input size).
        hidden_size: Width of the RNN hidden state.
        output_size: Number of values produced by the final linear layer.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        super(SentiModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lens):
        """Run a batch of padded sequences through the model.

        Args:
            x: Token indices. The batch size is read from ``x.size(1)``,
                i.e. the layout is (seq_len, batch).
            lens: Length of each sequence, passed unchanged to
                ``pack_padded_sequence`` (which by default expects them in
                decreasing order).

        Returns:
            ``self.fc`` applied to the final RNN hidden state, with shape
            (1, batch, output_size). The hidden state is also kept on
            ``self.hidden_out``.
        """
        batch_size = x.size(1)  # batch size
        self.hidden_out = self.init_hidden(batch_size)
        embeddings = self.embed(x)
        embeddings_packed = pack_padded_sequence(embeddings, lens)
        # Only the final hidden state feeds the linear layer, so the packed
        # per-step outputs are discarded instead of being re-padded.
        _, self.hidden_out = self.rnn(embeddings_packed, self.hidden_out)
        return self.fc(self.hidden_out)

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (1, batch_size, hidden_size).

        The tensor is allocated from the RNN's own weight so that it always
        shares the model's device and dtype. Keying the device on
        ``torch.cuda.is_available()`` breaks whenever a GPU exists but the
        model was left on the CPU.
        """
        return self.rnn.weight_hh_l0.new_zeros((1, batch_size, self.hidden_size))
          
          
# Sizes for the minimal reproduction. `epochs` and `batch_size` are defined
# here but not consumed anywhere in this snippet.
vocab_size, embedding_dim = 5000, 5
hidden_size, output_size = 5, 2
epochs, batch_size = 3000, 3

# Build the classifier; keyword arguments make the size-to-parameter mapping explicit.
model = SentiModel(
    vocab_size=vocab_size,
    embedding_size=embedding_dim,
    hidden_size=hidden_size,
    output_size=output_size,
)
# Move the parameters to the GPU only when CUDA reports one as available.
if torch.cuda.is_available():
    model.cuda()
    print("Check Done")

Thank you.

pkrish · January 21, 2021, 8:36am

Hi!

Were you able to resolve this issue? I’m having the exact same problems…

Thanks

ptrblck · January 21, 2021, 11:05am

Could you post an executable code snippet as well as your current setup (PyTorch, CUDA, cudnn versions as well as the used GPU)?

pkrish · January 22, 2021, 7:28am

Thanks for your help!

I realised that it was a problem with my VM, since I wasn’t even able to move a tensor to the GPU. Rebooting appears to have fixed it now!