cuda runtime error (59)

Hi everyone,
A GPU error occurred!
I am coding an encoder-decoder model.

Is there some way to solve my issue?

If you had the same problem and solved it, could you help me?

class Encoder_Decoder(nn.Module):
    """LSTM-cell encoder-decoder skeleton; only the encoder pass is implemented.

    Args:
        input_size: Number of rows in the source embedding table.
        output_size: Number of rows in the target embedding table and the
            output width of ``linear1``.
        hidden_size: Embedding width and LSTM hidden/cell state width.
    """

    def __init__(self, input_size, output_size, hidden_size):
        super().__init__()
        # NOTE(review): per the PyTorch docs a negative padding_idx is resolved
        # relative to the end of the table, so -1 designates the last row as
        # the padding row. A token id of -1 in the data is still NOT a valid
        # index -- confirm the data pads with the id that matches padding_idx.
        self.embed_input = nn.Embedding(input_size, hidden_size, padding_idx=-1)
        self.embed_target = nn.Embedding(output_size, hidden_size, padding_idx=-1)

        self.lstm1 = nn.LSTMCell(hidden_size, hidden_size)
        self.linear1 = nn.Linear(hidden_size, output_size)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

    def _check_indices(self, indices, embedding, label):
        """Raise ``IndexError`` if ``indices`` fall outside ``embedding``'s table.

        Checking up front gives a readable Python error instead of the
        asynchronous CUDA "device-side assert triggered" failure.
        """
        if indices.numel() == 0:
            return
        lowest = int(indices.min())
        highest = int(indices.max())
        if lowest < 0 or highest >= embedding.num_embeddings:
            raise IndexError(
                "{} indices must be in [0, {}), got range [{}, {}]".format(
                    label, embedding.num_embeddings, lowest, highest
                )
            )

    def forward(self, input_lines, target_lines):
        """Feed every time step of ``input_lines`` through the encoder LSTM cell.

        Args:
            input_lines: Iterable of per-step index tensors. Assumes each step
                is a 1-D ``(batch,)`` integer tensor so that its embedding is
                ``(batch, hidden_size)`` -- TODO confirm against the caller.
            target_lines: Currently unused; the decoder is not implemented.

        Returns:
            The placeholder value ``0`` (no loss is computed yet).

        Raises:
            IndexError: If a step contains an index outside the input
                embedding table.
        """
        # The states are created lazily from the first embedded step so the
        # batch size, device and dtype always match the actual input, instead
        # of relying on a global ``batch_size`` and a hard-coded ``.cuda()``.
        hx = None
        cx = None
        for input_sentence_words in input_lines:
            self._check_indices(input_sentence_words, self.embed_input, "input")
            input_k = self.embed_input(input_sentence_words)
            if hx is None:
                hx = input_k.new_zeros(input_k.size(0), self.hidden_size)
                cx = input_k.new_zeros(input_k.size(0), self.hidden_size)
            print("input_k", input_k)
            hx, cx = self.lstm1(input_k, (hx, cx))
            print("hx", hx.size())
        return 0

Error message

input_k tensor([[ 1.1848,  0.1343,  0.4040,  ..., -0.9256, -0.0975,  0.1987],
        [ 1.2179, -1.4551, -0.1774,  ...,  0.9071,  1.3571,  0.6015],
        [ 1.2011,  1.2342, -0.8026,  ..., -1.4794, -0.7845,  0.2520],
        ...,
        [ 0.3803,  0.5644,  0.9808,  ..., -2.3397, -0.3587,  1.6716],
        [ 0.7521,  0.8567,  0.3936,  ...,  0.1330, -0.0766,  2.1656],
        [ 0.7806, -1.6959, -1.1628,  ..., -0.8085, -0.7975,  1.2291]], device='cuda:0')
hx torch.Size([10, 128])
.................................
input_k tensor([[ 0.3803,  0.5644,  0.9808,  ..., -2.3397, -0.3587,  1.6716],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0170,  0.0557, -0.4217,  ...,  1.0216,  2.1096,  0.2458],
        ...,
        [-0.4336, -0.3118,  0.4406,  ..., -1.4510,  0.8474, -1.0817],
        [-0.5032, -0.9890, -0.4692,  ...,  1.6164,  0.8813,  0.8150],
        [-0.4336, -0.3118,  0.4406,  ..., -1.4510,  0.8474, -1.0817]], device='cuda:0')
hx torch.Size([10, 128])

/pytorch/aten/src/THC/THCTensorIndex.cu:306: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = 2, SrcDim = 2, IdxDim = -2]: block: [0,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

/pytorch/aten/src/THC/THCTensorIndex.cu:306: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = 2, SrcDim = 2, IdxDim = -2]: block: [0,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
.................................
/pytorch/aten/src/THC/THCTensorIndex.cu:306: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = 2, SrcDim = 2, IdxDim = -2]: block: [0,0,0], thread: [62,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/THC/THCTensorIndex.cu:306: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = 2, SrcDim = 2, IdxDim = -2]: block: [0,0,0], thread: [63,0,0] Assertion `srcIndex < srcSelectDimSize` failed.

THCudaCheck FAIL file=/pytorch/aten/src/THC/generic/THCTensorCopy.c line=70 error=59 : device-side assert triggered
input_k Traceback (most recent call last):
  File "pytorch.py", line 72, in <module>
    loss = model(Transposed_input, Transposed_target)
  File "/home/ochi/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "pytorch.py", line 42, in forward
    print("input_k", input_k)
  File "/home/ochi/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/tensor.py", line 57, in __repr__
    return torch._tensor_str._str(self)
  File "/home/ochi/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/_tensor_str.py", line 218, in _str
    fmt, scale, sz = _number_format(self)
  File "/home/ochi/.pyenv/versions/3.6.3/lib/python3.6/site-packages/torch/_tensor_str.py", line 79, in _number_format
    tensor = torch.DoubleTensor(tensor.size()).copy_(tensor).abs_().view(tensor.nelement())
RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.c:70

Thanks!

same error, any ideas? PyTorch 0.4.0 on CUDA 8.0

I’m not sure the traceback is actually right, as it might point to the wrong methods if the code wasn’t run with CUDA_LAUNCH_BLOCKING=1.
Could you try to run the code again with this environment variable set or, if possible, on the CPU, since this might give a better error message?

Never mind, I just found that I had mistakenly combined two datasets with labels 0~9 and 1~10.

def __init__(self, input_size, output_size, hidden_size):
        # Excerpt of the Encoder_Decoder constructor, showing how the two
        # embedding tables were configured.
        super(Encoder_Decoder, self).__init__()
        # Source-side table: input_size rows of width hidden_size, padding_idx=-1.
        self.embed_input = nn.Embedding(input_size, hidden_size, padding_idx=-1)
        # Target-side table: output_size rows of width hidden_size, padding_idx=-1.
        self.embed_target = nn.Embedding(output_size, hidden_size, padding_idx=-1)

I fixed it by changing

padding_idx=-1 to padding_idx=0

and using 0 as the padding index in the data.