Very strange "RuntimeError: CUDA error: device-side assert triggered"

```python
all_hyp, all_scores, all_attn = [], [], []
n_best = self.opt.n_best
all_lengths = []

for b in range(batch_size):
    # Sort the finished beams for sentence b and keep the n best hypotheses.
    scores, ks = beam[b].sortBest()

    all_scores += [scores[:n_best]]
    hyps, attn, length = zip(*[beam[b].getHyp(k) for k in ks[:n_best]])
    all_hyp += [hyps]
    print('all_hyp========', all_hyp)  # WORKS FINE here, inside the loop
    all_lengths += [length]
    print('hyps========', hyps)
    print('all_hyp========', all_hyp)

    # Keep only the attention columns over non-PAD source positions.
    # if src_data.data.dim() == 3:
    if self.opt.encoder_type == 'audio':
        valid_attn = decoder_states[0].original_src.narrow(2, 0, 1).squeeze(2)[:, b] \
            .ne(onmt.Constants.PAD).nonzero().squeeze(1)
        print('valid_attn.shape', valid_attn.shape)
    else:
        valid_attn = decoder_states[0].original_src[:, b] \
            .ne(onmt.Constants.PAD).nonzero().squeeze(1)
    attn = [a.index_select(1, valid_attn) for a in attn]
    all_attn += [attn]
```

Everything works fine for `all_hyp` inside the for loop; however, as soon as I add one more line after the loop to return `all_hyp`, I get "RuntimeError: CUDA error: device-side assert triggered".

FYI, inside the for loop, `all_hyp` is (middle of the list elided; it is one long tuple of 0-d CUDA tensors):

```
all_hyp======== [([tensor(10, device='cuda:0'), tensor(9, device='cuda:0'), tensor(8, device='cuda:0'), tensor(4, device='cuda:0'), tensor(7, device='cuda:0'), tensor(22, device='cuda:0'), ..., tensor(8, device='cuda:0'), tensor(24, device='cuda:0'), tensor(22, device='cuda:0'), tensor(3, device='cuda:0')],)]
```
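Side note: every element in that list is a 0-d CUDA tensor. Below is a minimal sketch (assuming a CUDA device is present; the values are a hypothetical stand-in for one beam hypothesis) of converting them to plain Python ints as they are collected. `int(t)` copies each scalar to the host and synchronizes with the device, which both keeps the collected hypotheses off the GPU and makes asynchronous CUDA errors fire closer to the kernel that caused them:

```python
import torch

# Hypothetical stand-in for one beam hypothesis: a short list of 0-d CUDA
# tensors, like the elements printed above.
hyp = [torch.tensor(10, device='cuda'), torch.tensor(9, device='cuda')]

# int(t) copies each scalar to the host, which synchronizes with the device,
# so any pending device-side assert fires here instead of at a later print().
hyp_ids = [int(t) for t in hyp]
print(hyp_ids)  # [10, 9]
```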

Before returning, I printed out some info regarding the list:

```python
print('all_hyp=', all_hyp)                   # ==> RuntimeError: CUDA error: device-side assert triggered
print('len(all_hyp)=', len(all_hyp))         # ==> 1
print('all_hyp[0]=', all_hyp[0])             # ==> RuntimeError: CUDA error: device-side assert triggered
print('len(all_hyp[0])=', len(all_hyp[0]))   # ==> 1
```
I'd appreciate it if somebody could help with this one. Thanks!

Could you rerun your script using

```
CUDA_LAUNCH_BLOCKING=1 python script.py args
```

and post the stack trace here?
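For context: CUDA kernels run asynchronously, so a device-side assert raised inside an earlier kernel only surfaces at the next synchronization point, e.g. when a CUDA tensor is copied to the host for printing. That is why the error seems to come from printing/returning `all_hyp` rather than from the op that actually failed. Here is a minimal sketch of the symptom; the out-of-range embedding index is purely hypothetical, just one easy way to trigger a device-side assert:

```python
import torch

emb = torch.nn.Embedding(10, 4).cuda()    # valid indices: 0..9
bad = torch.tensor([42], device='cuda')   # deliberately out of range

out = emb(bad)   # kernel is launched asynchronously; no Python error yet
print(out)       # sync point -> RuntimeError: CUDA error: device-side assert triggered
```

With `CUDA_LAUNCH_BLOCKING=1`, every launch becomes synchronous, so the same RuntimeError is raised at `out = emb(bad)`, pointing at the real culprit.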

Thanks for the hint. By running on the CPU, I located the bug. It is completely unrelated to the list.
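For anyone who lands on this thread: running on the CPU works as a debugging aid because the same bad op raises an ordinary Python exception with a full traceback instead of a deferred CUDA assert. A sketch with the same hypothetical out-of-range lookup, this time on CPU:

```python
import torch

emb = torch.nn.Embedding(10, 4)   # same hypothetical lookup, but on CPU
bad = torch.tensor([42])          # out of range for a 10-row table

out = emb(bad)   # raises IndexError: index out of range in self, right here
```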