CUDA device assert triggered in pack_padded_sequence

I moved a pretrained embedding model onto the GPU in an effort to make my model faster, and I keep getting this device-side assert. I'm running the code with CUDA_LAUNCH_BLOCKING=1, and this is the (rather unhelpful) stack trace I received. Any ideas?

RuntimeError                              Traceback (most recent call last)
<ipython-input-32-f94813b879a4> in <module>
     11     start_time = time.time()
     12 
---> 13     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
     14     valid_loss = evaluate(model, valid_iterator, criterion)
     15 

<ipython-input-28-f807d4f617eb> in train(model, iterator, optimizer, criterion, clip)
     11         optimizer.optimizer.zero_grad()
     12 
---> 13         output = model(src, trg[:,:-1])
     14         #output = [batch size, trg sent len - 1, output dim]
     15         #trg = [batch size, trg sent len]

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-19-ee8c0f2d26d1> in forward(self, src, trg)
     31         #trg = [batch size, trg sent len]
     32         src_mask, trg_mask = self.make_masks(src, trg)
---> 33         src_inject = self.embedder.get_embeddings(src,src.size()[1])
     34         src_inject = torch.stack(src_inject).cuda()
     35         enc_src = self.encoder(src, src_inject, src_mask)

~/project/Embedder.py in get_embeddings(self, primary_sequences, d_embed, alphabet, pack_only, compile_only)
     27         if pack_only == True:
     28             return fully_packed_sequences
---> 29         compiled_sequences = self.encoder.embedding(fully_packed_sequences)
     30         if compile_only == True:
     31             return compiled_sequences

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
    119 
    120         if self.lm:
--> 121             h = self.embed(x)
    122         else:
    123             if type(x) is PackedSequence:

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
     23     def forward(self, x):
     24         packed = type(x) is PackedSequence
---> 25         h_lm = self.lm.encode(x)
     26 
     27         # embed and unpack if packed

~/project/src/models/sequence.py in encode(self, x)
    180 
    181     def encode(self, x):
--> 182         z_fwd,z_rvs = self.embed_and_split(x, pad=True)
    183         h_fwd_layers,h_rvs_layers = self.transform(z_fwd, z_rvs)
    184 

~/project/src/models/sequence.py in embed_and_split(self, x, pad)
    156         if packed:
    157             lengths = [s-1 for s in batch_sizes]
--> 158             z_fwd = pack_padded_sequence(z_fwd, lengths, batch_first=True)
    159             z_rvs = pack_padded_sequence(z_rvs, lengths, batch_first=True)
    160         # reverse z_rvs

/opt/conda/lib/python3.6/site-packages/torch/nn/utils/rnn.py in pack_padded_sequence(input, lengths, batch_first)
    146                       category=torch.jit.TracerWarning, stacklevel=2)
    147     lengths = torch.as_tensor(lengths, dtype=torch.int64)
--> 148     return PackedSequence(torch._C._VariableFunctions._pack_padded_sequence(input, lengths, batch_first))
    149 
    150 

RuntimeError: CUDA error: device-side assert triggered

Is your code running fine on the CPU?
Sometimes the error messages thrown on the CPU are clearer than the CUDA ones.

No, the code does not run on the CPU,
but this is because it is expecting a CPU tensor and receives a GPU tensor instead:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-32-f94813b879a4> in <module>
     11     start_time = time.time()
     12 
---> 13     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
     14     valid_loss = evaluate(model, valid_iterator, criterion)
     15 

<ipython-input-28-f807d4f617eb> in train(model, iterator, optimizer, criterion, clip)
     11         optimizer.optimizer.zero_grad()
     12 
---> 13         output = model(src, trg[:,:-1])
     14         #output = [batch size, trg sent len - 1, output dim]
     15         #trg = [batch size, trg sent len]

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-19-ee8c0f2d26d1> in forward(self, src, trg)
     31         #trg = [batch size, trg sent len]
     32         src_mask, trg_mask = self.make_masks(src, trg)
---> 33         src_inject = self.embedder.get_embeddings(src,src.size()[1])
     34         src_inject = torch.stack(src_inject).cuda()
     35         enc_src = self.encoder(src, src_inject, src_mask)

~/project/Embedder.py in get_embeddings(self, primary_sequences, d_embed, alphabet, pack_only, compile_only)
     27         if pack_only == True:
     28             return fully_packed_sequences
---> 29         compiled_sequences = self.encoder.embedding(fully_packed_sequences)
     30         if compile_only == True:
     31             return compiled_sequences

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
    119 
    120         if self.lm:
--> 121             h = self.embed(x)
    122         else:
    123             if type(x) is PackedSequence:

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
     23     def forward(self, x):
     24         packed = type(x) is PackedSequence
---> 25         h_lm = self.lm.encode(x)
     26 
     27         # embed and unpack if packed

~/project/src/models/sequence.py in encode(self, x)
    180 
    181     def encode(self, x):
--> 182         z_fwd,z_rvs = self.embed_and_split(x, pad=True)
    183         h_fwd_layers,h_rvs_layers = self.transform(z_fwd, z_rvs)
    184 

~/project/src/models/sequence.py in embed_and_split(self, x, pad)
    149 
    150         # now, encode x as distributed vectors
--> 151         z = self.embed(x)
    152 
    153         # to pass to transform, we discard the last element for z_fwd and the first element for z_rvs

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/sparse.py in forward(self, input)
    116         return F.embedding(
    117             input, self.weight, self.padding_idx, self.max_norm,
--> 118             self.norm_type, self.scale_grad_by_freq, self.sparse)
    119 
    120     def extra_repr(self):

/opt/conda/lib/python3.6/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1452         # remove once script supports set_grad_enabled
   1453         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1454     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1455 
   1456 

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'

So I loaded the pre-trained model onto the GPU instead:

from Embedder import Embedder
e = Embedder()
e.get_embedder().cuda() #pretrained model loaded onto the GPU here
pad_idx = SRC.vocab.stoi['<pad>']
model = Seq2Seq(enc, dec, pad_idx, device, e).to(device)

Try to remove the .cuda() calls and replace them with .to(device), so that you can write device-agnostic code.
Switching device = 'cpu' in your script should yield a CPU-only run.
Once this is done, you might get a better error message for the initial issue.

I get the same error when I replace .cuda() with .to('cpu').

Then other tensors are still pushed to the GPU.
Check e.g. index and try to replace hardcoded cuda() calls with to(device) calls.

I’ve tried hard-coding CPU calls throughout the process flow and still get the same error (even when the same tensor is moved to the CPU). Only a single tensor is involved in this process:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-32-f94813b879a4> in <module>
     11     start_time = time.time()
     12 
---> 13     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
     14     valid_loss = evaluate(model, valid_iterator, criterion)
     15 

<ipython-input-28-f807d4f617eb> in train(model, iterator, optimizer, criterion, clip)
     11         optimizer.optimizer.zero_grad()
     12 
---> 13         output = model(src, trg[:,:-1])
     14         #output = [batch size, trg sent len - 1, output dim]
     15         #trg = [batch size, trg sent len]

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-19-15e3ff9eb197> in forward(self, src, trg)
     32         src_mask, trg_mask = self.make_masks(src, trg)
     33 
---> 34         src_inject = self.embedder.get_embeddings(src,src.size()[1])
     35         src_inject = torch.stack(src_inject).cuda()
     36         enc_src = self.encoder(src, src_inject, src_mask)

~/project/Embedder.py in get_embeddings(self, primary_sequences, d_embed, alphabet, pack_only, compile_only)
     28             return fully_packed_sequences
     29         fully_packed_sequences.to('cpu') #the input is moved to the CPU
---> 30         compiled_sequences = self.encoder.embedding(fully_packed_sequences)
     31         if compile_only == True:
     32             return compiled_sequences

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
    119 
    120         if self.lm:
--> 121             h = self.embed(x)
    122         else:
    123             if type(x) is PackedSequence:

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
     23     def forward(self, x):
     24         packed = type(x) is PackedSequence
---> 25         h_lm = self.lm.encode(x)
     26 
     27         # embed and unpack if packed

~/project/src/models/sequence.py in encode(self, x)
    182     def encode(self, x):
    183         x.to('cpu') #again
--> 184         z_fwd,z_rvs = self.embed_and_split(x, pad=True)
    185         h_fwd_layers,h_rvs_layers = self.transform(z_fwd, z_rvs)
    186 

~/project/src/models/sequence.py in embed_and_split(self, x, pad)
    150         # now, encode x as distributed vectors
    151         x.to('cpu') # a third time
--> 152         z = self.embed(x)
    153 
    154         # to pass to transform, we discard the last element for z_fwd and the first element for z_rvs

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/sparse.py in forward(self, input)
    116         return F.embedding(
    117             input, self.weight, self.padding_idx, self.max_norm,
--> 118             self.norm_type, self.scale_grad_by_freq, self.sparse)
    119 
    120     def extra_repr(self):

/opt/conda/lib/python3.6/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1452         # remove once script supports set_grad_enabled
   1453         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1454     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1455 
   1456 

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'