CUDA device assert triggered in pack_padded_sequence

Adrian_Coutsoftides · May 16, 2019, 6:54am

I moved a pretrained embedding model onto the GPU in an effort to make my model faster, and I keep getting this device-side assert. I'm running the code with CUDA_LAUNCH_BLOCKING=1 and this is the (rather unhelpful) stack trace I received. Any ideas?

RuntimeError                              Traceback (most recent call last)
<ipython-input-32-f94813b879a4> in <module>
     11     start_time = time.time()
     12 
---> 13     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
     14     valid_loss = evaluate(model, valid_iterator, criterion)
     15 

<ipython-input-28-f807d4f617eb> in train(model, iterator, optimizer, criterion, clip)
     11         optimizer.optimizer.zero_grad()
     12 
---> 13         output = model(src, trg[:,:-1])
     14         #output = [batch size, trg sent len - 1, output dim]
     15         #trg = [batch size, trg sent len]

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-19-ee8c0f2d26d1> in forward(self, src, trg)
     31         #trg = [batch size, trg sent len]
     32         src_mask, trg_mask = self.make_masks(src, trg)
---> 33         src_inject = self.embedder.get_embeddings(src,src.size()[1])
     34         src_inject = torch.stack(src_inject).cuda()
     35         enc_src = self.encoder(src, src_inject, src_mask)

~/project/Embedder.py in get_embeddings(self, primary_sequences, d_embed, alphabet, pack_only, compile_only)
     27         if pack_only == True:
     28             return fully_packed_sequences
---> 29         compiled_sequences = self.encoder.embedding(fully_packed_sequences)
     30         if compile_only == True:
     31             return compiled_sequences

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
    119 
    120         if self.lm:
--> 121             h = self.embed(x)
    122         else:
    123             if type(x) is PackedSequence:

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
     23     def forward(self, x):
     24         packed = type(x) is PackedSequence
---> 25         h_lm = self.lm.encode(x)
     26 
     27         # embed and unpack if packed

~/project/src/models/sequence.py in encode(self, x)
    180 
    181     def encode(self, x):
--> 182         z_fwd,z_rvs = self.embed_and_split(x, pad=True)
    183         h_fwd_layers,h_rvs_layers = self.transform(z_fwd, z_rvs)
    184 

~/project/src/models/sequence.py in embed_and_split(self, x, pad)
    156         if packed:
    157             lengths = [s-1 for s in batch_sizes]
--> 158             z_fwd = pack_padded_sequence(z_fwd, lengths, batch_first=True)
    159             z_rvs = pack_padded_sequence(z_rvs, lengths, batch_first=True)
    160         # reverse z_rvs

/opt/conda/lib/python3.6/site-packages/torch/nn/utils/rnn.py in pack_padded_sequence(input, lengths, batch_first)
    146                       category=torch.jit.TracerWarning, stacklevel=2)
    147     lengths = torch.as_tensor(lengths, dtype=torch.int64)
--> 148     return PackedSequence(torch._C._VariableFunctions._pack_padded_sequence(input, lengths, batch_first))
    149 
    150 

RuntimeError: CUDA error: device-side assert triggered

ptrblck · May 16, 2019, 10:59am

Is your code running fine on the CPU?
Sometimes the error messages thrown on the CPU are clearer than the CUDA ones.

Adrian_Coutsoftides · May 16, 2019, 11:05am

No, the code does not run on the CPU,
but this is because it is expecting a CPU tensor and receives a GPU tensor instead:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-32-f94813b879a4> in <module>
     11     start_time = time.time()
     12 
---> 13     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
     14     valid_loss = evaluate(model, valid_iterator, criterion)
     15 

<ipython-input-28-f807d4f617eb> in train(model, iterator, optimizer, criterion, clip)
     11         optimizer.optimizer.zero_grad()
     12 
---> 13         output = model(src, trg[:,:-1])
     14         #output = [batch size, trg sent len - 1, output dim]
     15         #trg = [batch size, trg sent len]

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-19-ee8c0f2d26d1> in forward(self, src, trg)
     31         #trg = [batch size, trg sent len]
     32         src_mask, trg_mask = self.make_masks(src, trg)
---> 33         src_inject = self.embedder.get_embeddings(src,src.size()[1])
     34         src_inject = torch.stack(src_inject).cuda()
     35         enc_src = self.encoder(src, src_inject, src_mask)

~/project/Embedder.py in get_embeddings(self, primary_sequences, d_embed, alphabet, pack_only, compile_only)
     27         if pack_only == True:
     28             return fully_packed_sequences
---> 29         compiled_sequences = self.encoder.embedding(fully_packed_sequences)
     30         if compile_only == True:
     31             return compiled_sequences

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
    119 
    120         if self.lm:
--> 121             h = self.embed(x)
    122         else:
    123             if type(x) is PackedSequence:

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
     23     def forward(self, x):
     24         packed = type(x) is PackedSequence
---> 25         h_lm = self.lm.encode(x)
     26 
     27         # embed and unpack if packed

~/project/src/models/sequence.py in encode(self, x)
    180 
    181     def encode(self, x):
--> 182         z_fwd,z_rvs = self.embed_and_split(x, pad=True)
    183         h_fwd_layers,h_rvs_layers = self.transform(z_fwd, z_rvs)
    184 

~/project/src/models/sequence.py in embed_and_split(self, x, pad)
    149 
    150         # now, encode x as distributed vectors
--> 151         z = self.embed(x)
    152 
    153         # to pass to transform, we discard the last element for z_fwd and the first element for z_rvs

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/sparse.py in forward(self, input)
    116         return F.embedding(
    117             input, self.weight, self.padding_idx, self.max_norm,
--> 118             self.norm_type, self.scale_grad_by_freq, self.sparse)
    119 
    120     def extra_repr(self):

/opt/conda/lib/python3.6/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1452         # remove once script supports set_grad_enabled
   1453         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1454     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1455 
   1456 

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'

So I loaded the pre-trained model onto the GPU instead:

from Embedder import Embedder
e = Embedder()
e.get_embedder().cuda() #pretrained model loaded onto the GPU here
pad_idx = SRC.vocab.stoi['<pad>']
model = Seq2Seq(enc, dec, pad_idx, device, e).to(device)

ptrblck · May 16, 2019, 11:15am

Try to remove the .cuda() calls and replace them with .to(device), so that you can write device-agnostic code.
Switching device = 'cpu' in your script should yield a CPU-only run.
Once this is done, you might get a better error message for the initial issue.

Adrian_Coutsoftides · May 16, 2019, 11:19am

I get the same error when I replace .cuda() with .to('cpu').

ptrblck · May 16, 2019, 11:22am

Then other tensors are still pushed to the GPU.
Check e.g. index and try to replace hardcoded cuda() calls with to(device) calls.

Adrian_Coutsoftides · May 16, 2019, 11:32am

I’ve tried hard-coding CPU calls throughout the process flow and still get the same error (even when the same tensor is moved to the CPU). Only a single tensor is involved in this process:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-32-f94813b879a4> in <module>
     11     start_time = time.time()
     12 
---> 13     train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
     14     valid_loss = evaluate(model, valid_iterator, criterion)
     15 

<ipython-input-28-f807d4f617eb> in train(model, iterator, optimizer, criterion, clip)
     11         optimizer.optimizer.zero_grad()
     12 
---> 13         output = model(src, trg[:,:-1])
     14         #output = [batch size, trg sent len - 1, output dim]
     15         #trg = [batch size, trg sent len]

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-19-15e3ff9eb197> in forward(self, src, trg)
     32         src_mask, trg_mask = self.make_masks(src, trg)
     33 
---> 34         src_inject = self.embedder.get_embeddings(src,src.size()[1])
     35         src_inject = torch.stack(src_inject).cuda()
     36         enc_src = self.encoder(src, src_inject, src_mask)

~/project/Embedder.py in get_embeddings(self, primary_sequences, d_embed, alphabet, pack_only, compile_only)
     28             return fully_packed_sequences
     29         fully_packed_sequences.to('cpu') #the input is moved to the CPU
---> 30         compiled_sequences = self.encoder.embedding(fully_packed_sequences)
     31         if compile_only == True:
     32             return compiled_sequences

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
    119 
    120         if self.lm:
--> 121             h = self.embed(x)
    122         else:
    123             if type(x) is PackedSequence:

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

~/project/src/models/embedding.py in forward(self, x)
     23     def forward(self, x):
     24         packed = type(x) is PackedSequence
---> 25         h_lm = self.lm.encode(x)
     26 
     27         # embed and unpack if packed

~/project/src/models/sequence.py in encode(self, x)
    182     def encode(self, x):
    183         x.to('cpu') #again
--> 184         z_fwd,z_rvs = self.embed_and_split(x, pad=True)
    185         h_fwd_layers,h_rvs_layers = self.transform(z_fwd, z_rvs)
    186 

~/project/src/models/sequence.py in embed_and_split(self, x, pad)
    150         # now, encode x as distributed vectors
    151         x.to('cpu') # a third time
--> 152         z = self.embed(x)
    153 
    154         # to pass to transform, we discard the last element for z_fwd and the first element for z_rvs

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/sparse.py in forward(self, input)
    116         return F.embedding(
    117             input, self.weight, self.padding_idx, self.max_norm,
--> 118             self.norm_type, self.scale_grad_by_freq, self.sparse)
    119 
    120     def extra_repr(self):

/opt/conda/lib/python3.6/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1452         # remove once script supports set_grad_enabled
   1453         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1454     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1455 
   1456 

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'