GPU integer multiplication overflow

Hi, I have a simple model below that runs on CPU and returns the output as expected. The moment I move this model to CUDA I get an integer multiplication overflow error. This model only embeds a set of integers, so it’s not clear to me where this is coming from.

import torch

device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))

class SmallModel(torch.nn.Module):
    def __init__(self, skill_num, emb_size, max_seq_length):
        super().__init__()

        self.emb_size = emb_size
        self.skill_num = skill_num
        self.max_seq_length = max_seq_length

        self.skill_embeddings = torch.nn.Embedding(self.skill_num, self.emb_size)
        self.inter_embeddings = torch.nn.Embedding(self.skill_num * 2, self.emb_size)
        self.embd_pos = torch.nn.Embedding(self.max_seq_length, self.emb_size)

    def forward(self, x, y):
        query = self.skill_embeddings(x)  # shape: bs x seq_len x emb_size
#         mask_labels = y * (y > -1).long()
#         # mask_labels = mask_labels.to(device)
#         key = self.inter_embeddings(x + mask_labels * self.skill_num)
#         values = self.inter_embeddings(x + mask_labels * self.skill_num)
#         pos = self.embd_pos(torch.arange(x.shape[1]))
#         key = key + pos
#         query = query + pos
        return query

# create some mock data: 5 students with sequence length 10
input = abs(torch.ceil(torch.randn(5, 10) * 100)).type(torch.int)
output = torch.zeros(5, 10).type(torch.int)
output[1, 3] = 1
output[2, 5] = 1
output[4, 9] = -1

# set model params
skill_num = int(torch.max(input).numpy()) + 1
emb_size = 12
max_seq_length = 10

# init model
test_mod = SmallModel(skill_num=skill_num, emb_size=emb_size, max_seq_length=max_seq_length).to(device)

# run
# query = test_mod(input, output)  # with cpu
query = test_mod(input.to(device), output.to(device))  # with gpu
query

Error message

RuntimeError                              Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

/opt/conda/lib/python3.7/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

/opt/conda/lib/python3.7/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    698     """A pprint that just redirects to the normal repr function."""
    699     # Find newlines and replace them with p.break_()
--> 700     output = repr(obj)
    701     lines = output.splitlines()
    702     with p.group():

/opt/conda/lib/python3.7/site-packages/torch/_tensor.py in __repr__(self, tensor_contents)
    425             )
    426         # All strings are unicode in Python 3.
--> 427         return torch._tensor_str._str(self, tensor_contents=tensor_contents)
    428 
    429     def backward(

/opt/conda/lib/python3.7/site-packages/torch/_tensor_str.py in _str(self, tensor_contents)
    635     with torch.no_grad():
    636         guard = torch._C._DisableFuncTorch()
--> 637         return _str_intern(self, tensor_contents=tensor_contents)

/opt/conda/lib/python3.7/site-packages/torch/_tensor_str.py in _str_intern(inp, tensor_contents)
    566                         tensor_str = _tensor_str(self.to_dense(), indent)
    567                     else:
--> 568                         tensor_str = _tensor_str(self, indent)
    569 
    570     if self.layout != torch.strided:

/opt/conda/lib/python3.7/site-packages/torch/_tensor_str.py in _tensor_str(self, indent)
    326         )
    327     else:
--> 328         formatter = _Formatter(get_summarized_data(self) if summarize else self)
    329         return _tensor_str_with_formatter(self, indent, summarize, formatter)
    330 

/opt/conda/lib/python3.7/site-packages/torch/_tensor_str.py in __init__(self, tensor)
    114         else:
    115             nonzero_finite_vals = torch.masked_select(
--> 116                 tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0)
    117             )
    118 

RuntimeError: numel: integer multiplication overflow


Could you check the number of elements in your input and whether it would overflow int32?
If so, you might need to use long as the dtype.
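
For anyone following along, a minimal sketch of that check might look like this (x is just a stand-in for the actual input tensor):

import torch

x = torch.randint(0, 100, (5, 10))            # stand-in for the real input
print(x.numel())                              # 50 here, far below 2**31 - 1
if x.numel() > torch.iinfo(torch.int32).max:  # int32 max is 2,147,483,647
    x = x.long()                              # cast indices to int64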

Thank you for the response.

torch.numel(input) gives 50, which is what I expect for a (5, 10) tensor. I tried converting input to a long tensor as below, and I still get the same error.

input = input.type(torch.LongTensor)

And to be clear, this only happens on CUDA; it works fine on CPU.

I cannot reproduce the issue using your code on my GPU with 2.1.0.dev20230605+cu121, and based on the provided shapes the error should never be raised.

Good to know. I wonder if it’s a version issue.


I doubt it, as I also cannot reproduce it in 1.13.1+cu117.

Ok, this is an extremely bizarre thing. I reduced it all to bare bones and made the input completely explicit. The embedding layer outside of the model returns what it should. The model itself runs, and if I call out.shape I get the right shape. The moment I evaluate out to print the output, I get the same int overflow error. This is on Vertex AI, Python version 3.7.12.

Is this possibly because the output is on the GPU and printing it triggers the error? I don’t have a clue what else could be going on with something this simple.

import torch

device = 'cuda'

class SmallModel(torch.nn.Module):
    def __init__(self, skill_num, emb_size):
        super().__init__()

        self.emb_size = emb_size
        self.skill_num = skill_num
        self.skill_embeddings = torch.nn.Embedding(self.skill_num, self.emb_size)

    def forward(self, x):
        query = self.skill_embeddings(x)
        return query

inp = torch.tensor([[1, 2, 3, 4], [4, 5, 2, 3]])
test_mod = SmallModel(skill_num=6, emb_size=12).to(device)

out = test_mod(inp.to(device))
out.shape

Follow-up: out.to('cpu') gets rid of the error.
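
In other words, copying the result back to the host before printing sidesteps the failing CUDA formatting path; roughly:

out = test_mod(inp.to(device))
out_cpu = out.to('cpu')  # copy device -> host first
print(out_cpu)           # repr now runs on a CPU tensor and succeeds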

I don’t know what Vertex AI is, but I’m just printing the CUDA tensor and get the valid result.
For the sake of completeness: the error is raised here, and I don’t know how it can fail in your setup.
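
For reference, the line the traceback points at boils down to roughly this (paraphrasing _Formatter in torch/_tensor_str.py, where tensor_view is the flattened tensor being printed):

# the formatter selects the finite, nonzero values to decide print width
nonzero_finite_vals = torch.masked_select(
    tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0)
)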

Thanks for the help. Vertex AI is just Google’s cloud computing platform. To close this out, I restarted the kernel and added this line before setting the device, and the problem seems to be gone.


os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
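
For context: CUDA kernels launch asynchronously, so an error from an earlier kernel can be reported at a later, unrelated call such as the tensor repr. Setting CUDA_LAUNCH_BLOCKING=1 makes every launch synchronous, so failures surface at the op that actually caused them. A minimal sketch of the ordering (the variable must be set before CUDA is first initialized):

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # set before the first CUDA call

import torch

device = 'cuda'
x = torch.tensor([[1, 2, 3, 4], [4, 5, 2, 3]], device=device)
print(x)  # any kernel error now surfaces here, at the failing op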