Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_mm

This is my model (only showing the __init__() and forward() functions):

class BERTplusAoA(nn.Module):
    def __init__(self, config, options):
        super(BERTplusAoA, self).__init__()
        self.bert = BertModel.from_pretrained(
            options.model_name_or_path,
            from_tf=bool(".ckpt" in options.model_name_or_path),
            config=config,
            cache_dir=options.cache_dir if options.cache_dir else None,
        )
        self.l0 = nn.Linear(1024, 2)
        self.lq = nn.Linear(1024, 256)
        self.lc = nn.Linear(1024, 256)
        self.soft = nn.Softmax(dim=-1)
        self.wb = torch.sigmoid(nn.Parameter(torch.zeros(1)))
        self.we = torch.sigmoid(nn.Parameter(torch.zeros(1)))

    def forward(self, input_ids, attention_mask, token_type_ids, start_positions, end_positions):
        sequence_out, _ = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # (batch_size, num_tokens, 1024)
        logits = self.l0(sequence_out)
        # (batch_size, num_tokens, 2)
        start_logits, end_logits = logits.split(1, dim=-1)
        # each: (batch_size, num_tokens, 1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        # each: (batch_size, num_tokens)

        lb = F.softmax(start_logits, dim=-1)
        le = F.softmax(end_logits, dim=-1)

        hq, hc = self.splitting_(sequence_out, token_type_ids)
        # hq, hc are lists of tensors

        hq, hq_len = self._batchify(hq, include_lengths=True)
        hc, hc_len = self._batchify(hc, include_lengths=True)
        # hq, hc are now tensors of dim (batch_size, max_length of q or c, 1024)

        Hq = self.lq(hq)
        Hq_b, Hq_e = Hq.split(128, dim=-1)

        Hc = self.lc(hc)
        Hc_b, Hc_e = Hc.split(128, dim=-1)
        ...

I get the following error:

Epoch:   0% 0/20 [00:00<?, ?it/s]
Iteration:   0% 0/6464 [00:00<?, ?it/s]Traceback (most recent call last):
  File "run_techqa.py", line 623, in <module>
    main()
  File "run_techqa.py", line 617, in main
    model = train(args, train_dataset, model, optimizer, tokenizer, model_evaluator)
  File "run_techqa.py", line 223, in train
    outputs = model(**inputs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/apex/amp/_initialize.py", line 197, in new_fwd
    **applier(kwargs, input_caster))
  File "/content/MyDrive/IBM/TechQA-Base/techqa-master/model_techqa.py", line 109, in forward
    Hq = self.lq(hq)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/linear.py", line 87, in forward
    return F.linear(input, self.weight, self.bias)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py", line 1372, in linear
    output = input.matmul(weight.t())
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_mm

Did you try sending input to the GPU?

Hi @Kushagra_Bhatia,
The issue is with how you pass your inputs to the model during training. Can you post the part of your code where you are training?

Hello, @harsha_g and @SANTOSH_S. I believe the dataset is on CUDA.
The training loop is shown below. Also, args.device = device(type='cuda').

    for _ in train_iterator:
        # train_iterator is basically range(0, total_epochs). However in case we resume training then
        # train_iterator becomes range(completed_epochs, total_epochs)
        # In every epoch we compute the mini-batches again
        train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         
        for step, batch in enumerate(epoch_iterator):
            # batch is an element in dataloader. i.e. effectively a mini-batch

            # Skip past any already trained steps if resuming training. i.e. number of mini-batches trained already in the epoch
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["roberta"]:
                del inputs["token_type_ids"]  # RoBERTa doesn't require token_type_ids

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)

Then, I am assuming you also put your model on the GPU. Didn’t you? :wink:

Yes, I did put it on the GPU.

def load_model(args, model_class, config):

    model = BERTplusAoA(config, args)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
...

I am not entirely sure, but could it be that you have another model, BertModel, inside your actual model, and that it also needs to be put on the device?

I imported it from transformers library.

from transformers import BertModel

I get that. I am just speculating whether the following calls will help. I am not entirely sure; I’d just try them.

BertModel = BertModel.to(device)

or

self.bert = BertModel.to(device).from_pretrained(
    options.model_name_or_path,
    from_tf=bool(".ckpt" in options.model_name_or_path),
    config=config,
    cache_dir=options.cache_dir if options.cache_dir else None,
)

No, it doesn’t resolve the error. Rather, it throws another one:

  BertModel.to(options.device)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 425, in to
    return self._apply(convert)
AttributeError: 'torch.device' object has no attribute '_apply'

I don’t think we are supposed to move the BertModel class itself to the device. For example, the following code snippet is from huggingface’s official implementation of the BertForQuestionAnswering model.

class BertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()
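For what it’s worth, nn.Module.to(device) moves every registered submodule along with its parameters, so self.bert should already follow model.to(args.device). A minimal sketch (toy module names; assumes a CUDA device is available):

import torch
import torch.nn as nn

class Wrapper(nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = nn.Linear(4, 2)  # registered as a submodule

wrapper = Wrapper().to("cuda")
# The nested module's parameters moved along with the parent:
print(next(wrapper.inner.parameters()).device)  # cuda:0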

Got it. Thanks for trying though. :+1:

@Kushagra_Bhatia Another keen look at the traceback and another thought: it’s evident the issue is at the Hq = self.lq(hq) line :point_up:. How about passing the device argument to the model and then putting hq (and later hc) on the device? I guess I was a little sloppy earlier. :slight_smile:
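In code, that suggestion would look roughly like this inside forward (a sketch, assuming the padded tensors come back on the CPU while the model sits on CUDA):

hq, hq_len = self._batchify(hq, include_lengths=True)
hc, hc_len = self._batchify(hc, include_lengths=True)
# Move the padded batches onto the same device as the model's weights:
hq = hq.to(input_ids.device)
hc = hc.to(input_ids.device)

Hq = self.lq(hq)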


That actually solved my error. However, I am getting a new error now, which I am not able to resolve. It is as follows:

  File "run_techqa.py", line 623, in <module>
    main()
  File "run_techqa.py", line 617, in main
    model = train(args, train_dataset, model, optimizer, tokenizer, model_evaluator)
  File "run_techqa.py", line 223, in train
    outputs = model(**inputs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/apex/amp/_initialize.py", line 197, in new_fwd
    **applier(kwargs, input_caster))
  File "/content/MyDrive/IBM/TechQA-Base/techqa-master/model_techqa.py", line 113, in forward
    Hq = self.lq(hq)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/linear.py", line 87, in forward
    return F.linear(input, self.weight, self.bias)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py", line 1372, in linear
    output = input.matmul(weight.t())
RuntimeError: Expected object of scalar type Float but got scalar type Half for argument #2 'mat2' in call to _th_mm

I used two functions in the forward pass which I had not shown previously:

    def splitting_(self, ids, type_ids):
        # Split each sequence's BERT output into question tokens and context
        # tokens, using the boundaries marked by token_type_ids.
        ans = []
        for i, lis in enumerate(ids):
            typei = type_ids[i]
            ends = []
            for j in range(len(typei) - 1):
                if typei[j] != typei[j + 1]:
                    ends.append(j + 1)
            if len(ends) == 1:      # no padding after the context
                qend = ends[0]
                dend = len(typei)
            else:                   # second boundary marks the start of padding
                qend, dend = ends[0], ends[1]
            qtns = lis[:qend]
            dtns = lis[qend:dend]
            ans.append([qtns, dtns])
        q = [i[0] for i in ans]
        d = [i[1] for i in ans]
        return q, d

    # Takes a list of n tensors (sentences encoded using the tokenizer) and
    # pads them into a single tensor with n rows.
    def _batchify(self, data, align_right=False, include_lengths=False):
        lengths = [x.size(0) for x in data]
        max_length = max(lengths)
        # Padding template: a fresh CPU float32 vector, stacked up to shape
        # (len(data), max_length, 1024).
        tens = torch.rand(1024, dtype=torch.float32)
        out = torch.stack([torch.stack([tens for _ in range(max_length)]) for _ in range(len(data))])
        for i in range(len(data)):
            data_length = data[i].size(0)
            offset = max_length - data_length if align_right else 0
            out[i].narrow(0, offset, data_length).copy_(data[i])
        if include_lengths:
            return out, lengths
        else:
            return out
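A toy run of _batchify (a sketch; model stands for a BERTplusAoA instance and the shapes are made up) shows why both tracebacks point at hq: the padding template is a fresh CPU float32 tensor, so the stacked output is CPU/float32 regardless of where, and in what precision, the BERT activations were produced:

import torch

# Fake half-precision CUDA activations, the way apex amp would hand them over
# (hypothetical toy data; assumes a CUDA device is available):
q = [torch.randn(3, 1024, device="cuda").half(),
     torch.randn(5, 1024, device="cuda").half()]

out, lengths = model._batchify(q, include_lengths=True)
print(out.device, out.dtype)  # cpu torch.float32 -- matches both errors
print(lengths)                # [3, 5]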

Would really appreciate your help!

Note: I have already tried hq = hq.float() and hq = torch.tensor(hq, dtype=torch.float32).

@Kushagra_Bhatia Glad it worked. Now, on to the next one. Putting the two errors in context, it looks like something’s off with the weights (self.wb and self.we) this time, not hq. Try changing the type of the weights. I am sure that will take care of it. Let us know how it goes.

I don’t exactly understand; the traceback doesn’t explicitly mention self.wb and self.we, so how do you infer it’s the two weights? They are defined as self.wb = torch.sigmoid(nn.Parameter(torch.zeros(1))). I wanted a weight to combine two vectors, used like ans_beg = torch.mul(self.wb, lb) + torch.mul(1 - self.wb, s1): basically a learnable weight between 0 and 1 so that the two vectors can be effectively averaged. What do you think I need to change in the definition of wb and we?
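As an aside on that definition: torch.sigmoid(nn.Parameter(torch.zeros(1))) returns a plain tensor, so wb and we end up unregistered; they will neither be trained nor moved by model.to(device). A common pattern (a sketch, not a confirmed fix for this thread) is to store the raw parameter and squash it at use time:

import torch
import torch.nn as nn

class Gate(nn.Module):
    def __init__(self):
        super().__init__()
        # Raw weight stays a registered Parameter: learnable, follows .to(device)
        self.wb = nn.Parameter(torch.zeros(1))

    def forward(self, lb, s1):
        wb = torch.sigmoid(self.wb)      # squashed into (0, 1) in the forward pass
        return wb * lb + (1 - wb) * s1   # learnable convex combination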

Lecture time :slight_smile:

File "/content/MyDrive/IBM/TechQA-Base/techqa-master/model_techqa.py", line 113, in forward
   Hq = self.lq(hq)
 File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
   result = self.forward(*input, **kwargs)
 File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/linear.py", line 87, in forward
   return F.linear(input, self.weight, self.bias)
 File "/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py", line 1372, in linear
   output = input.matmul(weight.t())
RuntimeError: Expected object of scalar type Float but got scalar type Half for argument #2 'mat2' in call to _th_mm

If you look carefully at the traceback, you will realize that the error happened at the line Hq = self.lq(hq). What’s happening there? You are making a call to the forward (the next frame) of that module lq. What does it have? An F.linear call. And what happens in that call? input.matmul(weight.t()). So, if you trace back, the input to the module lq is hq, and the weight is? You guessed it right: torch.sigmoid(nn.Parameter(torch.zeros(1))).

Now, the first error you posted said RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_mm. This means there’s an mm (matrix multiplication) happening on the device that takes two inputs, the first being input and the second being weight.t(). So the earlier error was solved by sending the input, aka hq, to the device. Likewise, the second error will be taken care of by fixing the datatype of weight.t(), aka self.wb. How? By specifying .float() or .double() when you call torch.zeros().
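As a minimal standalone illustration of that dtype rule (a toy snippet; the exact wording of the error varies with the PyTorch version):

import torch

a = torch.randn(2, 3, device="cuda")         # float32 activations
w = torch.randn(4, 3, device="cuda").half()  # float16 (Half) weight
out = a.matmul(w.t())  # RuntimeError: Float/Half dtype mismatch in mm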

I know I wasn’t extremely clear but you get the point. :slight_smile:


The F.linear call comes from nn.Linear in the torch.nn module. Inside the nn.Linear layer, the code is as follows (official documentation: https://pytorch.org/docs/master/_modules/torch/nn/modules/linear.html#Linear):

class Linear(Module):

    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: Tensor

    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        return F.linear(input, self.weight, self.bias)

As you can see, the line

self.weight = Parameter(torch.Tensor(out_features, in_features))

defines the weight that is passed to F.linear, which performs the output = input.matmul(weight.t()) operation.

Please correct me if I am wrong.


Right on the money.

So, did you manage to solve your latest error?

Ah, I see. I got that part wrong: I misread Sigmoid as Linear. :man_facepalming: But you got it right.

Did you try double or float64 instead?

If I do hq = hq.double() or hq = torch.tensor(hq, dtype=torch.float64) before moving hq to the device, I get the error:

RuntimeError: Expected object of scalar type Float but got scalar type Half for argument #2 'mat2' in call to _th_mm

If I first move hq to the device and then do hq = torch.tensor(hq, dtype=torch.float64), I get:

Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_mm
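One untested sketch that satisfies both constraints at once is to match hq to the linear layer’s own device and dtype immediately before the call (this assumes amp left self.lq.weight in half precision):

# Align hq with self.lq's weight in both device and dtype before the matmul:
w = self.lq.weight
hq = hq.to(device=w.device, dtype=w.dtype)
Hq = self.lq(hq)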