Tensors assigned to different devices

I’m trying to train a suite of models on Google Colab. My training function looks like this:

def train_models(path,progress=gradio.Progress(track_tqdm=True)):
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    trainer = qarac.models.QaracTrainerModel.QaracTrainerModel('roberta-base', 
                                                               tokenizer)
    
    trainer.to(device)
    loss_fn = CombinedLoss()
    loss_fn.cuda()
    optimizer = torch.optim.NAdam(trainer.parameters(),lr=5.0e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.9)
    training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                all_text='corpora/all_text.csv',
                                                                question_answering='corpora/question_answering.csv',
                                                                reasoning='corpora/reasoning_train.csv',
                                                                consistency='corpora/consistency.csv',
                                                                device=device)
    n_batches = len(training_data)
    history = {}
    for epoch in range(25):
        print("Epoch",epoch)
        epoch_label = 'Epoch {}'.format(epoch)
        epoch_data = {}
        for (batch,(X,Y)) in enumerate(tqdm.tqdm(training_data)):
            prediction = trainer(X['all_text'],
                                 X['offset_text'],
                                 X['question'],
                                 X['answer'],
                                 X['proposition0'],
                                 X['proposition1'],
                                 X['conclusion_offset'],
                                 X['statement0'],
                                 X['statement1'])
            loss = loss_fn(prediction,Y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if batch % 1024 == 0 or batch == n_batches-1:
                epoch_data[batch] = loss.item()
        history[epoch_label] = epoch_data
        scheduler.step()
    huggingface_hub.login(token=userdata.get('HUGGINGFACE_TOKEN'))
    trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
    trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
    trainer.decoder.push_to_hub('{}/qarac-roberta-decoder'.format(path))
    return history

To see the models in more detail, go to QARAC on GitHub.

When I try to run it, I get the following error:

---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

<ipython-input-4-56b95fe40192> in <cell line: 1>()
----> 1 scripts.train_models('PlayfulTechnology')

9 frames

/content/QARAC/scripts.py in train_models(path, progress)
    143         epoch_data = {}
    144         for (batch,(X,Y)) in enumerate(tqdm.tqdm(training_data)):
--> 145             prediction = trainer(X['all_text'],
    146                                  X['offset_text'],
    147                                  X['question'],

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/content/QARAC/qarac/models/QaracTrainerModel.py in forward(self, all_text, offset_text, question, answer, proposition0, proposition1, conclusion_offset, statement0, statement1)
     88 
     89         """
---> 90         encode_decode = self.decoder((self.answer_encoder(all_text),
     91                                       offset_text))
     92         question_answering = self.question_encoder(question) - self.answer_encoder(answer)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/content/QARAC/qarac/models/QaracEncoderModel.py in forward(self, input_ids, attention_mask)
     50         if attention_mask is None and 'attention_mask' in input_ids:
     51             (input_ids,attention_mask) = (input_ids['input_ids'],input_ids['attention_mask'])
---> 52         return self.head(self.encoder(input_ids,
     53                                       attention_mask).last_hidden_state,
     54                          attention_mask)

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/content/QARAC/qarac/models/layers/GlobalAttentionPoolingHead.py in forward(self, X, attention_mask)
     58         Xa = X*attention_mask
     59         sigma = torch.sum(Xa,dim=1,keepdim=True)
---> 60         gp = self.global_projection(sigma)
     61         lp = self.local_projection(Xa)
     62 

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/content/QARAC/qarac/models/layers/FactorizedMatrixMultiplication.py in forward(self, X)
     21 
     22     def forward(self,X):
---> 23         return torch.einsum('ij,klj->kli',self.matrix,X)

/usr/local/lib/python3.10/dist-packages/torch/functional.py in einsum(*args)
    328         return einsum(equation, *_operands)
    329 
--> 330     return _VF.einsum(equation, operands)  # type: ignore[attr-defined]
    331 
    332 # Wrapper around _histogramdd and _histogramdd_bin_edges needed due to (Tensor, Tensor[]) return type.

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_bmm)


What do I need to do to ensure everything is assigned to the GPU correctly?

Based on the error message, the einsum op fails because one of its inputs is still on the CPU. Make sure all inputs are moved to the GPU before executing the model; my guess is that one of them was never transferred.
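
As a sketch of what that could look like, assuming `training_data` yields `X` as a dict of tokenizer-style tensors (or nested dicts of `input_ids`/`attention_mask`) and `Y` as a tensor or dict of tensors that come back on the CPU; the `move_to_device` helper is mine, not part of QARAC or PyTorch:

import torch

def move_to_device(batch, device):
    # Recursively move tensors (possibly nested in dicts, lists or tuples)
    # onto the target device; non-tensor values are returned unchanged.
    if torch.is_tensor(batch):
        return batch.to(device)
    if isinstance(batch, dict):
        return {key: move_to_device(value, device) for key, value in batch.items()}
    if isinstance(batch, (list, tuple)):
        return type(batch)(move_to_device(value, device) for value in batch)
    return batch

# inside the training loop, before the forward pass
for (batch, (X, Y)) in enumerate(tqdm.tqdm(training_data)):
    X = move_to_device(X, device)
    Y = move_to_device(Y, device)
    prediction = trainer(X['all_text'],
                         X['offset_text'],
                         X['question'],
                         X['answer'],
                         X['proposition0'],
                         X['proposition1'],
                         X['conclusion_offset'],
                         X['statement0'],
                         X['statement1'])
    loss = loss_fn(prediction, Y)
    # ... rest of the loop unchanged

Printing the `.device` of each input (for example `X['all_text']['input_ids'].device`, if the batch entries are tokenizer-style dicts) right before the call is a quick way to confirm which tensor is still on the CPU.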