I’m trying to train a suite of models on Google Colab. My training function looks like this:
def train_models(path, progress=gradio.Progress(track_tqdm=True)):
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
    trainer = qarac.models.QaracTrainerModel.QaracTrainerModel('roberta-base',
                                                               tokenizer)
    trainer.to(device)
    loss_fn = CombinedLoss()
    loss_fn.cuda()
    optimizer = torch.optim.NAdam(trainer.parameters(), lr=5.0e-5)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    training_data = qarac.corpora.CombinedCorpus.CombinedCorpus(tokenizer,
                                                                all_text='corpora/all_text.csv',
                                                                question_answering='corpora/question_answering.csv',
                                                                reasoning='corpora/reasoning_train.csv',
                                                                consistency='corpora/consistency.csv',
                                                                device=device)
    n_batches = len(training_data)
    history = {}
    for epoch in range(25):
        print("Epoch", epoch)
        epoch_label = 'Epoch {}'.format(epoch)
        epoch_data = {}
        for (batch, (X, Y)) in enumerate(tqdm.tqdm(training_data)):
            prediction = trainer(X['all_text'],
                                 X['offset_text'],
                                 X['question'],
                                 X['answer'],
                                 X['proposition0'],
                                 X['proposition1'],
                                 X['conclusion_offset'],
                                 X['statement0'],
                                 X['statement1'])
            loss = loss_fn(prediction, Y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if batch % 1024 == 0 or batch == n_batches - 1:
                epoch_data[batch] = loss.item()
        history[epoch_label] = epoch_data
        scheduler.step()
    huggingface_hub.login(token=userdata.get('HUGGINGFACE_TOKEN'))
    trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
    trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
    trainer.decoder.push_to_hub('{}/qarac-roberta-decoder'.format(path))
    return history
To see the models in more detail, see QARAC on GitHub.
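Since the whole point is to run everything on the GPU, one check I could run right after trainer.to(device) is to list where each registered parameter and buffer ends up (a minimal sketch; report_devices is just a throwaway helper, not part of the QARAC code):

import torch

def report_devices(module: torch.nn.Module) -> None:
    # Print the device of every parameter and buffer PyTorch knows about.
    # Anything stored as a plain tensor attribute will not appear here,
    # and will not be moved by .to(device) either.
    for name, param in module.named_parameters():
        print('parameter', name, param.device)
    for name, buf in module.named_buffers():
        print('buffer   ', name, buf.device)

# report_devices(trainer)  # run after trainer.to(device)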
When I try to run it, I get the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-4-56b95fe40192> in <cell line: 1>()
----> 1 scripts.train_models('PlayfulTechnology')
9 frames
/content/QARAC/scripts.py in train_models(path, progress)
143 epoch_data = {}
144 for (batch,(X,Y)) in enumerate(tqdm.tqdm(training_data)):
--> 145 prediction = trainer(X['all_text'],
146 X['offset_text'],
147 X['question'],
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
/content/QARAC/qarac/models/QaracTrainerModel.py in forward(self, all_text, offset_text, question, answer, proposition0, proposition1, conclusion_offset, statement0, statement1)
88
89 """
---> 90 encode_decode = self.decoder((self.answer_encoder(all_text),
91 offset_text))
92 question_answering = self.question_encoder(question) - self.answer_encoder(answer)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
/content/QARAC/qarac/models/QaracEncoderModel.py in forward(self, input_ids, attention_mask)
50 if attention_mask is None and 'attention_mask' in input_ids:
51 (input_ids,attention_mask) = (input_ids['input_ids'],input_ids['attention_mask'])
---> 52 return self.head(self.encoder(input_ids,
53 attention_mask).last_hidden_state,
54 attention_mask)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
/content/QARAC/qarac/models/layers/GlobalAttentionPoolingHead.py in forward(self, X, attention_mask)
58 Xa = X*attention_mask
59 sigma = torch.sum(Xa,dim=1,keepdim=True)
---> 60 gp = self.global_projection(sigma)
61 lp = self.local_projection(Xa)
62
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
1112 full_backward_hooks, non_full_backward_hooks = [], []
/content/QARAC/qarac/models/layers/FactorizedMatrixMultiplication.py in forward(self, X)
21
22 def forward(self,X):
---> 23 return torch.einsum('ij,klj->kli',self.matrix,X)
/usr/local/lib/python3.10/dist-packages/torch/functional.py in einsum(*args)
328 return einsum(equation, *_operands)
329
--> 330 return _VF.einsum(equation, operands) # type: ignore[attr-defined]
331
332 # Wrapper around _histogramdd and _histogramdd_bin_edges needed due to (Tensor, Tensor[]) return type.
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_bmm)
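The traceback points at torch.einsum('ij,klj->kli', self.matrix, X) inside FactorizedMatrixMultiplication, with one operand on cpu and the other on cuda:0. My guess is that self.matrix is stored as a plain tensor attribute rather than registered as a parameter or buffer, so trainer.to(device) never moves it. A hypothetical minimal sketch of that failure mode (not the actual QARAC class):

import torch

class FactorizedMatrixMultiplicationSketch(torch.nn.Module):
    # Reduced, illustrative version of the layer named in the traceback.
    def __init__(self, size):
        super().__init__()
        # A plain tensor attribute is invisible to .to()/.cuda(),
        # so it stays on the CPU when the parent module is moved.
        self.matrix = torch.randn(size, size)
        # Registering it instead would let .to(device) move it, e.g.:
        # self.matrix = torch.nn.Parameter(torch.randn(size, size))

    def forward(self, X):
        # Raises "Expected all tensors to be on the same device" when X
        # is on cuda:0 but self.matrix is still on the CPU.
        return torch.einsum('ij,klj->kli', self.matrix, X)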
What do I need to do to ensure everything is assigned to the GPU correctly?