RuntimeError: Input tensors need to be on the same GPU

I want to train a Hugging Face GPT-2 model on the WikiText dataset. I load the model and the tokenizer, apply the tokenizer to the data, and finally train the model on the tokenized data. I am using the transformer-heads package and the code from this link, which is the following:

  import pdb

  import torch
  from datasets import load_dataset
  from transformers import AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments
  # get_model_params, HeadConfig, load_headed, print_trainable_parameters,
  # DataCollatorWithPadding and evaluate_head_wise come from the transformer-heads
  # package; imported as in its example notebook.

  # GPT2 is the fastest and requires the least memory. However, this works just the same
  # with any Llama or Mistral model; just change model_path to its Hugging Face path.
  model_path = "gpt2"
  train_epochs = 1
  eval_epochs = 1
  logging_steps = 100

  model_params = get_model_params(model_path)
  model_class = model_params["model_class"]
  hidden_size = model_params["hidden_size"]
  vocab_size = model_params["vocab_size"]
  print(model_params)

  # A single extra LM-style head: one linear layer hooked to the hidden states of
  # layer -4, trained with a causal cross-entropy loss over the full vocabulary
  heads_configs = [
      HeadConfig(
          name="wikitext_head",
          layer_hook=-4,  # Hook to layer [-4] (Drop 3 layers from the end)
          in_size=hidden_size,
          num_layers=1,
          output_activation="linear",
          is_causal_lm=True,
          loss_fct="cross_entropy",
          num_outputs=vocab_size,
          is_regression=False,
          output_bias=False,
      )
  ]

  dd = load_dataset("wikitext", "wikitext-2-v1")
  #device
  tokenizer = AutoTokenizer.from_pretrained(model_path)

  # GPT-2's tokenizer has no pad token, so reuse the EOS token for padding
  if tokenizer.pad_token_id is None:
      tokenizer.pad_token = tokenizer.eos_token

  def tokenize_function(examples):
      out = tokenizer(examples["text"], padding=False, truncation=True)
      out[heads_configs[0].name] = out["input_ids"].copy()
      return out.to(device)

  for split in dd.keys():
      dd[split] = dd[split].filter(function=lambda example: len(example["text"]) > 10)
      dd[split] = dd[split].map(tokenize_function, batched=True)

  dd.set_format(type="torch", columns=["input_ids", "attention_mask", heads_configs[0].name])
  
  for split in dd.keys():
      dd[split] = dd[split].remove_columns("text")
  
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      load_in_8bit=False,
      llm_int8_threshold=6.0,
      llm_int8_has_fp16_weight=False,
      bnb_4bit_compute_dtype=torch.float32,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
  )

  model = load_headed(
      model_class,
      model_path,
      head_configs=heads_configs,
      quantization_config=quantization_config,
      device_map=device,
  )
  print_trainable_parameters(model)

  # print(get_top_n_preds(n=5, model=model, text="The historical significance of", tokenizer=tokenizer))
  args = TrainingArguments(
      output_dir="linear_probe_test",
      learning_rate=0.0002,
      num_train_epochs=train_epochs,
      logging_steps=logging_steps,
      do_eval=False,
      remove_unused_columns=False,  # Important to set to False, otherwise things will fail
  )
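  # Pad the head's label column with -100 (the usual ignore index for cross-entropy)
  # so that padded positions should not contribute to the head's loss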
  collator = DataCollatorWithPadding(
      feature_name_to_padding_value={
          "input_ids": tokenizer.pad_token_id,
          heads_configs[0].name: -100,
          "attention_mask": 0,
      }
  )
  trainer = Trainer(
      model,
      args=args,
      train_dataset=dd["train"],
      data_collator=collator
  )

  pdb.set_trace()
  trainer.train()

  print(evaluate_head_wise(model, dd["validation"], collator, epochs=eval_epochs))

When running trainer.train() I receive the following error:

RuntimeError: Input tensors need to be on the same GPU, but found the following tensor and device combinations: [(torch.Size([1, 884736]), device(type='cuda', index=1)), (torch.Size([27648]), device(type='cuda', index=0)), (torch.Size([2304, 768]), device(type='cuda', index=1))]

I guess different parts of the model end up on different GPUs. Why is that a problem? And how can I locate which tensors or modules are running on which GPU?
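
Is something like the sketch below the right way to inspect the placement? It assumes that model is the headed model returned by load_headed in the code above, and that accelerate fills in hf_device_map when a device_map is passed; I am not sure this is the proper way to track the problem down.

  from collections import Counter

  # device map assigned by accelerate, if one was used to place the model
  print(getattr(model, "hf_device_map", None))

  # how many parameters ended up on each device
  print(Counter(str(p.device) for p in model.parameters()))

  # parameters that are not on the same device as the first one
  first_device = next(model.parameters()).device
  for name, param in model.named_parameters():
      if param.device != first_device:
          print(name, tuple(param.shape), param.device)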