RuntimeError: CUDA runtime error ("arguments are located on different GPUs")

The error occurs when running the following code:

 class_weights = xview_class_weights_hard_mining(range(60)).to(device)
    class_weights2 = xview_class_weights_hard_mining(range(60)).to(device2)

    for epoch in range(opt.epochs):
        epoch += start_epoch

        # img_size = random.choice([19, 20, 21, 22, 23, 24, 25]) * 32
        # dataloader = ListDataset(train_path, batch_size=opt.batch_size, img_size=img_size, targets_path=targets_path)
        # print('Running image size %g' % img_size)

        # Update scheduler
        # if epoch % 25 == 0:
        #     scheduler.last_epoch = -1  # for cosine annealing, restart every 25 epochs
        # scheduler.step()
        # if epoch <= 100:
        # for g in optimizer.param_groups:
        # g['lr'] = 0.0005 * (0.992 ** epoch)  # 1/10 th every 250 epochs
        # g['lr'] = 0.001 * (0.9773 ** epoch)  # 1/10 th every 100 epochs
        # g['lr'] = 0.0005 * (0.955 ** epoch)  # 1/10 th every 50 epochs
        # g['lr'] = 0.0005 * (0.926 ** epoch)  # 1/10 th every 30 epochs

        ui = -1
        rloss = defaultdict(float)  # running loss
        metrics = torch.zeros(4, 60)
        for i, (imgs, targets) in enumerate(dataloader):
            n = 4  # number of pictures at a time
            for j in range(int(len(imgs) / n)):
                targets_j = targets[j * n:j * n + n]
                nGT = sum([len(x) for x in targets_j])
                if nGT < 1:
                    continue

               # print(torch.cuda.memory_allocated())
                loss = model(imgs[j * n:j * n + n].to(device), targets_j, requestPrecision=True,
                             weight=class_weights, epoch=epoch)

                loss2 = model2(imgs[j * n:j * n + n].to(device2), targets_j, requestPrecision=True,
                             weight=class_weights2, epoch=epoch)

I am getting the following error:

Traceback (most recent call last):
  File "traingpu.py", line 234, in <module>
    main(opt)
  File "traingpu.py", line 148, in main
    weight=class_weights2, epoch=epoch)
  File "/sdcc/u/amalik/.local/lib/python3.5/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/gpfshome01/u/amalik/OGA/Yolov3/xview-yolov3/models.py", line 234, in forward
    x, *losses = module[0](x, targets, requestPrecision, weight, epoch)
  File "/sdcc/u/amalik/.local/lib/python3.5/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/gpfshome01/u/amalik/OGA/Yolov3/xview-yolov3/models.py", line 128, in forward
    width = ((w.data * 2) ** 2) * self.anchor_w
RuntimeError: arguments are located on different GPUs at /pytorch/aten/src/THC/generated/../generic/THCTensorMathPointwise.cu:314

I have two GPUs, and I want to run two independent models, one on each GPU.

I think the targets are not being sent to the two GPUs correctly: the same targets (`targets_j`) are passed to both models, without being moved to either device.