RuntimeError: arguments are located on different GPUs

Hi,
I’ve got this error:

Traceback (most recent call last):
  File "run_Olga.py", line 156, in <module>
    output = model(video,audio)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 468, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 123, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 133, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/parallel_apply.py", line 77, in parallel_apply
    raise output
RuntimeError: arguments are located on different GPUs at /home/olga/Downloads/pytorch/aten/src/THC/generated/../generic/THCTensorMathPointwise.cu:313

I tried to fix it following this thread:

in which @ptrblck says it's a 0.4 bug.
After updating, I'm now on torch version 0.5.0a0+6e28d4d, but the error still remains.
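
From what I understand, nn.DataParallel scatters the input batch across the visible GPUs and runs a replica of the model on each of them, so the message usually means some tensor used inside forward() stays pinned to a single device instead of following its replica. Here is a minimal sketch of the kind of pattern that I think can trigger it (module and tensor names are made up, not taken from my model):

import torch
import torch.nn as nn

class PinnedTensor(nn.Module):
    def __init__(self):
        super(PinnedTensor, self).__init__()
        self.conv = nn.Conv2d(3, 3, 1)
        # Plain attribute created directly on cuda:0; it is neither a Parameter
        # nor a registered buffer, so DataParallel never copies it to the other GPUs
        self.mask = torch.ones(1).cuda()

    def forward(self, x):
        # On the replica running on cuda:1, x lives on cuda:1 while self.mask
        # is still on cuda:0 -> "arguments are located on different GPUs"
        return self.conv(x) * self.mask

model = nn.DataParallel(PinnedTensor()).cuda()
out = model(torch.randn(4, 3, 8, 8).cuda())   # fails only when more than one GPU is visible

I don't know yet whether something like this is hidden inside SoP.SoP_model, though.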

My original code is:

from __future__ import division
import time
import logging
import sys
import os
sys.path.insert(0, './drn_MOD')
sys.path.insert(0, './globalnet')
sys.path.insert(0, './Unet')
import torch
import data
import numpy as np
import SoP as SoP
import math
#from tensorboardX import SummaryWriter
import torch.nn as nn

filename = './test_run_bs6'
if not os.path.exists(filename):
    os.makedirs(filename)
"""============================LOG CONFIG & SCORE CONFIG PART================================="""
FORMAT = "[%(filename)s: %(funcName)s] %(message)s"
FORMAT2 = "[%(asctime)-15s %(filename)s:%(lineno)d %(funcName)s] %(message)s"
log_format = logging.Formatter(FORMAT2)
disp_format = logging.Formatter(FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

file_handler = logging.FileHandler(filename+'/log_file.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(log_format)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(disp_format)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self,hist = False):
        self.track_hist = hist
        self.reset()
        
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        if self.track_hist:
            self.hist = []

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count 
        if self.track_hist:
            self.hist.append(val)



"""============================MODEL PARAMETERS CONFIG================================="""



traindir = '../dataset/dic_dataset'
valdir = '../dataset/video'
ade_dir = ['../dataset/ade/ade_binaries','../dataset/ade/ade_binaries2']
BATCH_SIZE = 6
MOMENTUM = 1e-4
WEIGHT_DECAY = 1e-4
EPOCHS = 50
S_EPOCH = 0
STEP_RATIO = 0.1
LR = 0.001
PRINT_FREQ = 10
N = 2
GT_MASK = 'Binary'
Pretrained = False
CUDA = True
filename = 'test_run_bs6'


"""============================DATABASE, OPTIMIZER AND LOSS================================="""   
CUDA = CUDA and torch.cuda.is_available()
#Set database
database = data.BinaryData(traindir,data.DRN_transforms(),ade_dir)
iterations = int(math.floor(database.__len__()/(N*BATCH_SIZE)))
#Set dataloader


#Set model
model = SoP.SoP_model(True,cuda=CUDA,n_images=N,GT_MASK=GT_MASK)
optimizer = torch.optim.SGD([{'params': model.unet_model.parameters()},{'params': model.audio_s.parameters()}, {'params': model.drn_model.parameters(), 'lr': 1e-4}], LR,
                            momentum=MOMENTUM,
                            weight_decay=WEIGHT_DECAY)

def init_weights(m):
    if type(m) == nn.Conv2d:
        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('conv2d'))


if Pretrained:
    model.load_state_dict(torch.load('model_dic.pt'))
#    model = torch.load('convergence.pt')
else:
    model.apply(init_weights)

if CUDA:
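    # nn.DataParallel replicates the model on every visible GPU and scatters the
    # batch along dim 0; .cuda() moves the parameters and buffers to cuda:0 first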
    model = torch.nn.DataParallel(model).cuda()

# define loss function (criterion) and optimizer
if CUDA:
    if (GT_MASK == 'Ratio'):
        criterion = torch.nn.L1Loss().cuda()
    if (GT_MASK == 'Binary') :
        criterion = torch.nn.BCELoss().cuda()
#        criterion = torch.nn.BCEWithLogitsLoss()
else:
    if (GT_MASK == 'Ratio') :
        criterion = torch.nn.L1Loss()
    if (GT_MASK == 'Binary') :
        criterion = torch.nn.BCEWithLogitsLoss()


batch_time = AverageMeter()
data_time = AverageMeter()
batch_loss = AverageMeter(hist =True)
epoch_loss = AverageMeter(hist = True)
model.train()
end = time.time()

"""============================TRAINING PART================================="""
for t in range(EPOCHS):
    loader = data.DataLoader(database,N,batch_size=BATCH_SIZE,Gt_mask=GT_MASK)
    for j in range(iterations):
        # Forward pass: compute predicted y by passing x to the model. Module objects
        # override the __call__ operator so you can call them like functions. When
        # doing so you pass a Tensor of input data to the Module and it produces
        # a Tensor of output data.
        audio,video,gt = loader()
        data_time.update(time.time() - end)
        if CUDA:
            gt=torch.autograd.Variable(gt.cuda())
            video=torch.autograd.Variable(video.cuda())
#            audio = torch.autograd.Variable(audio)
            audio = torch.autograd.Variable(audio.cuda())

        else:
            gt=torch.autograd.Variable(gt)
            video=torch.autograd.Variable(video)
            audio = torch.autograd.Variable(audio)
        output = model(video,audio)
        if (GT_MASK == 'Ratio') or (GT_MASK == 'Ratio_noL10'):
            loss = criterion(output, gt.float())
        if (GT_MASK == 'Binary') or (GT_MASK == 'BinaryComplex'):
            loss = criterion(output, gt.float())
    
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        batch_loss.update(loss.data.item())
        batch_time.update(time.time() - end)
        end = time.time()
        
        logger.info('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
        t, j, iterations, batch_time=batch_time,
        data_time=data_time, loss=batch_loss))
        epoch_loss.update(loss.data.item())

torch.save(model.state_dict(), filename+'/state_dic.pt')

Does anyone know why this happens?

Could you create a smaller runnable code example so that I could try it on my machine?
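
Even a skeleton along the following lines would help, i.e. a dummy module standing in for SoP.SoP_model and random tensors instead of your data loader, keeping only the DataParallel call path. Everything below is a placeholder (names, shapes), not your actual model:

import torch
import torch.nn as nn

# Dummy stand-in for SoP.SoP_model; swap in the smallest piece of the real
# module that still reproduces the error
class TinySoP(nn.Module):
    def __init__(self):
        super(TinySoP, self).__init__()
        self.video_branch = nn.Conv2d(3, 8, 3, padding=1)
        self.audio_branch = nn.Linear(128, 8)

    def forward(self, video, audio):
        v = self.video_branch(video)                              # (B, 8, H, W)
        a = self.audio_branch(audio).unsqueeze(-1).unsqueeze(-1)  # (B, 8, 1, 1)
        return torch.sigmoid(v + a)

model = nn.DataParallel(TinySoP()).cuda()
video = torch.randn(6, 3, 32, 32).cuda()
audio = torch.randn(6, 128).cuda()
output = model(video, audio)    # does this minimal version already raise the error?
print(output.shape)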