Hi,
I’ve got this error:
Traceback (most recent call last):
File "run_Olga.py", line 156, in <module>
output = model(video,audio)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 468, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 123, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/data_parallel.py", line 133, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/usr/local/lib/python2.7/dist-packages/torch/nn/parallel/parallel_apply.py", line 77, in parallel_apply
raise output
RuntimeError: arguments are located on different GPUs at /home/olga/Downloads/pytorch/aten/src/THC/generated/../generic/THCTensorMathPointwise.cu:313
I tried to fix it by following this thread,
in which @ptrblck says it is a bug in version 0.4.
After updating, I’m using torch version 0.5.0a0+6e28d4d,
but the error still remains.
Original code is:
from __future__ import division
import time
import logging
import sys
import os
sys.path.insert(0, './drn_MOD')
sys.path.insert(0, './globalnet')
sys.path.insert(0, './Unet')
import torch
import data
import numpy as np
import SoP as SoP
import math
#from tensorboardX import SummaryWriter
import torch.nn as nn
# Output directory for the log file and checkpoints; created on first run.
filename = './test_run_bs6'
if not os.path.exists(filename):
    os.makedirs(filename)

"""============================LOG CONFIG & SCORE CONFIG PART================================="""
# Two formats: a compact one for the console, a verbose one (with timestamp
# and line number) for the log file.
FORMAT = "[%(filename)s: %(funcName)s] %(message)s"
FORMAT2 = "[%(asctime)-15s %(filename)s:%(lineno)d %(funcName)s] %(message)s"

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# File handler: INFO and above, verbose format.
file_handler = logging.FileHandler(filename + '/log_file.log')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(logging.Formatter(FORMAT2))
logger.addHandler(file_handler)

# Console handler: inherits the logger's DEBUG level, compact format.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter(FORMAT))
logger.addHandler(stream_handler)
class AverageMeter(object):
    """Tracks a running average of a scalar value.

    Keeps the most recent value, the running sum, the sample count and the
    mean; when constructed with ``hist=True`` it also records every update
    in ``self.hist``.
    """

    def __init__(self, hist=False):
        self.track_hist = hist
        self.reset()

    def reset(self):
        """Clear all statistics (and the history, when tracked)."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        if self.track_hist:
            self.hist = []

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += n * val
        self.avg = self.sum / self.count
        if self.track_hist:
            self.hist.append(val)
"""============================MODEL PARAMETERS CONFIG================================="""
traindir = '../dataset/dic_dataset'
valdir = '../dataset/video'
ade_dir = ['../dataset/ade/ade_binaries','../dataset/ade/ade_binaries2']
BATCH_SIZE = 6
MOMENTUM = 1e-4
WEIGTH_DECAY = 1e-4
EPOCHS = 50
S_EPOCH = 0
STEP_RATIO = 0.1
LR = 0.001
PRINT_FREQ = 10
N= 2
GT_MASK='Binary'
Pretrained = False
CUDA =True
filename = 'test_run_bs6'
"""============================DATABASE, OPTIMIZER AND LOSS================================="""
if torch.cuda.is_available() == True:
CUDA=CUDA
else:
CUDA=False
#Set database
database = data.BinaryData(traindir,data.DRN_transforms(),ade_dir)
iterations = int(math.floor(database.__len__()/(N*BATCH_SIZE)))
#Set dataloader
#Set model
model = SoP.SoP_model(True,cuda=CUDA,n_images=N,GT_MASK=GT_MASK)
optimizer = torch.optim.SGD([{'params': model.unet_model.parameters()},{'params': model.audio_s.parameters()}, {'params': model.drn_model.parameters(), 'lr': 1e-4}], LR,
momentum=MOMENTUM,
weight_decay=WEIGTH_DECAY)
def init_weights(m):
    """Xavier-initialize the weights of every Conv2d module.

    Intended for ``model.apply(init_weights)``, which invokes it on each
    submodule; non-Conv2d modules are left untouched.
    """
    # isinstance (rather than `type(m) == nn.Conv2d`) also covers Conv2d
    # subclasses and is the idiomatic type check.
    if isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('conv2d'))
# Either resume from a saved state dict or start from Xavier-initialized
# convolutions.
if Pretrained:
    model.load_state_dict(torch.load('model_dic.pt'))
else:
    model.apply(init_weights)

if CUDA:
    # DataParallel replicates the model across all visible GPUs.
    # NOTE(review): the "arguments are located on different GPUs" error
    # usually means some submodule/parameter was placed on a non-default
    # GPU before wrapping (e.g. by SoP_model's own cuda handling) -- all
    # parameters must sit on the default device when DataParallel scatters.
    model = torch.nn.DataParallel(model).cuda()
# Define the loss function (criterion): L1 for ratio masks, BCE for
# binary masks.
if CUDA:
    if GT_MASK == 'Ratio':
        criterion = torch.nn.L1Loss().cuda()
    if GT_MASK == 'Binary':
        criterion = torch.nn.BCELoss().cuda()
else:
    if GT_MASK == 'Ratio':
        criterion = torch.nn.L1Loss()
    if GT_MASK == 'Binary':
        # BUG FIX: the class was assigned without being instantiated
        # (missing parentheses), which would crash at criterion(output, gt).
        # NOTE(review): the CPU path uses BCEWithLogitsLoss while the CUDA
        # path uses BCELoss -- these expect logits vs. probabilities
        # respectively, so one of the two is likely wrong; confirm against
        # whether SoP_model's output has a sigmoid applied.
        criterion = torch.nn.BCEWithLogitsLoss()
# Running meters for timing and loss; the loss meters keep full history.
batch_time = AverageMeter()
data_time = AverageMeter()
batch_loss = AverageMeter(hist =True)
epoch_loss = AverageMeter(hist = True)
# Put the model in training mode (affects dropout/batch-norm behavior).
model.train()
end = time.time()
"""============================TRAINING PART================================="""
for t in range(EPOCHS):
loader = data.DataLoader(database,N,batch_size=BATCH_SIZE,Gt_mask=GT_MASK)
for j in range(iterations):
# Forward pass: compute predicted y by passing x to the model. Module objects
# override the __call__ operator so you can call them like functions. When
# doing so you pass a Tensor of input data to the Module and it produces
# a Tensor of output data.
audio,video,gt = loader()
data_time.update(time.time() - end)
if CUDA:
gt=torch.autograd.Variable(gt.cuda())
video=torch.autograd.Variable(video.cuda())
# audio = torch.autograd.Variable(audio)
audio = torch.autograd.Variable(audio.cuda())
else:
gt=torch.autograd.Variable(gt)
video=torch.autograd.Variable(video)
audio = torch.autograd.Variable(audio)
output = model(video,audio)
if (GT_MASK == 'Ratio') or (GT_MASK == 'Ratio_noL10'):
loss = criterion(output, gt.float())
if (GT_MASK == 'Binary') or (GT_MASK == 'BinaryComplex'):
loss = criterion(output, gt.float())
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
batch_loss.update(loss.data.item())
batch_time.update(time.time() - end)
end = time.time()
logger.info('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
t, j, iterations, batch_time=batch_time,
data_time=data_time, loss=batch_loss))
epoch_loss.update(loss.data.item())
torch.save(model.state_dict(), filename+'/state_dic.pt')
Does anyone know why this happens?