I am experiencing the same issue, mentioned in this post
Here is my code :
net.train()
current_lr = config['learning_rate']
print('Loading Dataset...')
mot_root_temp = config['mot_root'] #'/home/vca_shounak/VCA/SST/data/MOT17/'
dataset = MyTrainDataSet(mot_root_temp,
SSJAugmentation(
sst_dim, means
)
)
print("dataset len ==> {} ".format(len(dataset)))
epoch_size = len(dataset) // args.batch_size
print('Training SSJ on', dataset.dataset_name)
step_index = 0
batch_iterator = None
print("batch_size = {} and epoch length = {} ".format(batch_size,epoch_size))
data_loader = data.DataLoader(dataset, batch_size,
num_workers=args.num_workers,
shuffle=True,
collate_fn=collate_fn,
pin_memory=False)
print("start_iter = {} , max_iter = {} ".format(args.start_iter,max_iter))
#batch_counter = 0
#with torch.no_grad():
for iteration in range(args.start_iter, max_iter):
if (not batch_iterator) or (iteration % epoch_size == 0):
# create batch iterator
batch_iterator = iter(data_loader)
print("batch_iterator = {} ".format(batch_iterator))
all_epoch_loss = []
if iteration in stepvalues:
step_index += 1
current_lr = adjust_learning_rate(optimizer, args.gamma, step_index)
# load train data
img_pre, img_next, boxes_pre, boxes_next, labels, valid_pre, valid_next=next(batch_iterator)
if args.cuda:
img_pre = Variable(img_pre.cuda())
img_next = Variable(img_next.cuda())
boxes_pre = Variable(boxes_pre.cuda())
boxes_next = Variable(boxes_next.cuda())
valid_pre = Variable(valid_pre.cuda(),volatile=True)
valid_next = Variable(valid_next.cuda(),volatile=True)
labels = Variable(labels.cuda(),volatile=True)
else:
img_pre = Variable(img_pre)
img_next = Variable(img_next)
boxes_pre = Variable(boxes_pre)
boxes_next = Variable(boxes_next)
valid_pre = Variable(valid_pre)
valid_next = Variable(valid_next)
labels = Variable(labels, volatile=True)
# forward
t0 = time.time()
out = net(img_pre, img_next, boxes_pre, boxes_next, valid_pre, valid_next)
optimizer.zero_grad()
loss_pre, loss_next, loss_similarity, loss, accuracy_pre, accuracy_next, accuracy, predict_indexes = criterion(out, labels, valid_pre, valid_next)
loss.backward()
optimizer.step()
t1 = time.time()
all_epoch_loss += [loss.data.cpu()] #.cpu()
if iteration % 10 == 0:
print('Timer: %.4f sec.' % (t1 - t0))
print('iter ' + repr(iteration) + ', ' + repr(epoch_size) + ' || epoch: %.4f ' % (iteration/(float)(epoch_size)) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ')
if args.tensorboard:
if len(all_epoch_loss) > 30:
writer.add_scalar('data/epoch_loss', float(np.mean(all_epoch_loss)), iteration)
writer.add_scalar('data/learning_rate', current_lr, iteration)
writer.add_scalar('loss/loss', loss.data.cpu(), iteration) #.cpu()
writer.add_scalar('loss/loss_pre', loss_pre.data.cpu(), iteration) #.cpu()
writer.add_scalar('loss/loss_next', loss_next.data.cpu(), iteration) #.cpu()
writer.add_scalar('loss/loss_similarity', loss_similarity.data.cpu(), iteration) #.cpu()
writer.add_scalar('accuracy/accuracy', accuracy.data.cpu(), iteration) #.cpu()
writer.add_scalar('accuracy/accuracy_pre', accuracy_pre.data.cpu(), iteration) #.cpu()
writer.add_scalar('accuracy/accuracy_next', accuracy_next.data.cpu(), iteration) #.cpu()
# add weights
if iteration % 1000 == 0:
for name, param in net.named_parameters():
writer.add_histogram(name, param.clone().cpu().data.numpy(), iteration) #.cpu()
# add images
if args.send_images and iteration % 1000 == 0:
result_image = show_batch_circle_image(img_pre, img_next, boxes_pre, boxes_next, valid_pre, valid_next, predict_indexes, iteration)
writer.add_image('WithLabel/ImageResult', vutils.make_grid(result_image, nrow=2, normalize=True, scale_each=True), iteration)
if iteration % save_weights_iteration == 0:
print('Saving state, iter:', iteration)
torch.save(sst_net.state_dict(),
os.path.join(
args.save_folder,
'sst300_0712_' + repr(iteration) + '.pth'))
torch.save(sst_net.state_dict(), args.save_folder + '' + args.version + '.pth')
And here is the full exception stacktrace :
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f42550638d0>>
Traceback (most recent call last):
File "/home/shounak/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
self._shutdown_workers()
File "/home/shounak/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
self.worker_result_queue.get()
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/queues.py", line 337, in get
return _ForkingPickler.loads(res)
File "/home/shounak/anaconda3/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
fd = df.detach()
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/connection.py", line 493, in Client
answer_challenge(c, authkey)
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/connection.py", line 737, in answer_challenge
response = connection.recv_bytes(256) # reject large message
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/shounak/anaconda3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Can anyone please help me ?