for epoch in range(initial_epoch, max_epochs):
    # One full pass over the dataset; timed per epoch.
    start = time.time()
    total_loss = 0.0  # plain Python float accumulator — see note at the += below
    for batch, sample_batched in enumerate(dataloader):
        # Unpack the batch. NOTE: torch.autograd.Variable is a deprecated no-op
        # wrapper in modern PyTorch; kept here for compatibility with the rest
        # of the file.
        batch_RGBsT = Variable(sample_batched['batch_RGBsT'])
        batch_trimapsT = Variable(sample_batched['batch_trimapsT'])
        batch_alphasT = Variable(sample_batched['batch_alphasT'])
        batch_BGsT = Variable(sample_batched['batch_BGsT'])
        batch_FGsT = Variable(sample_batched['batch_FGsT'])
        RGBs_with_meanT = Variable(sample_batched['RGBs_with_meanT'])
        if USE_CUDA:
            batch_RGBsT = batch_RGBsT.cuda()
            batch_trimapsT = batch_trimapsT.cuda()
            batch_alphasT = batch_alphasT.cuda()
            batch_BGsT = batch_BGsT.cuda()
            batch_FGsT = batch_FGsT.cuda()
            RGBs_with_meanT = RGBs_with_meanT.cuda()

        # Initialize gradients for this step.
        optimizer.zero_grad()
        # Network input: RGB image concatenated with its trimap along channels.
        b_input = torch.cat((batch_RGBsT, batch_trimapsT), 1)
        # Forward pass returns the per-sample alpha-matting loss.
        alpha_loss = model(b_input, batch_alphasT, batch_trimapsT)
        alpha_loss = alpha_loss.mean()
        alpha_loss.backward()

        # BUG FIX (answers the memory question): the original code did
        #     total_loss += alpha_loss
        # which accumulates the loss *tensor*, not its value. Each += builds
        # an autograd addition node referencing every previous per-batch loss
        # tensor, so none of them (nor their CUDA buffers) can be freed and
        # memory grows monotonically across batches/epochs until the job's
        # memory limit is exceeded. .item() extracts a detached Python float,
        # letting each batch's graph and tensors be released immediately.
        total_loss += alpha_loss.item()
        optimizer.step()

        print_freq = 1000
        # Report the running average every print_freq batches (skip batch 0),
        # then restart the window.
        if batch % print_freq == 0 and not batch == 0:
            print('Epoch:', epoch, 'Batch:', batch, 'Loss:', total_loss / float(print_freq))
            total_loss = 0.0

    #test()
    # NOTE(review): at epoch end total_loss holds only the batches since the
    # last reset, yet is divided by the full print_freq window — the recorded
    # 'best_prec1' is therefore an approximation. Preserved as-is to keep the
    # checkpoint contract unchanged; confirm whether an exact epoch average
    # is intended.
    is_best = best_prec1 > total_loss / float(print_freq)
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_prec1': total_loss / float(print_freq),
        'optimizer': optimizer.state_dict(),
    }, is_best)
    end = time.time()
    print('Time for 1 epoch: ', end - start)
The code above runs over a set of 43,100 images with a batch size of 16, but once an epoch completes it fails with a "job memory limit exceeded" error. If the code can successfully process all 43,100 images in the first epoch, what could cause it to exceed the job's memory limit in the second epoch?