Here is the corrected code:
# Training loop with gradient accumulation: each outer iteration runs M
# forward/backward passes and then applies a single optimizer step.
# Relies on num_iters, M, net, criterion, optimizer and train_loader being
# defined earlier in the script.
for i in range(num_iters):
    # Gradients are cleared once per outer iteration, so the M backward
    # passes below add up before the step. The loss is not scaled by 1/M,
    # so the stepped gradient is the sum over the M mini-batches.
    optimizer.zero_grad()
    batch_loss_value = 0
    for m in range(M):
        # NOTE(review): assumes train_loader is an iterator-like object that
        # exposes .next() and yields (images, labels, indices) triples; a
        # plain DataLoader would need iter()/next() instead -- confirm.
        (images, labels, indices) = train_loader.next()
        outputs = net(Variable(images.cuda()))
        loss = criterion(outputs, Variable(labels.cuda()))
        loss.backward()
        # .item() extracts the loss as a Python float; calling
        # .numpy()[0] on a tensor that requires grad does not work.
        batch_loss_value += loss.item()
    optimizer.step()
    # Mean loss over the M mini-batches of this outer iteration.
    batch_loss_value = batch_loss_value / M