Differences between gradients calculated by different reduction methods

I’m playing with the different reduction modes provided by the built-in loss functions. In particular, I would like to compare the following four ways of computing the average gradient over a batch.

  • The average gradient obtained by running a backward pass for each per-sample loss computed with reduction="none", then averaging the per-sample gradients
  • The gradient computed with reduction="sum", divided by the batch size
  • The average gradient yielded by reduction="mean"
  • The average gradient computed with reduction="mean", with the loss evaluated on one output/target pair at a time
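
Analytically, I would expect all four to agree up to floating-point error: reduction="mean" is just reduction="sum" divided by the batch size, and by linearity of the gradient, the gradient of the averaged loss equals the average of the per-sample gradients. As a sanity check of that identity, here is a minimal sketch on a toy single-layer model (the model and data below are made up for illustration, not my actual setup):

import torch
import torch.nn as nn

torch.manual_seed(0)

# Toy model and batch, purely for illustration.
model = nn.Linear(10, 3)
x = torch.randn(4, 10)
y = torch.randint(0, 3, (4,))

# Gradient of the batch-averaged loss (reduction="mean").
model.zero_grad()
nn.CrossEntropyLoss(reduction="mean")(model(x), y).backward()
grad_mean = model.weight.grad.clone()

# Average of the per-sample gradients (reduction="none").
losses = nn.CrossEntropyLoss(reduction="none")(model(x), y)
per_sample = []
for loss in losses:
	model.zero_grad()
	loss.backward(retain_graph=True)
	per_sample.append(model.weight.grad.clone())
grad_avg = torch.stack(per_sample).mean(dim=0)

print(torch.max(torch.abs(grad_mean - grad_avg)))  # I expect ~0 (up to float error)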

My code for this experiment is as follows:

import numpy as np
import torch
import torch.nn as nn


def estimate_gradient(model, optimizer, batch):
	# One criterion per reduction mode under comparison.
	criterion_no_reduction = nn.CrossEntropyLoss(reduction="none").cuda()
	criterion_sum = nn.CrossEntropyLoss(reduction="sum").cuda()
	criterion_avg = nn.CrossEntropyLoss().cuda()

	input, target = batch
	input, target = input.cuda(), target.cuda()
	output = model(input)
	n = len(output)

	# Method 1: reduction="none" -- backprop each per-sample loss separately,
	# collect the per-parameter gradients, then average across the batch.
	loss_no_reduction = criterion_no_reduction(output, target)
	grad_list_no_reduction = []
	for i in range(n):
		optimizer.zero_grad()
		loss_no_reduction[i].backward(retain_graph=True)
		for j, param in enumerate(model.parameters()):
			if param.requires_grad:
				grad = param.grad.view(-1, 1)
				if i == 0:
					grad_list_no_reduction.append(grad)
				else:
					grad_list_no_reduction[j] = torch.cat((grad_list_no_reduction[j], grad), dim=1)
	grad_out_no_reduction = torch.cat(grad_list_no_reduction, dim=0)
	grad_out_no_reduction = (torch.sum(grad_out_no_reduction, dim=1) / n).cpu().detach().numpy().flatten()

	# Method 2: reduction="sum" -- backprop the summed loss once, then divide
	# the gradient by the batch size.
	loss_sum = criterion_sum(output, target)
	optimizer.zero_grad()
	loss_sum.backward(retain_graph=True)
	for j, param in enumerate(model.parameters()):
		if param.requires_grad:
			if j == 0:
				grad_list_sum = param.grad.view(-1)
			else:
				grad_list_sum = torch.cat((grad_list_sum, param.grad.view(-1)))
	grad_out_sum = (grad_list_sum / n).cpu().detach().numpy().flatten()

	# Method 3: reduction="mean" (the default) -- use the gradient as-is.
	loss_avg = criterion_avg(output, target)
	optimizer.zero_grad()
	loss_avg.backward(retain_graph=True)
	for j, param in enumerate(model.parameters()):
		if param.requires_grad:
			if j == 0:
				grad_list_avg = param.grad.view(-1)
			else:
				grad_list_avg = torch.cat((grad_list_avg, param.grad.view(-1)))
	grad_out_avg = grad_list_avg.cpu().detach().numpy().flatten()

	# Method 4: reduction="mean" on one output/target pair at a time, then
	# average the per-sample gradients.
	target = target.view(-1, 1)
	grad_list_one_by_one = []
	for i in range(n):
		optimizer.zero_grad()
		curr_output = output[i].view(1, -1)
		loss = criterion_avg(curr_output, target[i])
		loss.backward(retain_graph=True)
		for j, param in enumerate(model.parameters()):
			if param.requires_grad:
				grad = param.grad.view(-1, 1)
				if i == 0:
					grad_list_one_by_one.append(grad)
				else:
					grad_list_one_by_one[j] = torch.cat((grad_list_one_by_one[j], grad), dim=1)
	grad_out_one_by_one = torch.cat(grad_list_one_by_one, dim=0)
	grad_out_one_by_one = (torch.sum(grad_out_one_by_one, dim=1) / n).cpu().detach().numpy().flatten()
	
	assert grad_out_no_reduction.shape == grad_out_sum.shape == grad_out_avg.shape == grad_out_one_by_one.shape
	print("Maximum discrepancy between reduction = none and sum: {}".format(np.max(np.abs(grad_out_no_reduction - grad_out_sum))))
	print("Maximum discrepancy between reduction = none and avg: {}".format(np.max(np.abs(grad_out_no_reduction - grad_out_avg))))
	print("Maximum discrepancy between reduction = none and one-by-one: {}".format(np.max(np.abs(grad_out_no_reduction - grad_out_one_by_one))))
	print("Maximum discrepancy between reduction = sum and avg: {}".format(np.max(np.abs(grad_out_sum - grad_out_avg))))
	print("Maximum discrepancy between reduction = sum and one-by-one: {}".format(np.max(np.abs(grad_out_sum - grad_out_one_by_one))))
	print("Maximum discrepancy between reduction = avg and one-by-one: {}".format(np.max(np.abs(grad_out_avg- grad_out_one_by_one))))

The results are as follows:

Maximum discrepancy between reduction = none and sum: 0.0316
Maximum discrepancy between reduction = none and avg: 0.0316
Maximum discrepancy between reduction = none and one-by-one: 0.0
Maximum discrepancy between reduction = sum and avg: 0.0
Maximum discrepancy between reduction = sum and one-by-one: 0.0316
Maximum discrepancy between reduction = avg and one-by-one: 0.0316

That is, the gradients produced by reduction="none" and by the one-by-one backward passes appear to be identical, and reduction="sum" agrees with reduction="mean", but the two pairs differ from each other. It would be really helpful if someone could explain this discrepancy (maybe it is due to retain_graph=True?). Thanks in advance for any help!
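
In case it helps narrow things down, this is how I would test the retain_graph hypothesis in isolation (again a toy model for illustration, not my actual setup): backward the same loss twice, once with retain_graph=True and once without, and compare the resulting gradients.

import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Linear(10, 3)
x = torch.randn(4, 10)
y = torch.randint(0, 3, (4,))
loss = nn.CrossEntropyLoss()(model(x), y)

# Backward once with retain_graph=True and record the gradient...
model.zero_grad()
loss.backward(retain_graph=True)
g1 = model.weight.grad.clone()

# ...then backward the same loss again without it and compare.
model.zero_grad()
loss.backward()
g2 = model.weight.grad.clone()

print(torch.max(torch.abs(g1 - g2)))  # I expect this to be 0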