Model changing despite zero loss

This is my training function:

def ewc_loss(logits, targets, lamda, fishers, prev_opt_thetas, cur_thetas):
	"""Return the EWC penalty ``lamda / 2 * sum_i sum(F_i * (prev_i - cur_i) ** 2)``.

	Args:
		logits: Not used by this function; kept for the call signature.
		targets: Not used by this function; kept for the call signature.
		lamda: Scalar weight of the penalty.
		fishers: Indexable sequence of per-parameter weighting tensors.
		prev_opt_thetas: Indexable sequence of reference parameter tensors,
			aligned with ``fishers`` by position.
		cur_thetas: Indexable sequence of current parameter tensors,
			aligned with ``fishers`` by position.

	Returns:
		``lamda / 2`` times the accumulated weighted squared differences.
		When ``fishers`` is empty the accumulator stays the plain number 0,
		so the result is a Python number rather than a tensor.
	"""
	# Index-based access is kept on purpose: only len() and [] are required
	# of the three sequences, and a too-short sequence still raises IndexError.
	weighted_sq_diffs = (
		torch.sum(fishers[idx] * ((prev_opt_thetas[idx] - cur_thetas[idx]) ** 2))
		for idx in range(len(fishers))
	)
	# sum() starts from 0 and adds left to right, like a manual accumulator.
	penalty = sum(weighted_sq_diffs)
	return lamda / 2 * penalty

def train_ewc(model, device, train_loader, optimizer, base_loss_fn,
			lamda, fishers, prev_opt_thetas, epoch, other_loaderA, other_loaderB, description=""):
	"""Run one pass over ``train_loader`` optimising base loss + EWC penalty.

	For every batch the base loss (``base_loss_fn``) and the penalty from
	``ewc_loss`` are added, printed, back-propagated, and applied with
	``optimizer.step()``. After the pass, ``test`` is called on
	``other_loaderB`` and then on ``other_loaderA``; its results are discarded.

	Args:
		model: Module being trained; switched to train mode on entry.
		device: Device the batches are moved to.
		train_loader: Iterable of ``(inputs, targets)`` batches.
		optimizer: Optimizer stepped once per batch.
		base_loss_fn: Callable ``(logits, targets) -> loss``.
		lamda: Weight forwarded to ``ewc_loss``.
		fishers: Forwarded to ``ewc_loss``.
		prev_opt_thetas: Forwarded to ``ewc_loss``.
		epoch: Not used inside this function.
		other_loaderA: Loader evaluated last ("Test on task A").
		other_loaderB: Loader evaluated first ("Test on task B").
		description: Label shown on the progress bar.

	Returns:
		None. The running sums are accumulated but not returned.
	"""
	model.train()
	progress = tqdm(train_loader)
	progress.set_description(description)
	running_total = 0
	running_ewc = 0
	running_base = 0
	# freeze_layers(0, model)
	for batch_x, batch_y in progress:
		batch_x = batch_x.to(device)
		batch_y = batch_y.to(device)
		params = list(model.parameters())
		optimizer.zero_grad()
		outputs = model(batch_x)
		base_term = base_loss_fn(outputs, batch_y)
		ewc_term = ewc_loss(outputs, batch_y, lamda, fishers, prev_opt_thetas, params)
		objective = base_term + ewc_term
		# Debug toggle from the experiment: replace the objective with zero.
		# objective = torch.Tensor([0]).type_as(base_term.data)

		# str() is called explicitly: formatting a 0-dim tensor inside an
		# f-string would print the bare number instead of "tensor(...)".
		print("".join([
			"total loss is ", str(objective),
			" EWC penalty is ", str(ewc_term),
			" cross entropy  is ", str(base_term),
		]))
		running_total += objective.item()
		running_ewc += ewc_term.item()
		running_base += base_term.item()
		objective.backward()
		# Prints the loss tensor itself (not a gradient), despite the label.
		print("loss grad is ", objective)
		optimizer.step()
		# for param in list(model.parameters())[0:5]:
		# 	print(param.grad)

	# Evaluate task B first, then task A; a blank line follows each report.
	for eval_loader, label in ((other_loaderB, "Test on task B"),
							(other_loaderA, "Test on task A")):
		_eval_loss, _eval_acc = test(model, device, eval_loader, base_loss_fn, description=label)
		print()

So, when I set total_loss_combined = loss_crossentropy - loss_crossentropy, which is 0, the model's accuracies still change significantly: accuracy on task A drops from 71% to 28%, and accuracy on task B rises from 56% to 72%, after just 1 epoch of training on B. This is really strange, since my loss is 0.

The strangest part is that when total_loss_combined is the combined loss shown in the code above, I get the same results as mentioned above, even though total_loss_combined is not zero, and neither is loss_crossentropy nor loss_ewc.

I would recommend checking the optimizer first to see if it's using internal running estimates (as e.g. Adam does).
If that’s the case, the parameters might be updated regardless of their gradients, and you should set the gradients to None.

Let me know, if that helps somehow.

1 Like

Hey, so it didn't work. Let me restate the problem in a different way:

def train_ewc(train_loader,lamda, fishers, prev_opt_thetas, epoch,description=""):
	"""Forward-only repro: iterate ``train_loader`` without any weight update.

	Every parameter has ``requires_grad`` set to False and both
	``total_loss.backward()`` and ``optimizer.step()`` are commented out, so
	no gradient step is taken. The model is nevertheless left in train mode.

	Reads ``base_model``, ``device``, ``optimizer``, ``base_loss_fn``,
	``testA_loader`` and ``test`` from the enclosing scope instead of taking
	them as arguments. ``epoch`` is not used. Returns None.
	"""
	# loss_testA, acc_testA = test(testA_loader,description="Test on task A")
	# print()
	# loss_testB, acc_testB = test(testB_loader,description="Test on task B")
	# print()
	# NOTE: train mode stays on. Per the reply in this thread, if the model
	# contains batchnorm or dropout layers, the forward passes below still
	# update batchnorm running estimates and apply dropout; model.eval() is
	# the suggested way to prevent that.
	base_model.train()


	loss_train = 0
	pbar = tqdm(train_loader)
	pbar.set_description(description)
	# freeze_layers(0, base_model)
	# Turn off gradient tracking for every parameter.
	for param in base_model.parameters():
		param.requires_grad = False



	# Never read or incremented below.
	count=0
	for inputs, targets in pbar:
		# for name, param in base_model.named_parameters():
		# 	print(name, param.requires_grad)
		inputs, targets = inputs.to(device), targets.to(device)

		optimizer.zero_grad()
		# The forward pass is the only model call made per batch.
		logits = base_model(inputs)
		cur_thetas = list(base_model.parameters())
		loss_ewc = ewc_loss(logits, targets, lamda, fishers,
							prev_opt_thetas, cur_thetas)

		loss_cross_entropy = base_loss_fn(logits, targets)

		
		total_loss = loss_cross_entropy + loss_ewc
		print(" Total loss is " + str(total_loss)+" EWC loss is " + str(loss_ewc)+" cross entropy  is " + str(loss_cross_entropy))
		loss_train += total_loss.item()
		# Deliberately disabled for this experiment: no backward pass, no step.
		# total_loss.backward()
		# optimizer.step()

	print("total loss is: ",loss_train )
	# Mean loss per batch; computed but neither printed nor returned.
	loss_train /= len(train_loader)

	# Evaluate on task A after the forward-only pass; the results are unused.
	loss_testA, acc_testA = test(testA_loader,description="Test on task A")

This is my code. Even when I comment out optimizer.step() and total_loss.backward(), my model is still changing. How is this possible? I even set requires_grad of each parameter to False, and still the model is changing.

If you are using dropout or batchnorm layers, you would have to call `model.eval()`.
Otherwise the running estimates of the batchnorm layers will be updated and dropout will be applied.

Gotcha. Thanks a lot for your response :slight_smile: