Testing the model switches to the CPU

Hello,

I am fairly new to PyTorch, so I’m not sure I’m doing things correctly…
I am using PyTorch 1.7.1.

I have the following code for training/testing a UNet. I’ve made sure to move everything to the device, which is set to CUDA, and I even checked it with print(var.is_cuda). However, for some reason, during the testing part my GPU usage drops to 0% and my CPU goes to 100%, even though everything is set to CUDA.

I would appreciate any explanation and help fixing it 🙂

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# load_data, my_UNet and arg_parse are project-specific helpers defined elsewhere.


def train_model(model,
				device,
				crit="mse",
				opti="adam",
				dataset_file: str = "dict_wds-1201-0331-dec-march",
				num_epochs: int = 176,
				batch_size: int = 1,
				learning_rate: float = 1e-3,
				gen_load: bool = False,
				checkpoint_save: bool = True,
				discretize: bool = False,  # assumed default; passed to load_data below
				):


	train_loader, test_loader, train_dataset, test_dataset = load_data(batch_size, dataset_file=dataset_file, discretize=discretize)
	n_train = len(train_dataset)
	n_test = len(test_dataset)

	writer = SummaryWriter(comment=f'LR_{learning_rate}_BS_{batch_size}_E_{num_epochs}_C_{crit}_O_{opti}_L_{n_train}_D_{discretize}')
	global_step = 0
	ets_benchmark = 0.0

	print("params #: ", sum(x.numel() for x in model.parameters()))

	criterion = nn.MSELoss()

	criterion = criterion.to(device)

	optimizer = torch.optim.Adam(
		model.parameters(),
		lr=learning_rate,
		weight_decay=1e-8
	)

	scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 2)

	for epoch in range(num_epochs):
		running_loss = 0.0
		loss_values = []
               
		# TRAINING PART

		with tqdm(train_loader, desc=f"Epoch [{epoch + 1}/{num_epochs}]", colour="#009B77") as t:
			for i, (ecmwf, truth) in enumerate(t):
				model.train()

				ecmwf = ecmwf.to(device, dtype=torch.float32)
				truth = truth.to(device, dtype=torch.float32)

				optimizer.zero_grad()
				outputs = model(ecmwf)

				if crit == "rmse":
					loss = torch.sqrt(criterion(outputs, truth))
				else:
					loss = criterion(outputs, truth)
				running_loss += loss.item()

				loss_values.append(loss.item())
				loss.backward()

				nn.utils.clip_grad_value_(model.parameters(), 0.1)
				optimizer.step()

				t.set_postfix(**{"iter": f"{i + 1}/{n_train // batch_size}", "batch loss": loss.item(),
								"avg loss": sum(loss_values)/n_train})
				writer.add_scalar("Loss/Train", loss.item(), global_step)

				global_step += 1


				# TESTING PART

				if global_step % (n_train // (batch_size)) == 0:
					tot_l1_loss = 0
					tot_mse = 0
					tot_ets = 0
					with tqdm(test_loader, desc=f"Test", colour="#D2691E") as tl:
						for k, (ecmwf, truth) in enumerate(tl):
							for name, param in model.named_parameters():

								name = name.replace(".", "/")
								writer.add_histogram("Weights/" + name, param.data.cpu().numpy(), global_step)
								writer.add_histogram("Gradients/" + name, param.grad.cpu().numpy(), global_step)

							model.eval()
							with torch.no_grad():
								ecmwf = ecmwf.to(device)
								truth = truth.to(device)

								outputs = model(ecmwf)

								tot_mse += F.mse_loss(truth, outputs)

							# print(f"Validation mse: {tot_mse / len(test_loader)}")
							writer.add_scalar('mse/test', tot_mse / len(test_loader), global_step)

							tl.set_postfix(**{
								"Validation mse": f"{tot_mse / len(test_loader)}",
							})

							if k == len(test_loader)-1:
								writer.add_images('truth', truth, global_step)
								writer.add_images('pred', outputs, global_step)

					
					val_loss = tot_mse / len(test_loader)

		scheduler.step()  # CosineAnnealingLR is stepped without a metric (only ReduceLROnPlateau takes one)

		print(f"ETS benchmark avg on train_set (ecmwf/cmorph): {ets_benckmark / (n_train)}")

		if checkpoint_save:
			torch.save(model.state_dict(), "./cp.pth")
			print("Checkpoint reached! :)")

	writer.close()

def main():
	args = arg_parse()

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	torch.cuda.empty_cache()

	if device.type == "cuda":
		torch.backends.cudnn.benchmark = True

	model = my_UNet()

	model.to(device=device)

	train_model(model, device)


if __name__ == "__main__":
	main()

Your system might be busy with CPU operations (such as data loading) and the GPU might thus need to wait. You could profile the code and check whether e.g. the data loading is indeed using the CPU resources until the GPU can execute the forward pass (or alternatively remove all data loading, create random tensors on the GPU, execute the forward pass, and check nvidia-smi again).
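A minimal sketch of that sanity check, assuming my_UNet from the code above; the input shape below is a placeholder, not the real UNet input size:

import torch

device = torch.device("cuda")
model = my_UNet().to(device)
model.eval()

# Placeholder input shape -- replace with the actual UNet input size.
dummy = torch.randn(1, 3, 256, 256, device=device)

with torch.no_grad():
	for _ in range(100):
		out = model(dummy)  # watch nvidia-smi while this loop runs

torch.cuda.synchronize()  # wait until all queued GPU work has finished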

I have tried your suggestion, but unfortunately it did not solve the issue. Even with random tensors on the GPU, it still switches to the CPU for the testing part.

I have profiled the code, and it seems the bottleneck is <method 'sort' of 'numpy.ndarray' objects>.
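Output of that form typically comes from cProfile/pstats; a minimal sketch of collecting such a profile, with train.prof as a placeholder file name:

import cProfile
import pstats

# Profile the training entry point and dump the statistics to a file.
cProfile.runctx("train_model(model, device)", globals(), locals(), "train.prof")

stats = pstats.Stats("train.prof")
stats.sort_stats("tottime").print_stats(20)  # top 20 entries by time spent inside each function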

PyTorch won’t switch to the CPU unless the device is specified manually, so it seems you are still executing CPU operations.

'sort' of 'numpy.ndarray' objects indeed points to a numpy operation, which will be executed on the CPU.
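With the profile dumped to a file, pstats can also show which code paths end up calling that sort (a sketch, reusing the hypothetical train.prof dump from above):

import pstats

stats = pstats.Stats("train.prof")  # hypothetical profile dump
stats.print_callers("sort")  # list the callers that lead to the numpy sort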

I’ve figured it out.

In the testing loop I was calling:

writer.add_histogram("Weights/" + name, param.data.cpu().numpy(), global_step)
writer.add_histogram("Gradients/" + name, param.grad.cpu().numpy(), global_step)

These calls were triggering the 'sort' of 'numpy.ndarray' operation that was slowing everything down.
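One possible fix (a sketch, not necessarily the exact change that was made): since the two add_histogram calls sit inside the loop over test_loader, they run once per test batch for every parameter, each time copying tensors to the host and letting TensorBoard compute the histogram, which is where the numpy sort comes from. Moving them out of the batch loop keeps the same logging but runs it only once per evaluation:

# Log the weight/gradient histograms once per evaluation instead of once per
# test batch and per parameter; the histogram computation still happens on the
# CPU, but far less frequently.
for name, param in model.named_parameters():
	tag = name.replace(".", "/")
	writer.add_histogram("Weights/" + tag, param.data.cpu(), global_step)
	if param.grad is not None:  # skip parameters that have no gradient yet
		writer.add_histogram("Gradients/" + tag, param.grad.cpu(), global_step)

# ... then run the evaluation loop over test_loader without any histogram
# logging inside it.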