Can't get both high GPU memory & utilization

I was initially trying to max out my GPU memory usage by using large batch sizes (~300 → 4000), but then I noticed my GPU utilization was close to 0% most of the time and only spiked to 99% every once in a while. I kept lowering the batch size until about ~20, and then I finally saw GPU utilization staying above 90% most of the time. Is this normal? How can I utilize my hardware most efficiently?

My model is a convolutional image classifier for planets. It has 3 convolution layers and 3 linear layers. I have 3,500 original planet images that I expanded by rotation to a total of ~18,000 labelled image examples. Is there some hardware limitation I’m hitting? I'm just trying to get a grasp of what is going on and how to get a setup that properly utilizes the hardware. Thanks for any help.
My training script is below:

import torch
import torchvision
import matplotlib.pyplot as plt
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import time
import hashlib
import os
import sys

import PlanetNet as n6
from utils import *

# Training configuration used by the rest of the script.
epochs = 100  # number of train/test passes run by the epoch loop
learning_rate = 0.001  # passed to optim.Adam as lr
train_batch_size = 256  # batch_size of the training DataLoader
test_batch_size = 256  # batch_size of the test DataLoader
test_num = 2048  # size of the first (test) split carved out of the dataset

loss_dir = "loss_data"  # directory the per-run loss CSV is written into
model_file = ""  # printed as "Using model"; NOTE(review): empty here, presumably meant to be a path -- confirm
model_class = ""  # passed to get_id() to build the architecture id used in the CSV file name

# Pick the compute device: use CUDA when present, otherwise fall back to the
# CPU and say so (the warning used to be printed on the success path).
device = "cpu"
if torch.cuda.is_available():
	device = "cuda"
else:
	print("Error, GPU's unavailable!")

dc = torch.cuda.device_count()
# get_device_name() raises on a machine without CUDA, so only query it when a
# GPU was actually selected.
dev_name = torch.cuda.get_device_name() if device == "cuda" else "cpu"

# Make sure the output directory for the loss CSV exists before training.
os.makedirs(loss_dir, exist_ok=True)

print("Using model: ", model_file)
print("Model Arch ID: ", get_id(model_class))
print("Using device: ", device)
print("Num devices: ", dc)
print("Device name: ", dev_name)

# DataParallel splits each input batch across the visible GPUs.
model = n6.PlanetNet()
model = nn.DataParallel(model).to(device)

xform = transforms.Compose([
	# add a resize transform
	# NOTE(review): the transform list is truncated in this copy of the script.
	# A transforms.Resize(...) matching the input size PlanetNet expects belongs
	# here -- confirm the size against the model definition.
	transforms.ToTensor(),
])

raw_data = datasets.ImageFolder(root="images-aug/planets", transform=xform)
print("Image Classes:")
print(raw_data.classes)

# Randomly hold out test_num samples for evaluation; the rest is training data.
dataz = torch.utils.data.random_split(raw_data, [test_num, len(raw_data)-test_num])
raw_test = dataz[0]
raw_train = dataz[1]

# print the size of the training and test sets
print("Training set size: ", len(raw_train))
print("Test set size: ", len(raw_test))

# pin_memory speeds up host-to-GPU copies; persistent_workers keeps the worker
# processes alive between epochs instead of re-spawning them every epoch.
test_loader = torch.utils.data.DataLoader(raw_test, shuffle=True, num_workers=8, batch_size=test_batch_size, pin_memory=(device == "cuda"), persistent_workers=True)
train_loader = torch.utils.data.DataLoader(raw_train, shuffle=True, num_workers=32, batch_size=train_batch_size, pin_memory=(device == "cuda"), persistent_workers=True)

# Per-epoch history that the end of the script plots.
# NOTE(review): nothing visible in this copy appends to these lists -- confirm
# they are filled in by the epoch loop.
epochz = []
train_loss = []
test_loss = []

# Adam over all model parameters, with cross-entropy as the classification loss.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

def train(dataloader, model, loss_fn, optimizer):
	"""Train ``model`` for one pass over ``dataloader``.

	Each batch is moved to the device that holds the model's parameters, run
	through the model, and used for one optimizer step. Progress is printed
	every 100 batches.

	Args:
		dataloader: Iterable of ``(X, y)`` tensor batches. Must expose a sized
			``dataset`` attribute (used only for the progress display).
		model: Module to optimize; it is switched to training mode.
		loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
			tensor.
		optimizer: Optimizer built over ``model``'s parameters.

	Returns:
		The mean of the per-batch losses as a float, or ``0.0`` when the
		dataloader yields no batches.
	"""
	size = len(dataloader.dataset)
	# Send batches to wherever the weights live instead of relying on a global.
	first_param = next(model.parameters(), None)
	target = first_param.device if first_param is not None else None

	model.train()
	running_loss = 0.0
	num_batches = 0
	for batch, (X, y) in enumerate(dataloader):
		if target is not None:
			# non_blocking only matters for pinned host memory; otherwise this
			# is an ordinary copy.
			X, y = X.to(target, non_blocking=True), y.to(target, non_blocking=True)
		yp = model(X)
		loss = loss_fn(yp, y)

		# Backpropagate and update the weights.
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		# Accumulate as a tensor so the GPU is not synchronised on every batch.
		running_loss = running_loss + loss.detach()
		num_batches += 1
		if batch % 100 == 0:
			current = batch * len(X)
			print(f"loss: {loss.item():>7f} [{current:>5d}/{size:>5d}]")

	if num_batches == 0:
		return 0.0
	return float(running_loss) / num_batches

def test(dataloader, model, loss_fn):
	"""Evaluate ``model`` on ``dataloader`` and print accuracy and average loss.

	The model is put in eval mode for the duration of the call and returned
	to its previous mode afterwards. Gradients are disabled.

	Args:
		dataloader: Iterable of ``(X, y)`` tensor batches. Must expose a sized
			``dataset`` attribute (used as the accuracy denominator).
		model: Module to evaluate.
		loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
			tensor (a per-batch mean for the default CrossEntropyLoss).

	Returns:
		``(avg_loss, accuracy)``: the mean of the per-batch losses and the
		fraction of correctly classified samples. Both are ``0.0`` for an
		empty dataloader.
	"""
	size = len(dataloader.dataset)
	# Send batches to wherever the weights live instead of relying on a global.
	first_param = next(model.parameters(), None)
	target = first_param.device if first_param is not None else None

	loss, correct = 0, 0
	num_batches = 0
	was_training = model.training
	model.eval()
	try:
		with torch.no_grad():
			for X, y in dataloader:
				if target is not None:
					X, y = X.to(target, non_blocking=True), y.to(target, non_blocking=True)
				yp = model(X)
				loss += loss_fn(yp, y).item()
				correct += (yp.argmax(1) == y).type(torch.float).sum().item()
				num_batches += 1
	finally:
		# Leave the model in whatever mode the caller had it in.
		model.train(was_training)

	# Each accumulated term is already a per-batch mean, so average over the
	# number of batches; dividing by the sample count would shrink the
	# reported loss by roughly the batch size.
	loss /= max(num_batches, 1)
	correct /= max(size, 1)
	print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {loss:>8f} \n")
	return loss, correct


# The name starts with "test", so tell pytest not to collect it as a test case
# if this module is ever imported from a test file.
test.__test__ = False

# Wall-clock timestamp taken just before the first epoch starts.
start_time = time.time()

# One training pass followed by one evaluation pass per epoch.
for t in range(epochs):
	epoch_banner = f"Epoch {t+1} -------------------------------"
	print(epoch_banner)
	train(train_loader, model, loss_fn, optimizer)
	test(test_loader, model, loss_fn)

# Total elapsed time for all epochs; end_time is reused for the CSV file name.
end_time = time.time()
dur = end_time - start_time
print("Done, runtime: ", dur, " seconds")

# save the model state dict
# NOTE(review): the save call was truncated into the comment in this copy of
# the script, so the model was never written out. It is restored here and
# skipped while model_file is still the empty placeholder.
if model_file:
	torch.save(model.state_dict(), model_file)

model_id = get_id(model_class)

# File name carries the architecture id and the UTC time training finished.
loss_file = "loss_" + model_id + "_" + time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime(end_time)) + ".csv"

print(f"Saving loss data to {loss_dir}/{loss_file}")

# save train and test loss to csv
# Layout: three header rows, each followed by its value row(s).
# NOTE(review): the value rows were missing from this copy of the script; they
# are rebuilt here from the column names -- confirm against the intended format.
with open(loss_dir+"/"+loss_file, "w") as f:
	f.write("gpus, type\n")
	f.write(f"{dc}, {dev_name}\n")
	f.write("train data,test data,epochs,lr,train bs,test bs,train dur (s)\n")
	f.write(f"{len(raw_train)},{len(raw_test)},{epochs},{learning_rate},{train_batch_size},{test_batch_size},{dur}\n")
	f.write("epoch,train loss,test loss\n")
	# zip stops at the shortest list, so a partially filled (or empty) history
	# writes only the rows it has instead of raising IndexError.
	for epoch_no, tr_loss, te_loss in zip(epochz, train_loss, test_loss):
		f.write(f"{epoch_no},{tr_loss},{te_loss}\n")

# plot training and test loss
plt.plot(epochz, train_loss, label="Training Loss")
plt.plot(epochz, test_loss, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
# The curve labels only appear once a legend is drawn, and the figure is only
# displayed once show() is called.
plt.legend()
plt.show()

You might be running into a data loading bottleneck by increasing the batch size massively as your CPU might not be able to scale with the increased workload compared to the GPU.
A quick test would be to remove the actual data loading, initialize a single (large) batch of random inputs, and check the GPU utilization.
If it does not suffer from the short utilization peaks, it would indeed point to slow data loading, and you could check if e.g. increasing the number of workers helps.

I happen to have an older system (Supermicro 1027GR-TRF) with 2 older Xeon processors (24 cores each), 32 GB RAM, and I fit in 2x NVIDIA A4000 GPUs (16 GB VRAM). I increased num_workers to 48 and it helps a lot, but it doesn’t solve the problem at higher batch sizes. I will try the random tensor data, thanks.