Can't get both high GPU memory & utilization

I was initially trying to max out my GPU memory usage by using large batch sizes (~ 300 → 4000), but then I noticed my GPU utilization was close to 0% most of the time, with spikes to 99% every once in a while. I kept lowering the batch size until about ~ 20, and only then did I see GPU utilization staying above 90% most of the time. Is this normal? How can I utilize my hardware most efficiently?

My model is a convolutional image classifier for planets. It has 3 convolution layers and 3 linear layers. I have 3,500 original planet images that I expanded by rotation to a total of ~ 18,000 labelled image examples. Is there some hardware limitation I’m hitting? I’m just trying to get a grasp of what is going on and how to get a setup that properly utilizes the hardware. Thanks for any help.
My training script is below:

import torch
import torchvision
import matplotlib.pyplot as plt
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torchvision import datasets, transforms
import time
import hashlib
import os
import sys

import PlanetNet as n6
from utils import *

epochs = 100
learning_rate = 0.001
train_batch_size = 256
test_batch_size = 256
test_num = 2048

loss_dir = "loss_data"
model_file = "model.pt"
model_class = "PlanetNet.py"

device = "cpu"
if torch.cuda.is_available():
	device = "cuda"
else:
	print("Error, GPU's unavailable!")
	sys.exit()

dc = torch.cuda.device_count()
dev_name = torch.cuda.get_device_name()

if not os.path.exists(loss_dir):
	os.makedirs(loss_dir)


print("Using model: ", model_file)
print("Model Arch ID: ", get_id(model_class))
print("Using device: ", device)
print("Num devices: ", dc)
print("Device name: ", dev_name)

model = n6.PlanetNet()
model = nn.DataParallel(model).to(device)

xform = transforms.Compose([
	transforms.Resize((100, 100)),  # resize all inputs to a fixed 100x100
	transforms.ToTensor(),
	transforms.Grayscale()          # convert to a single channel
])

raw_data = datasets.ImageFolder(root="images-aug/planets", transform=xform)
print("Image Classes:")
print(raw_data.find_classes("images-aug/planets")[1])

dataz = torch.utils.data.random_split(raw_data, [test_num, len(raw_data)-test_num])
raw_test = dataz[0]
raw_train = dataz[1]

# print the size of the training and test sets
print("Training set size: ", len(raw_train))
print("Test set size: ", len(raw_test))

test_loader = torch.utils.data.DataLoader(raw_test, shuffle=True, num_workers=8, batch_size=test_batch_size)
train_loader = torch.utils.data.DataLoader(raw_train, shuffle=True, num_workers=32, batch_size=train_batch_size)

epochz = []
train_loss = []
test_loss = []

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

def train(dataloader, model, loss_fn, optimizer):
	size = len(dataloader.dataset)
	model.train()
	loss = None
	for batch, (X, y) in enumerate(dataloader):
		X, y = X.to(device), y.to(device)
		yp = model(X)
		loss = loss_fn(yp, y)
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()
		if batch % 100 == 0:
			current = batch * len(X)
			print(f"loss: {loss.item():>7f} [{current:>5d}/{size:>5d}]")
	# record the last batch's loss for this epoch
	train_loss.append(loss.item())

def test(dataloader, model, loss_fn):
	size = len(dataloader.dataset)
	model.eval()
	loss, correct = 0, 0
	with torch.no_grad():
		for X,y in dataloader:
			X,y = X.to(device), y.to(device)
			yp = model(X)
			loss += loss_fn(yp, y).item()
			correct += (yp.argmax(1) == y).type(torch.float).sum().item()
	# average the summed per-batch losses over the number of batches,
	# and the correct predictions over the number of samples
	loss /= len(dataloader)
	correct /= size
	print()
	print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {loss:>8f} \n")
	test_loss.append(loss)

# get current time
start_time = time.time()

for t in range(epochs):
	print(f"Epoch {t+1} -------------------------------")
	train(train_loader, model, loss_fn, optimizer)
	test(test_loader, model, loss_fn)
	epochz.append(t)

end_time = time.time()
dur = end_time - start_time
print("Done, runtime: ", dur, " seconds")

# save the model state dict (unwrap DataParallel so the keys don't get the "module." prefix)
torch.save(model.module.state_dict(), model_file)

model_id = get_id(model_class)

loss_file = "loss_" + model_id + "_" + time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime(end_time)) + ".csv"

print(f"Saving loss data to {loss_dir}/{loss_file}")

# save train and test loss to csv
with open(loss_dir+"/"+loss_file, "w") as f:
	f.write("gpus, type\n")
	f.write(f"{dc},{dev_name}\n")
	f.write("train data,test data,epochs,lr,train bs,test bs,train dur (s)\n")
	f.write(f"{len(raw_train)},{len(raw_test)},{epochs},{learning_rate},{train_batch_size},{test_batch_size},{dur}\n")
	f.write("epoch,train loss,test loss\n")
	for i in range(epochs):
		f.write(f"{epochz[i]},{train_loss[i]},{test_loss[i]}\n")

# plot training and test loss
plt.plot(epochz, train_loss, label="Training Loss")
plt.plot(epochz, test_loss, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

You might be running into a data loading bottleneck: when you increase the batch size massively, the CPU-side data loading might not be able to scale with the increased workload, so the GPU sits idle waiting for the next batch.
A quick test would be to remove the actual data loading, initialize a single (large) batch of random inputs directly on the GPU, and check the GPU utilization.
If it no longer shows only short utilization peaks, that would indeed point to slow data loading, and you could check if e.g. increasing the number of workers helps.
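
A minimal sketch of such a test, assuming PlanetNet accepts the grayscale 100x100 inputs produced by the transforms above (the batch size, class count, and iteration count below are arbitrary placeholders):

import time
import torch
from torch import nn, optim
import PlanetNet as n6

device = "cuda"
model = n6.PlanetNet().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# a single random batch created directly on the GPU, so no data loading is involved
# (input shape assumed from the Resize((100,100)) + Grayscale() transforms;
#  num_classes=10 is a placeholder -- substitute your real class count)
batch_size, num_classes = 256, 10
X = torch.randn(batch_size, 1, 100, 100, device=device)
y = torch.randint(0, num_classes, (batch_size,), device=device)

model.train()
torch.cuda.synchronize()
start = time.time()
for _ in range(100):
	optimizer.zero_grad()
	yp = model(X)
	loss = loss_fn(yp, y)
	loss.backward()
	optimizer.step()
torch.cuda.synchronize()
print(f"100 iterations took {time.time() - start:.2f} s")

Watch nvidia-smi while this runs; if utilization stays high here but collapses once the real DataLoader is used, the input pipeline is the bottleneck rather than the model or the GPU.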

I happen to have an older system (Supermicro 1027GR-TRF) with 2 older Xeon processors (24 cores each), 32 GB RAM, and I fitted 2x NVIDIA A4000 GPUs (16 GB VRAM each). I increased num_workers to 48 and it helps a lot, but it doesn’t solve the problem at higher batch sizes. I will try the random tensor data, thanks
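
A rough sketch of the DataLoader settings that are usually worth experimenting with in this situation; the values below are guesses, not tuned for this machine:

# hypothetical loader settings to experiment with -- the values are not tuned
train_loader = torch.utils.data.DataLoader(
	raw_train,
	batch_size=train_batch_size,
	shuffle=True,
	num_workers=24,            # roughly one worker per physical core that is actually free
	pin_memory=True,           # page-locked host memory enables faster async copies to the GPU
	persistent_workers=True,   # keep worker processes alive between epochs
	prefetch_factor=4,         # number of batches each worker loads ahead of time
)

# with pin_memory=True, the host-to-device copies in train()/test() can overlap compute:
# X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)

If decoding the image files turns out to be the limiting factor rather than the number of workers, pre-resizing the images on disk to 100x100 once would also cut the per-sample CPU cost.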