I was initially trying to max out my GPU memory usage by using high batch sizes (~300 → 4000), but then I noticed my GPU utilization was close to 0% most of the time and only spiked to 99% every once in a while. I kept lowering the batch size until it was about ~20, and then I finally saw GPU utilization staying above 90% most of the time. Is this normal? How can I utilize my hardware most efficiently?
My model is a convolutional image classifier for planets. It has 3 convolution layers and 3 linear layers. I have 3,500 original planet images that I expanded by rotation to a total of ~18,000 labelled image examples. Is there some hardware limitation I’m hitting? I'm just trying to get a grasp of what is going on and how to get a setup that properly utilizes the hardware. Thanks for any help.
My training script is below:
import torch
import torchvision
import matplotlib.pyplot as plt
import torch.optim as optim
import torch.nn.functional as F
from torch import nn
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import time
import hashlib
import os
import sys
import PlanetNet as n6
from utils import *
# --- Run configuration ------------------------------------------------------
epochs = 100                  # number of passes over the training split
learning_rate = 0.001         # learning rate handed to the Adam optimizer
train_batch_size = 256        # batch size of the training DataLoader
test_batch_size = 256         # batch size of the test DataLoader
test_num = 2048               # number of samples held out as the test split
loss_dir = "loss_data"        # directory that receives the per-run loss CSV
model_file = "model.pt"       # path the trained state dict is saved to
model_class = "PlanetNet.py"  # file passed to get_id() to tag this run

# --- Device selection -------------------------------------------------------
# This script only runs on CUDA. Without a GPU it prints the error and exits
# with a non-zero status so that shells and schedulers see the failure.
if not torch.cuda.is_available():
    print("Error, GPU's unavailable!")
    sys.exit(1)
device = "cuda"

dc = torch.cuda.device_count()
dev_name = torch.cuda.get_device_name()

# exist_ok avoids the check-then-create race of a separate os.path.exists().
os.makedirs(loss_dir, exist_ok=True)

print("Using model: ", model_file)
# get_id is expected to come from `utils` (star import) -- TODO confirm.
print("Model Arch ID: ", get_id(model_class))
print("Using device: ", device)
print("Num devices: ", dc)
print("Device name: ", dev_name)
# --- Model ------------------------------------------------------------------
# DataParallel wraps the network before it is moved to the selected device.
model = n6.PlanetNet()
model = nn.DataParallel(model).to(device)

# --- Data pipeline ----------------------------------------------------------
# Every image is resized to 100x100, converted to a tensor and reduced to
# grayscale each time it is loaded.
xform = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Grayscale(),
])

raw_data = datasets.ImageFolder(root="images-aug/planets", transform=xform)
print("Image Classes:")
# The dataset already stores the class -> index mapping built at construction
# time; reading it avoids scanning the image directory a second time.
print(raw_data.class_to_idx)

# First split is the held-out test set, the remainder is used for training.
raw_test, raw_train = torch.utils.data.random_split(
    raw_data, [test_num, len(raw_data) - test_num]
)

# print the size of the training and test sets
print("Training set size: ", len(raw_train))
print("Test set size: ", len(raw_test))

# Never ask for more loader processes than the machine has CPUs.
_cpu_count = os.cpu_count() or 1

# NOTE(review): the settings below are meant to keep the GPU fed -- workers
# stay alive between epochs instead of being respawned, and batches are put
# in pinned host memory for faster host-to-device copies. Profile to confirm
# the effect on this machine.
test_loader = torch.utils.data.DataLoader(
    raw_test,
    shuffle=False,  # evaluation metrics do not depend on sample order
    num_workers=min(8, _cpu_count),
    batch_size=test_batch_size,
    pin_memory=True,
    persistent_workers=True,
)
train_loader = torch.utils.data.DataLoader(
    raw_train,
    shuffle=True,
    num_workers=min(32, _cpu_count),
    batch_size=train_batch_size,
    pin_memory=True,
    persistent_workers=True,
)
# Per-epoch history: epoch index, recorded training loss, recorded test loss.
# These are filled during the run and later written to CSV and plotted.
epochz, train_loss, test_loss = [], [], []

# Classification objective and the optimizer that updates the model weights.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
def train(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader`` and record the final loss.

    Each batch is moved to the module-level ``device``, pushed through
    ``model``, and used for one optimizer step. Progress is printed every
    100 batches. After the pass, the loss of the last batch is appended to
    the module-level ``train_loss`` list as a Python float.

    Args:
        dataloader: iterable of ``(inputs, targets)`` batches that also
            exposes ``.dataset`` (used only for the progress read-out).
        model: network to optimise; switched to training mode.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a
            scalar tensor.
        optimizer: optimizer bound to ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    model.train()

    # Kept separate from the per-batch tensor so that the logging branch
    # below can never replace the tensor with a float before .item() is
    # called on it after the loop.
    last_loss = None

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        yp = model(X)
        batch_loss = loss_fn(yp, y)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        last_loss = batch_loss.detach()

        if batch % 100 == 0:
            current = batch * len(X)
            print(f"loss: {batch_loss.item():>7f} [{current:>5d}/{size:>5d}]")

    # An empty loader yields no loss; record NaN so the history still has
    # exactly one entry per epoch.
    train_loss.append(float("nan") if last_loss is None else last_loss.item())
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and record the average loss.

    Runs without gradient tracking, prints the accuracy and average loss,
    and appends the average loss to the module-level ``test_loss`` list.

    The average is taken over batches, not samples: ``loss_fn`` is called
    once per batch and, with the default mean reduction of
    ``nn.CrossEntropyLoss`` used by this script, already returns a per-batch
    mean. Dividing the summed batch means by the sample count would shrink
    the reported loss by roughly the batch size.

    Args:
        dataloader: iterable of ``(inputs, targets)`` batches exposing
            ``.dataset`` and ``len()``.
        model: network to evaluate; switched to evaluation mode.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a
            scalar tensor.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()

    total_loss, correct = 0.0, 0.0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            yp = model(X)
            total_loss += loss_fn(yp, y).item()
            correct += (yp.argmax(1) == y).type(torch.float).sum().item()

    avg_loss = total_loss / num_batches  # mean of the per-batch losses
    accuracy = correct / size            # fraction of correctly labelled samples

    print()
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")
    test_loss.append(avg_loss)
# --- Training run -----------------------------------------------------------
# Wall-clock timestamps: the difference is the reported runtime, and the end
# time also stamps the loss file name below.
start_time = time.time()
for t in range(epochs):
    print(f"Epoch {t+1} -------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(test_loader, model, loss_fn)
    epochz.append(t)
end_time = time.time()
dur = end_time - start_time
print("Done, runtime: ", dur, " seconds")

# save the model state dict
torch.save(model.state_dict(), model_file)

# --- Loss report ------------------------------------------------------------
model_id = get_id(model_class)
timestamp = time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime(end_time))
loss_file = "loss_" + model_id + "_" + timestamp + ".csv"
print(f"Saving loss data to {loss_dir}/{loss_file}")

# save train and test loss to csv: two header/value pairs describing the run,
# followed by one row per epoch.
with open(os.path.join(loss_dir, loss_file), "w") as f:
    f.write("gpus, type\n")
    f.write(f"{dc},{dev_name}\n")
    f.write("train data,test data,epochs,lr,train bs,test bs,train dur (s)\n")
    f.write(f"{len(raw_train)},{len(raw_test)},{epochs},{learning_rate},{train_batch_size},{test_batch_size},{dur}\n")
    f.write("epoch,train loss,test loss\n")
    # Walk the three histories in lockstep instead of indexing by epoch count.
    for epoch, tr, te in zip(epochz, train_loss, test_loss):
        f.write(f"{epoch},{tr},{te}\n")

# plot training and test loss
plt.plot(epochz, train_loss, label="Training Loss")
plt.plot(epochz, test_loss, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()