I am finetuning the pretrained AlexNet to classify images of galaxies. I have a dataset of about 14,000 images, and on my base M1 MacBook Air I can train and test over 10 epochs in roughly 35-38 minutes.
I am trying to repeat this procedure remotely on my university computing resources, where I can access a Tesla P100 12GB GPU. I expect this to be much faster than my MacBook Air. However, when I train remotely, it seems to be approximately 1/10 as fast as my MacBook Air. (I don’t have a precise number because I’ve only asked the scheduler for “jobs” that are 2 hours long, and I get booted off before training finishes.) I’m pretty sure this indicates a problem somewhere.
I am trying to figure out whether there is a problem in my PyTorch code, or whether I’m somehow incorrectly using my university resources. Therefore I am sharing my code below in the hopes that someone can tell me whether it has any errors.
I exported all the code from my Jupyter Notebook (run remotely) as Markdown, and pasted it below. A lot of it’s not really relevant, but I thought I’d include all of it. You can see where I confirm that PyTorch sees the GPU, and I also use the .to(device)
method to move the model to the GPU.
Note that my code is a sort of hodgepodge from Sebastian Raschka’s 2022 PyTorch book, and the PyTorch tutorials.
To be clear, the code seems to work when I run it remotely, but it’s just agonizingly slow.
Thanks for any help.
My code:
import torch
import torchvision.transforms as transforms
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pathlib
from PIL import Image
imgdir_path = pathlib.Path('/home/jsa378/NA10/14K/50kpc')
file_list = sorted([str(path) for path in
imgdir_path.glob('*.jpg')])
file_list[0:9]
['/home/jsa378/NA10/14K/50kpc/fpC-000745-40-2-0532-0479.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000745-40-3-0529-0342.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000745-40-4-0518-0263.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000745-40-4-0521-0197.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000745-40-5-0518-0184.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000745-40-5-0531-0300.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000745-40-5-0533-0359.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000752-40-1-0145-0069.jpg',
'/home/jsa378/NA10/14K/50kpc/fpC-000752-40-1-0177-0005.jpg']
len(file_list)
14034
torch.cuda.is_available()
torch.cuda.device_count()
torch.cuda.current_device()
torch.cuda.device(0)
torch.cuda.get_device_name(0)
'Tesla P100-PCIE-12GB'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
cuda
namorph = pd.read_table("/home/jsa378/NA10/NairAbrahamMorphology.cat")
namorph.TType
0 3.0
1 -5.0
2 -2.0
3 4.0
4 5.0
...
14029 2.0
14030 2.0
14031 1.0
14032 0.0
14033 3.0
Name: TType, Length: 14034, dtype: float64
labels = torch.zeros(len(namorph.TType), dtype=torch.int64)
for i in range(len(namorph.TType)):
if namorph.TType[i] == -5.0:
labels[i] = 0 # Elliptical
elif -3 <= namorph.TType[i] and namorph.TType[i] <= 0:
labels[i] = 1 # Lenticular
elif 1 <= namorph.TType[i] and namorph.TType[i] <= 9:
labels[i] = 2 # Spiral
elif 10 == namorph.TType[i] or 99 == namorph.TType[i]:
labels[i] = 3 # Irr+Misc
labels
tensor([2, 0, 1, ..., 2, 1, 2])
class ImageDataset():#(Dataset):
def __init__(self, file_list, labels):
self.file_list = file_list
self.labels = labels
def __getitem__(self, index):
file = self.file_list[index]
label = self.labels[index]
return file, label
def __len__(self):
return len(self.labels)
image_dataset = ImageDataset(file_list, labels)
img_height, img_width = 256, 256
size = [224, 224]
torch.manual_seed(17)
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Resize((img_height, img_width)),
transforms.RandomCrop(size),
transforms.RandomHorizontalFlip(),
transforms.RandomVerticalFlip(),
transforms.RandomRotation(180),
])
class ImageDataset():#(Dataset):
def __init__(self, file_list, labels, transform=None):
self.file_list = file_list
self.labels = labels
self.transform = transform
def __getitem__(self, index):
img = Image.open(self.file_list[index])
if self.transform is not None:
img = self.transform(img)
label = self.labels[index]
return img, label
def __len__(self):
return len(self.labels)
image_dataset = ImageDataset(file_list, labels, transform)
len(image_dataset)
14034
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context
# model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=True)
# model.to(device)
# model.eval()
# if torch.cuda.is_available():
# input_batch = input_batch.to('cuda')
# model.to('cuda')
from __future__ import print_function
from __future__ import division
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms
import time
import copy
print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)
PyTorch Version: 1.10.0
Torchvision Version: 0.11.1
data_dir = imgdir_path
model_name = "alexnet"
num_classes = 4
batch_size = 8
num_epochs = 10
feature_extract = False
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
since = time.time()
val_acc_history = []
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
# Get model outputs and calculate loss
# Special case for inception because in training it has an
# auxiliary output. In train mode we calculate the loss by
# summing the final output and the auxiliary output but in
# testing we only consider the final output.
if is_inception and phase == 'train':
# From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
outputs, aux_outputs = model(inputs)
loss1 = criterion(outputs, labels)
loss2 = criterion(aux_outputs, labels)
loss = loss1 + 0.4*loss2
else:
outputs = model(inputs)
loss = criterion(outputs, labels)
_, preds = torch.max(outputs, 1)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / len(dataloaders[phase].dataset)
epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
if phase == 'val':
val_acc_history.append(epoch_acc)
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model, val_acc_history
def set_parameter_requires_grad(model, feature_extracting):
if feature_extracting:
for param in model.parameters():
param.requires_grad = False
# model.classifier[6] = nn.Linear(4096,num_classes)
# model.eval()
def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
# Initialize these variables which will be set in this if statement.
# Each of these variables is model specific.
model_ft = None
input_size = 0
if model_name == "alexnet":
""" Alexnet
"""
model_ft = models.alexnet(pretrained=use_pretrained)
set_parameter_requires_grad(model_ft, feature_extract)
num_ftrs = model_ft.classifier[6].in_features
model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
input_size = 224
else:
print("Invalid model name, exiting...")
exit()
return model_ft, input_size
model_ft, input_size = initialize_model(model_name, num_classes, feature_extract=False, use_pretrained=True)
print(model_ft)
AlexNet(
(features): Sequential(
(0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
(1): ReLU(inplace=True)
(2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(4): ReLU(inplace=True)
(5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(7): ReLU(inplace=True)
(8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(9): ReLU(inplace=True)
(10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU(inplace=True)
(12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
(classifier): Sequential(
(0): Dropout(p=0.5, inplace=False)
(1): Linear(in_features=9216, out_features=4096, bias=True)
(2): ReLU(inplace=True)
(3): Dropout(p=0.5, inplace=False)
(4): Linear(in_features=4096, out_features=4096, bias=True)
(5): ReLU(inplace=True)
(6): Linear(in_features=4096, out_features=4, bias=True)
)
)
model_ft = model_ft.to(device)
params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
params_to_update = []
for name,param in model_ft.named_parameters():
if param.requires_grad == True:
params_to_update.append(param)
print("\t",name)
else:
for name,param in model_ft.named_parameters():
if param.requires_grad == True:
print("\t",name)
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
Params to learn:
features.0.weight
features.0.bias
features.3.weight
features.3.bias
features.6.weight
features.6.bias
features.8.weight
features.8.bias
features.10.weight
features.10.bias
classifier.1.weight
classifier.1.bias
classifier.4.weight
classifier.4.bias
classifier.6.weight
classifier.6.bias
train_size = 12000
test_size = len(image_dataset) - train_size
from torch.utils.data import Subset
train_dataset = Subset(image_dataset, torch.arange(train_size))
test_dataset = Subset(image_dataset, torch.arange(train_size, len(image_dataset)))
print(len(train_dataset), len(test_dataset))
12000 2034
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)
dataloaders_dict = {
"train": train_dataloader,
# "test": test_dataloader # I think the code I'm using uses 'val' instead of 'test',
# so I'm renaming this to 'val' below
"val": test_dataloader
}
criterion = nn.CrossEntropyLoss()
model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs, is_inception=(model_name=="inception"))
Epoch 0/9
----------
train Loss: 0.7081 Acc: 0.7117
val Loss: 0.5314 Acc: 0.7940
Epoch 1/9
----------