Greetings,
I started using a GPU for the first time. As a test, I train a small CNN on CIFAR10 both on the CPU and on the GPU. The speedup is only 4 to 6 times, depending on the hardware compared. I expected much more, since my CPU (Intel i7-8650U) is not that great, whereas the GPU is a Tesla V100-SXM2-16GB.
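For the comparison I time the whole training run with time.time() (see the train routine below). A minimal sketch of how the timing could be made strict on the GPU side (the timed() helper is hypothetical, not part of my code below); the key point is that CUDA kernels launch asynchronously, so torch.cuda.synchronize() should run before the clock is read:

import time
import torch

def timed(fn, dev="cuda:0"):
    # Flush any pending GPU work before starting the clock.
    if dev.startswith("cuda"):
        torch.cuda.synchronize()
    start = time.time()
    result = fn()
    # Wait for outstanding kernels before stopping the clock,
    # otherwise the measurement misses work still queued on the GPU.
    if dev.startswith("cuda"):
        torch.cuda.synchronize()
    return result, time.time() - start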
I thought that maybe there is something wrong with the way I do things, so I would like to have my code checked. As a summary, these are the parts I changed in my CPU code to make it use the GPU (the sketch after this list shows the pattern in a nutshell):
1. Model: My model receives a device variable (assumed to hold the GPU), since its test() method sends (features, targets) to the device batch by batch.
2. Train routine: My training routine receives the device variable in order to send (features, targets) to the GPU batch by batch.
3. Main: Check whether CUDA is available, set the device to the GPU, initialize the model and pass it the device, send the model to the device, and call the train routine, again passing the device.
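A minimal sketch of that pattern (pin_memory/non_blocking are optional extras I have read about for overlapping host-to-device copies; my actual code further down does not use them):

dev = "cuda:0" if torch.cuda.is_available() else "cpu"    # step 3: pick device
model = ConvNet(dev).to(dev)                              # send model to DEVICE
loader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True,
                                     pin_memory=True)     # pinned host memory
for features, targets in loader:
    # steps 1/2: move every batch to the device; non_blocking=True only
    # overlaps the copy when the source tensors sit in pinned memory
    features = features.to(dev, non_blocking=True)
    targets = targets.to(dev, non_blocking=True)
    ...                                                   # forward/backward as usual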
Best,
PIF
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
### MODEL ###
class ConvNet(nn.Module):
    def __init__(self, dev="cpu"):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=(1, 1))
        self.max1 = nn.MaxPool2d(kernel_size=2, stride=(2, 2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
        self.max2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.fc1 = nn.Linear(2304, 500)
        self.fc2 = nn.Linear(500, 10)
        self.dev = dev  # device that test() moves the batches to

    def forward(self, x):
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.max1(x)
        x = self.conv2(x)
        x = torch.relu(x)
        x = self.max2(x)
        x = x.view(x.size(0), -1)  # flatten: 64 * 6 * 6 = 2304 features
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

    def test(self, data_loader, criterion):
        ## Returns loss and accuracy summed over all batches of data_loader.
        self.eval()
        loss = 0.0
        accu = 0.0
        with torch.no_grad():
            for (features, targets) in data_loader:
                features, targets = features.to(self.dev), targets.to(self.dev)  # SEND TO DEVICE
                output = self(features)
                ## Add the (batch-mean) loss of every batch; .item() turns the
                ## scalar into a plain Python float instead of a GPU tensor.
                loss += criterion(output, targets).item()
                _, predicted = torch.max(output, 1)
                accu += (predicted == targets).float().mean().item()
        ## Return summed batch losses and accuracies.
        return (loss, accu)
### TRAIN FUNCTION ###
def train_model_classi(model, optimizer, criterion, epochs, trainloader,
                       scheduler=None, testloader=None, eval_freq=0, dev="cpu"):
    # Training routine for an N-class classification problem.
    loss_train = np.zeros(epochs + 1)
    accu_train = np.zeros(epochs + 1)
    loss_test = 0  # dummies in case no testloader was passed
    accu_test = 0
    # Initial train loss and accuracy values.
    (loss_train[0], accu_train[0]) = model.test(trainloader, criterion)
    # Initial test loss and accuracy values, if necessary.
    testmode = False
    if testloader is not None:
        assert isinstance(eval_freq, int) and eval_freq > 0, 'eval_freq must be a positive integer!'
        testmode = True
        loss_test = np.zeros(epochs // eval_freq + 1)
        accu_test = np.zeros(epochs // eval_freq + 1)
        (loss_test[0], accu_test[0]) = model.test(testloader, criterion)
        t = 1  # index count for loss_test and accu_test (see below)
    # Training.
    print("starting training...")
    start_time = time.time()
    for epoch in np.arange(1, epochs + 1):
        print("Epoch ", epoch)
        model.train()  # undo the eval() mode that model.test() switches on
        for (batchidx, (features, targets)) in enumerate(trainloader):
            features, targets = features.to(dev), targets.to(dev)  # SEND TO DEVICE
            output = model(features)
            optimizer.zero_grad()
            loss = criterion(output, targets)
            loss_train[epoch] += loss.item()
            _, predicted = torch.max(output, 1)
            accu_train[epoch] += (predicted == targets).float().mean().item()
            loss.backward()
            optimizer.step()
        if scheduler is not None:
            scheduler.step()  # once per epoch, matching StepLR's step_size
        if testmode and epoch % eval_freq == 0:  # obtain test values
            (loss_test[t], accu_test[t]) = model.test(testloader, criterion)
            t += 1
    end_time = time.time()
    print("Training took {} seconds, i.e. {} minutes, with {} seconds per epoch!"
          .format(end_time - start_time, (end_time - start_time) / 60, (end_time - start_time) / epochs))
    # Normalize by the number of batches; guard the test values, since
    # len(testloader) would fail when no testloader was passed.
    if testmode:
        loss_test, accu_test = loss_test / len(testloader), accu_test / len(testloader)
    return (loss_train / len(trainloader), accu_train / len(trainloader),
            loss_test, accu_test)
##### MAIN #####
#%% Parameters
eval_freq = 2
sched_step_size = 5
epochs = 10
dev = "cuda:0" if torch.cuda.is_available() else "cpu" # use GPU if possible
# SGD parameters
bs = 128
eta = 0.01
mom = 0.9
#%% Load CIFAR10
transfos = transforms.Compose([transforms.ToTensor(),
                               transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                    std=[0.229, 0.224, 0.225])])
train = datasets.CIFAR10(".", train=True, download=True, transform=transfos)
test = datasets.CIFAR10(".", train=False, download=True, transform=transfos)
batches_train = torch.utils.data.DataLoader(train, batch_size=bs, shuffle=True, num_workers=1, drop_last=True)
batches_test = torch.utils.data.DataLoader(test, batch_size=bs, shuffle=False, num_workers=1, drop_last=True)  # no need to shuffle for evaluation
#%% Model Training
testmodel = ConvNet(dev)  # ConvNet.test() needs the device
testmodel.to(dev)  # send model to DEVICE
criterion = F.nll_loss  # negative log-likelihood, pairs with the model's log_softmax output
optimizer = optim.SGD(testmodel.parameters(), lr=eta, weight_decay=0.0005, momentum=mom)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=sched_step_size, gamma=0.1, last_epoch=-1, verbose=False)
(loss_train, accu_train, loss_test, accu_test) = train_model_classi(
    testmodel, optimizer, criterion, epochs, trainloader=batches_train,
    scheduler=scheduler, testloader=batches_test, eval_freq=eval_freq, dev=dev)
## PLOT STUFF
######