I’m trying to train a CNN on CIFAR10 and the loss just stays around 2.3 and the accuracy only ever exceeds 10% by a few points. I simply cannot understand why it seems to not train at all.
required_training = True
import os
import time
from typing import Iterable
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
import torchvision
import matplotlib.pyplot as plt
if __name__ == '__main__':
class MyModel(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
my_model = MyModel()
print(my_model)
def get_mean_std_train_data(data_root):
train_transform = transforms.Compose([transforms.ToTensor()])
train_set = datasets.CIFAR10(root=data_root, train=True, download=False, transform=train_transform)
# return mean (numpy.ndarray) and std (numpy.ndarray)
mean = np.array([0.5, 0.5, 0.5])
std = np.array([0.5, 0.5, 0.5])
data = train_set.data
r = data[:,:,0]
g = data[:,:,1]
b = data[:,:,2]
mean = [np.mean(r), np.mean(g), np.mean(b)]
std = [np.std(r), np.std(g), np.std(b)]
return mean, std
def get_data(batch_size, data_root, num_workers=1):
try:
mean, std = get_mean_std_train_data(data_root)
assert len(mean) == len(std) == 3
except:
mean = np.array([0.5, 0.5, 0.5])
std = np.array([0.5, 0.5, 0.5])
train_test_transforms = transforms.Compose([
# this re-scales image tensor values between 0-1. image_tensor /= 255
transforms.ToTensor(),
# subtract mean and divide by variance.
transforms.Normalize(mean, std)
])
# train dataloader
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root=data_root, train=True, download=False, transform=train_test_transforms),
batch_size=batch_size,
shuffle=True,
num_workers=num_workers
)
# test dataloader
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root=data_root, train=False, download=False, transform=train_test_transforms),
batch_size=batch_size,
shuffle=False,
num_workers=num_workers
)
return train_loader, test_loader
# 4) System Configuration
@dataclass
class SystemConfiguration:
'''
Describes the common system setting needed for reproducible training
'''
seed: int = 42 # seed number to set the state of all random number generators
cudnn_benchmark_enabled: bool = True # enable CuDNN benchmark for the sake of performance
cudnn_deterministic: bool = True # make cudnn deterministic (reproducible training)
@dataclass
class TrainingConfiguration:
'''
Describes configuration of the training process
'''
batch_size: int = 128 # amount of data to pass through the network at each forward-backward iteration
epochs_count: int = 2 # number of times the whole dataset will be passed through the network
learning_rate: float = 0.001 # determines the speed of network's weights update
log_interval: int = 100 # how many batches to wait between logging training status
test_interval: int = 1 # how many epochs to wait before another test. Set to 1 to get val loss at each epoch
data_root: str = "../resource/lib/publicdata/images" # folder to save data
num_workers: int = 10 # number of concurrent processes using to prepare data
device: str = 'cuda' # device to use for training.
def setup_system(system_config: SystemConfiguration) -> None:
torch.manual_seed(system_config.seed)
if torch.cuda.is_available():
torch.backends.cudnn_benchmark_enabled = system_config.cudnn_benchmark_enabled
torch.backends.cudnn.deterministic = system_config.cudnn_deterministic
def train(
train_config: TrainingConfiguration, model: nn.Module, optimizer: torch.optim.Optimizer,
train_loader: torch.utils.data.DataLoader, epoch_idx: int
) -> None:
# change model in training mood
model.train()
# to get batch loss
batch_loss = np.array([])
# to get batch accuracy
batch_acc = np.array([])
for batch_idx, (data, target) in enumerate(train_loader):
# clone target
indx_target = target.clone()
# send data to device (its is medatory if GPU has to be used)
data = data.to(train_config.device)
# send target to device
target = target.to(train_config.device)
# reset parameters gradient to zero
optimizer.zero_grad(set_to_none=True)
# forward pass to the model
output = model(data)
# cross entropy loss
loss = F.cross_entropy(output, target)
# find gradients w.r.t training parameters
loss.backward()
# Update parameters using gardients
optimizer.step()
batch_loss = np.append(batch_loss, [loss.item()])
# Score to probability using softmax
prob = F.softmax(output, dim=1)
# get the index of the max probability
pred = prob.data.max(dim=1)[1]
# correct prediction
correct = pred.cpu().eq(indx_target).sum()
# accuracy
acc = float(correct) / float(len(data))
batch_acc = np.append(batch_acc, [acc])
if batch_idx % train_config.log_interval == 0 and batch_idx > 0:
print(
'Train Epoch: {} [{}/{}] Loss: {:.6f} Acc: {:.4f}'.format(
epoch_idx, batch_idx * len(data), len(train_loader.dataset), loss.item(), acc
)
)
epoch_loss = batch_loss.mean()
epoch_acc = batch_acc.mean()
return epoch_loss, epoch_acc
def validate(
train_config: TrainingConfiguration,
model: nn.Module,
test_loader: torch.utils.data.DataLoader,
) -> float:
model.eval()
torch.no_grad()
test_loss = 0
count_corect_predictions = 0
for data, target in test_loader:
indx_target = target.clone()
data = data.to(train_config.device)
target = target.to(train_config.device)
output = model(data)
# add loss for each mini batch
test_loss += F.cross_entropy(output, target).item()
# Score to probability using softmax
prob = F.softmax(output, dim=1)
# get the index of the max probability
pred = prob.data.max(dim=1)[1]
# add correct prediction count
count_corect_predictions += pred.cpu().eq(indx_target).sum()
# average over number of mini-batches
test_loss = test_loss / len(test_loader)
# average over number of dataset
accuracy = 100. * count_corect_predictions / len(test_loader.dataset)
print(
'\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, count_corect_predictions, len(test_loader.dataset), accuracy
)
)
return test_loss, accuracy/100.0
def save_model(model, device, model_dir='models', model_file_name='cifar10_cnn_model.pt'):
if not os.path.exists(model_dir):
os.makedirs(model_dir)
model_path = os.path.join(model_dir, model_file_name)
# make sure you transfer the model to cpu.
if device == 'cuda':
model.to('cpu')
# save the state_dict
torch.save(model.state_dict(), model_path)
if device == 'cuda':
model.to('cuda')
return
def main(system_configuration=SystemConfiguration(), training_configuration=TrainingConfiguration()):
# system configuration
setup_system(system_configuration)
# batch size
batch_size_to_set = training_configuration.batch_size
# num_workers
num_workers_to_set = training_configuration.num_workers
# epochs
epoch_num_to_set = training_configuration.epochs_count
# if GPU is available use training config,
# else lowers batch_size, num_workers and epochs count
if torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
num_workers_to_set = 2
# data loader
train_loader, test_loader = get_data(
batch_size=training_configuration.batch_size,
data_root=training_configuration.data_root,
num_workers=num_workers_to_set
)
# Update training configuration
training_configuration = TrainingConfiguration(
device=device,
num_workers=num_workers_to_set
)
# initiate model
model = MyModel()
# send model to device (GPU/CPU)
model.to(training_configuration.device)
# optimizer
optimizer = optim.SGD(
model.parameters(),
lr=training_configuration.learning_rate,
momentum = 0.9
)
best_loss = torch.tensor(np.inf)
best_accuracy = torch.tensor(0)
# epoch train/test loss
epoch_train_loss = np.array([])
epoch_test_loss = np.array([])
# epch train/test accuracy
epoch_train_acc = np.array([])
epoch_test_acc = np.array([])
# trainig time measurement
t_begin = time.time()
for epoch in range(training_configuration.epochs_count):
train_loss, train_acc = train(training_configuration, model, optimizer, train_loader, epoch)
epoch_train_loss = np.append(epoch_train_loss, [train_loss])
epoch_train_acc = np.append(epoch_train_acc, [train_acc])
elapsed_time = time.time() - t_begin
speed_epoch = elapsed_time / (epoch + 1)
speed_batch = speed_epoch / len(train_loader)
eta = speed_epoch * training_configuration.epochs_count - elapsed_time
print(
"Elapsed {:.2f}s, {:.2f} s/epoch, {:.2f} s/batch, ets {:.2f}s".format(
elapsed_time, speed_epoch, speed_batch, eta
)
)
if epoch % training_configuration.test_interval == 0:
current_loss, current_accuracy = validate(training_configuration, model, test_loader)
epoch_test_loss = np.append(epoch_test_loss, [current_loss])
epoch_test_acc = np.append(epoch_test_acc, [current_accuracy])
if current_loss < best_loss:
best_loss = current_loss
if current_accuracy > best_accuracy:
best_accuracy = current_accuracy
print('Accuracy improved, saving the model.\n')
save_model(model, device)
print("Total time: {:.2f}, Best Loss: {:.3f}, Best Accuracy: {:.3f}".format(time.time() - t_begin, best_loss,
best_accuracy))
return model, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc
if required_training:
model, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc = main()