# Neural Network barely trains at all

I’m trying to train a CNN on CIFAR-10, but the loss stays stuck around 2.3 (≈ ln 10, i.e. chance level for 10 classes) and the accuracy never climbs more than a few points above 10%. I can’t work out why it doesn’t seem to train at all.

```python
required_training = True

import os
import time

from typing import Iterable
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import datasets, transforms
import torchvision

import matplotlib.pyplot as plt

# NOTE(review): the stray `if __name__ == '__main__':` guard that preceded this
# class in the paste has been removed — defining the model under the guard
# would hide it from `main()` below, and as pasted (guard with no indented
# body) it is a syntax error. The runnable entry point is already guarded by
# `required_training` at the bottom of the file.
class MyModel(nn.Module):
    """LeNet-style CNN for 10-class classification of 3x32x32 images (CIFAR-10)."""

    def __init__(self):
        super().__init__()
        # 3x32x32 -> conv(5x5) -> 6x28x28 -> pool -> 6x14x14
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        # 6x14x14 -> conv(5x5) -> 16x10x10 -> pool -> 16x5x5
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Return raw logits: F.cross_entropy applies log-softmax internally.
        return self.fc3(x)


# Quick sanity check: build the network and print its architecture.
my_model = MyModel()
print(my_model)

def get_mean_std_train_data(data_root):
    """Compute per-channel mean and std of the CIFAR-10 training images.

    Statistics are computed on the [0, 1] scale so they can be passed
    directly to ``transforms.Normalize``, which runs *after*
    ``transforms.ToTensor`` (ToTensor already rescales pixels to [0, 1]).

    :param data_root: directory where CIFAR-10 is stored / downloaded to.
    :return: ``(mean, std)`` — two length-3 lists, one entry per RGB channel.
    """
    # BUG (fixed): the original referenced an undefined `train_set`, indexed
    # the channel axis incorrectly (`data[:, :, c]` instead of
    # `data[:, :, :, c]` for the (N, H, W, C) array), and computed the stats
    # on the raw 0-255 scale even though Normalize is applied to 0-1 tensors.
    train_set = datasets.CIFAR10(root=data_root, train=True, download=True)

    # train_set.data is a uint8 ndarray of shape (N, H, W, C); rescale to 0-1.
    data = train_set.data / 255.0

    # Reduce over samples, height and width; keep the channel axis.
    mean = data.mean(axis=(0, 1, 2)).tolist()
    std = data.std(axis=(0, 1, 2)).tolist()

    return mean, std

def get_data(batch_size, data_root, num_workers=1):
    """Build CIFAR-10 train/test DataLoaders with per-channel normalization.

    :param batch_size: samples per mini-batch.
    :param data_root: directory where CIFAR-10 is stored / downloaded to.
    :param num_workers: worker processes for each DataLoader.
    :return: ``(train_loader, test_loader)``.
    """
    try:
        mean, std = get_mean_std_train_data(data_root)
        # Explicit check instead of `assert`: asserts vanish under `python -O`.
        if len(mean) != 3 or len(std) != 3:
            raise ValueError("expected one mean/std value per RGB channel")
    except Exception:
        # Best-effort fallback (was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit): generic stats that still centre data.
        mean = np.array([0.5, 0.5, 0.5])
        std = np.array([0.5, 0.5, 0.5])

    train_test_transforms = transforms.Compose([
        # Rescales image tensor values to [0, 1] (image_tensor /= 255).
        transforms.ToTensor(),
        # Subtract the per-channel mean and divide by the per-channel std.
        transforms.Normalize(mean, std),
    ])

    # NOTE(review): the DataLoader construction lines were truncated in the
    # paste; reconstructed here in the standard torchvision pattern.
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root=data_root, train=True, download=True,
                         transform=train_test_transforms),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )

    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10(root=data_root, train=False, download=True,
                         transform=train_test_transforms),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    return train_loader, test_loader

# 4) System Configuration
@dataclass
class SystemConfiguration:
    """Common system settings needed for reproducible training."""

    # Seed used to set the state of all random number generators.
    seed: int = 42
    # Enable the CuDNN auto-tuner for the sake of performance.
    cudnn_benchmark_enabled: bool = True
    # Make CuDNN deterministic (reproducible training).
    cudnn_deterministic: bool = True

@dataclass
class TrainingConfiguration:
    """Configuration of the training process."""

    # Amount of data passed through the network per forward-backward step.
    batch_size: int = 128
    # Number of passes over the whole dataset.
    epochs_count: int = 2
    # Step size for the network's weight updates.
    learning_rate: float = 0.001
    # How many batches to wait between logging training status.
    log_interval: int = 100
    # Epochs between evaluations; 1 gives a validation loss every epoch.
    test_interval: int = 1
    # Folder in which the dataset is stored.
    data_root: str = "../resource/lib/publicdata/images"
    # Number of concurrent processes preparing data.
    num_workers: int = 10
    # Device used for training.
    device: str = 'cuda'

def setup_system(system_config: "SystemConfiguration") -> None:
    """Seed all RNGs and apply the CuDNN flags for reproducible training.

    :param system_config: system-level settings (seed and CuDNN flags).
    """
    torch.manual_seed(system_config.seed)
    if torch.cuda.is_available():
        # BUG (fixed): `torch.backends.cudnn_benchmark_enabled = ...` merely
        # created a brand-new, unused attribute on `torch.backends`; the real
        # flag is `torch.backends.cudnn.benchmark`.
        torch.backends.cudnn.benchmark = system_config.cudnn_benchmark_enabled
        torch.backends.cudnn.deterministic = system_config.cudnn_deterministic
        # NOTE(review): benchmark=True and deterministic=True pull in opposite
        # directions; consider disabling benchmark when exact reproducibility
        # matters.

def train(
    train_config: "TrainingConfiguration",
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    train_loader,
    epoch_idx: int,
):
    """Run one training epoch.

    NOTE(review): the paste's signature was missing `train_loader` and
    `epoch_idx` even though `main()` passes five arguments; restored here.

    :param train_config: training settings (device, log_interval, ...).
    :param model: the network to train (modified in place).
    :param optimizer: optimizer holding the model's parameters.
    :param train_loader: iterable of (data, target) mini-batches.
    :param epoch_idx: current epoch number, used only for logging.
    :return: ``(epoch_loss, epoch_acc)`` — means over all batches.
    """
    # Put the model in training mode (enables dropout/batch-norm updates).
    model.train()

    # Per-batch metrics (plain lists: repeated np.append is O(n^2)).
    batch_losses = []
    batch_accs = []

    for batch_idx, (data, target) in enumerate(train_loader):

        # Keep a CPU copy of the labels for the accuracy computation below.
        indx_target = target.clone()
        # Send the batch to the device (mandatory if a GPU is to be used).
        data = data.to(train_config.device)
        target = target.to(train_config.device)

        # BUG (fixed): the original never called zero_grad(), so gradients
        # accumulated across every batch and the network never trained —
        # this is exactly why the loss sits at ~2.3 (= ln 10, random
        # guessing over 10 classes).
        optimizer.zero_grad()

        # Forward pass.
        output = model(data)

        # Cross-entropy loss (expects raw logits).
        loss = F.cross_entropy(output, target)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

        # BUG (fixed): `tensor.max(dim=1)` returns a (values, indices) tuple;
        # the predicted class is the indices element. argmax on the raw
        # logits is equivalent (softmax is monotonic) and simpler.
        pred = output.argmax(dim=1)

        correct = pred.cpu().eq(indx_target).sum()
        acc = float(correct) / float(len(data))
        batch_accs.append(acc)

        if batch_idx % train_config.log_interval == 0 and batch_idx > 0:
            print(
                'Train Epoch: {} [{}/{}] Loss: {:.6f} Acc: {:.4f}'.format(
                    epoch_idx, batch_idx * len(data), len(train_loader.dataset), loss.item(), acc
                )
            )

    epoch_loss = float(np.mean(batch_losses))
    epoch_acc = float(np.mean(batch_accs))
    return epoch_loss, epoch_acc

def validate(
    train_config: "TrainingConfiguration",
    model: nn.Module,
    test_loader,
):
    """Evaluate the model on the test set.

    NOTE(review): the loop over `test_loader`, the prediction indexing and
    the print-format arguments were truncated in the paste; reconstructed
    here in the standard evaluation pattern. The signature also gains the
    `test_loader` parameter that `main()` already passes.

    :param train_config: training settings (only `device` is used here).
    :param model: the network to evaluate.
    :param test_loader: iterable of (data, target) mini-batches with a
        `.dataset` attribute.
    :return: ``(test_loss, accuracy)`` with accuracy as a fraction in [0, 1].
    """
    model.eval()
    test_loss = 0.0
    count_correct_predictions = 0

    # No gradients are needed for evaluation; saves memory and time.
    with torch.no_grad():
        for data, target in test_loader:
            indx_target = target.clone()
            data = data.to(train_config.device)
            target = target.to(train_config.device)

            output = model(data)
            # Sum of per-mini-batch mean losses; averaged over batches below.
            test_loss += F.cross_entropy(output, target).item()

            # Predicted class = index of the max logit (softmax is monotonic,
            # so applying it first would not change the argmax).
            pred = output.argmax(dim=1)
            count_correct_predictions += pred.cpu().eq(indx_target).sum().item()

    # Average over the number of mini-batches.
    test_loss /= len(test_loader)

    # Average over the size of the dataset.
    accuracy = 100. * count_correct_predictions / len(test_loader.dataset)

    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, count_correct_predictions, len(test_loader.dataset), accuracy
        )
    )
    return test_loss, accuracy / 100.0

def save_model(model, device, model_dir='models', model_file_name='cifar10_cnn_model.pt'):
    """Save the model's ``state_dict`` to ``model_dir/model_file_name``.

    The model is temporarily moved to the CPU so the checkpoint can be loaded
    on machines without a GPU, then moved back afterwards.

    :param model: network whose parameters are saved.
    :param device: device the model currently lives on ('cuda' or 'cpu').
    :param model_dir: directory for checkpoints (created if missing).
    :param model_file_name: checkpoint filename.
    """
    # Idiom: exist_ok=True replaces the exists()/makedirs pair (race-free).
    os.makedirs(model_dir, exist_ok=True)

    model_path = os.path.join(model_dir, model_file_name)

    # Make sure the parameters are on the CPU before serializing.
    if device == 'cuda':
        model.to('cpu')

    # Save only the state_dict, not the whole module object.
    torch.save(model.state_dict(), model_path)

    if device == 'cuda':
        model.to('cuda')

def main(system_configuration=SystemConfiguration(), training_configuration=TrainingConfiguration()):
    """Full training pipeline: data, model, optimizer, train/validate loop.

    The default-argument config instances are evaluated once at definition
    time but are only read, never mutated, so sharing them is safe.

    :return: ``(model, epoch_train_loss, epoch_train_acc,
        epoch_test_loss, epoch_test_acc)``.
    """
    # Apply seeds and CuDNN flags for reproducibility.
    setup_system(system_configuration)

    batch_size_to_set = training_configuration.batch_size
    num_workers_to_set = training_configuration.num_workers

    # If a GPU is available use the training config as-is;
    # otherwise lower the worker count for the CPU path.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
        num_workers_to_set = 2

    # NOTE(review): the opening line of this call was truncated in the paste;
    # reconstructed assuming get_data returns (train_loader, test_loader).
    train_loader, test_loader = get_data(
        batch_size=batch_size_to_set,
        data_root=training_configuration.data_root,
        num_workers=num_workers_to_set,
    )

    # Rebuild the configuration with the device/workers chosen above
    # (all other fields fall back to their declared defaults).
    training_configuration = TrainingConfiguration(
        device=device,
        num_workers=num_workers_to_set,
    )

    # Initialize the model and send it to the device (GPU/CPU).
    model = MyModel()
    model.to(training_configuration.device)

    optimizer = optim.SGD(
        model.parameters(),
        lr=training_configuration.learning_rate,
        momentum=0.9,
    )

    best_loss = torch.tensor(np.inf)
    best_accuracy = torch.tensor(0)

    # Per-epoch train/test loss and accuracy histories.
    epoch_train_loss = np.array([])
    epoch_test_loss = np.array([])
    epoch_train_acc = np.array([])
    epoch_test_acc = np.array([])

    # Training-time measurement.
    t_begin = time.time()
    for epoch in range(training_configuration.epochs_count):

        train_loss, train_acc = train(training_configuration, model, optimizer, train_loader, epoch)

        epoch_train_loss = np.append(epoch_train_loss, [train_loss])
        epoch_train_acc = np.append(epoch_train_acc, [train_acc])

        elapsed_time = time.time() - t_begin
        speed_epoch = elapsed_time / (epoch + 1)
        # BUG (fixed): `speed_batch` was used in the log line below but never
        # defined in the paste.
        speed_batch = speed_epoch / len(train_loader)
        eta = speed_epoch * training_configuration.epochs_count - elapsed_time

        print(
            "Elapsed {:.2f}s, {:.2f} s/epoch, {:.2f} s/batch, eta {:.2f}s".format(
                elapsed_time, speed_epoch, speed_batch, eta
            )
        )

        if epoch % training_configuration.test_interval == 0:
            current_loss, current_accuracy = validate(training_configuration, model, test_loader)

            epoch_test_loss = np.append(epoch_test_loss, [current_loss])
            epoch_test_acc = np.append(epoch_test_acc, [current_accuracy])

            if current_loss < best_loss:
                best_loss = current_loss

            if current_accuracy > best_accuracy:
                best_accuracy = current_accuracy
                print('Accuracy improved, saving the model.\n')
                save_model(model, device)

    print("Total time: {:.2f}, Best Loss: {:.3f}, Best Accuracy: {:.3f}".format(time.time() - t_begin, best_loss,
                                                                                best_accuracy))

    return model, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc

if required_training:
model, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc = main()``````