Neural Network barely trains at all

Duncan_Wood · July 10, 2021, 7:51pm

I’m trying to train a CNN on CIFAR10 and the loss just stays around 2.3 and the accuracy only ever exceeds 10% by a few points. I simply cannot understand why it seems to not train at all.

required_training = True

import os
import time

from typing import Iterable
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import datasets, transforms
import torchvision

import matplotlib.pyplot as plt

if __name__ == '__main__':
    class MyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = torch.flatten(x, 1) # flatten all dimensions except batch
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    my_model = MyModel()
    print(my_model)

    def get_mean_std_train_data(data_root):
    
        train_transform = transforms.Compose([transforms.ToTensor()])
        train_set = datasets.CIFAR10(root=data_root, train=True, download=False, transform=train_transform)
    
        # return mean (numpy.ndarray) and std (numpy.ndarray)
        mean = np.array([0.5, 0.5, 0.5])
        std = np.array([0.5, 0.5, 0.5])
    
        data = train_set.data

        r = data[:,:,0]
        g = data[:,:,1]
        b = data[:,:,2]

        mean = [np.mean(r), np.mean(g), np.mean(b)]
        std = [np.std(r), np.std(g), np.std(b)]
    
        return mean, std


    def get_data(batch_size, data_root, num_workers=1):
    
    
        try:
            mean, std = get_mean_std_train_data(data_root)
            assert len(mean) == len(std) == 3
        except:
            mean = np.array([0.5, 0.5, 0.5])
            std = np.array([0.5, 0.5, 0.5])
        
    
        train_test_transforms = transforms.Compose([
            # this re-scales image tensor values between 0-1. image_tensor /= 255
            transforms.ToTensor(),
            # subtract mean and divide by variance.
            transforms.Normalize(mean, std)
        ])
    
        # train dataloader
        train_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=data_root, train=True, download=False, transform=train_test_transforms),
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers
        )
    
        # test dataloader
        test_loader = torch.utils.data.DataLoader(
            datasets.CIFAR10(root=data_root, train=False, download=False, transform=train_test_transforms),
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers
        )
        return train_loader, test_loader


    # 4) System Configuration
    @dataclass
    class SystemConfiguration:
        '''
        Describes the common system setting needed for reproducible training
        '''
        seed: int = 42  # seed number to set the state of all random number generators
        cudnn_benchmark_enabled: bool = True  # enable CuDNN benchmark for the sake of performance
        cudnn_deterministic: bool = True  # make cudnn deterministic (reproducible training)


    @dataclass
    class TrainingConfiguration:
        '''
        Describes configuration of the training process
        '''
        batch_size: int = 128 # amount of data to pass through the network at each forward-backward iteration
        epochs_count: int = 2  # number of times the whole dataset will be passed through the network
        learning_rate: float = 0.001  # determines the speed of network's weights update
        
        log_interval: int = 100  # how many batches to wait between logging training status
        test_interval: int = 1 # how many epochs to wait before another test. Set to 1 to get val loss at each epoch
        data_root: str = "../resource/lib/publicdata/images"  # folder to save data
        num_workers: int = 10  # number of concurrent processes using to prepare data
        device: str = 'cuda'  # device to use for training.

    def setup_system(system_config: SystemConfiguration) -> None:
        torch.manual_seed(system_config.seed)
        if torch.cuda.is_available():
            torch.backends.cudnn_benchmark_enabled = system_config.cudnn_benchmark_enabled
            torch.backends.cudnn.deterministic = system_config.cudnn_deterministic

    def train(
        train_config: TrainingConfiguration, model: nn.Module, optimizer: torch.optim.Optimizer,
        train_loader: torch.utils.data.DataLoader, epoch_idx: int
    ) -> None:
    
        # change model in training mood
        model.train()
    
        # to get batch loss
        batch_loss = np.array([])
    
        # to get batch accuracy
        batch_acc = np.array([])
        
        for batch_idx, (data, target) in enumerate(train_loader):
        
            # clone target
            indx_target = target.clone()
            # send data to device (its is medatory if GPU has to be used)
            data = data.to(train_config.device)
            # send target to device
            target = target.to(train_config.device)

            # reset parameters gradient to zero
            optimizer.zero_grad(set_to_none=True)
        
            # forward pass to the model
            output = model(data)
        
            # cross entropy loss
            loss = F.cross_entropy(output, target)
        
            # find gradients w.r.t training parameters
            loss.backward()
            # Update parameters using gardients
            optimizer.step()
        
            batch_loss = np.append(batch_loss, [loss.item()])
        
            # Score to probability using softmax
            prob = F.softmax(output, dim=1)
            
            # get the index of the max probability
            pred = prob.data.max(dim=1)[1]  
                        
            # correct prediction
            correct = pred.cpu().eq(indx_target).sum()
            
            # accuracy
            acc = float(correct) / float(len(data))
        
            batch_acc = np.append(batch_acc, [acc])

            if batch_idx % train_config.log_interval == 0 and batch_idx > 0:              
                print(
                    'Train Epoch: {} [{}/{}] Loss: {:.6f} Acc: {:.4f}'.format(
                        epoch_idx, batch_idx * len(data), len(train_loader.dataset), loss.item(), acc
                    )
                )
            
        epoch_loss = batch_loss.mean()
        epoch_acc = batch_acc.mean()
        return epoch_loss, epoch_acc


    def validate(
        train_config: TrainingConfiguration,
        model: nn.Module,
        test_loader: torch.utils.data.DataLoader,
    ) -> float:
        model.eval()
        torch.no_grad()
        test_loss = 0
        count_corect_predictions = 0
        for data, target in test_loader:
            indx_target = target.clone()
            data = data.to(train_config.device)
        
            target = target.to(train_config.device)
        
            output = model(data)
            # add loss for each mini batch
            test_loss += F.cross_entropy(output, target).item()
        
            # Score to probability using softmax
            prob = F.softmax(output, dim=1)
        
            # get the index of the max probability
            pred = prob.data.max(dim=1)[1] 
        
            # add correct prediction count
            count_corect_predictions += pred.cpu().eq(indx_target).sum()

        # average over number of mini-batches
        test_loss = test_loss / len(test_loader)  
    
        # average over number of dataset
        accuracy = 100. * count_corect_predictions / len(test_loader.dataset)
    
        print(
            '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                test_loss, count_corect_predictions, len(test_loader.dataset), accuracy
            )
        )
        return test_loss, accuracy/100.0


    def save_model(model, device, model_dir='models', model_file_name='cifar10_cnn_model.pt'):

        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        model_path = os.path.join(model_dir, model_file_name)

        # make sure you transfer the model to cpu.
        if device == 'cuda':
            model.to('cpu')

        # save the state_dict
        torch.save(model.state_dict(), model_path)
    
        if device == 'cuda':
            model.to('cuda')
    
        return


    def main(system_configuration=SystemConfiguration(), training_configuration=TrainingConfiguration()):
    
        # system configuration
        setup_system(system_configuration)

        # batch size
        batch_size_to_set = training_configuration.batch_size
        # num_workers
        num_workers_to_set = training_configuration.num_workers
        # epochs
        epoch_num_to_set = training_configuration.epochs_count

        # if GPU is available use training config, 
        # else lowers batch_size, num_workers and epochs count
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
            num_workers_to_set = 2

        # data loader
        train_loader, test_loader = get_data(
            batch_size=training_configuration.batch_size,
            data_root=training_configuration.data_root,
            num_workers=num_workers_to_set
        )
    
        # Update training configuration
        training_configuration = TrainingConfiguration(
            device=device,
            num_workers=num_workers_to_set
        )

        # initiate model
        model = MyModel()
        
        # send model to device (GPU/CPU)
        model.to(training_configuration.device)

        # optimizer
        optimizer = optim.SGD(
            model.parameters(),
            lr=training_configuration.learning_rate,
            momentum = 0.9
        )

        best_loss = torch.tensor(np.inf)
        best_accuracy = torch.tensor(0)
    
        # epoch train/test loss
        epoch_train_loss = np.array([])
        epoch_test_loss = np.array([])
    
        # epch train/test accuracy
        epoch_train_acc = np.array([])
        epoch_test_acc = np.array([])
    
        # trainig time measurement
        t_begin = time.time()
        for epoch in range(training_configuration.epochs_count):
        
            train_loss, train_acc = train(training_configuration, model, optimizer, train_loader, epoch)
        
            epoch_train_loss = np.append(epoch_train_loss, [train_loss])
        
            epoch_train_acc = np.append(epoch_train_acc, [train_acc])

            elapsed_time = time.time() - t_begin
            speed_epoch = elapsed_time / (epoch + 1)
            speed_batch = speed_epoch / len(train_loader)
            eta = speed_epoch * training_configuration.epochs_count - elapsed_time
        
            print(
                "Elapsed {:.2f}s, {:.2f} s/epoch, {:.2f} s/batch, ets {:.2f}s".format(
                    elapsed_time, speed_epoch, speed_batch, eta
                )
            )

            if epoch % training_configuration.test_interval == 0:
                current_loss, current_accuracy = validate(training_configuration, model, test_loader)
            
                epoch_test_loss = np.append(epoch_test_loss, [current_loss])
        
                epoch_test_acc = np.append(epoch_test_acc, [current_accuracy])
            
                if current_loss < best_loss:
                    best_loss = current_loss
            
                if current_accuracy > best_accuracy:
                    best_accuracy = current_accuracy
                    print('Accuracy improved, saving the model.\n')
                    save_model(model, device)

            
        print("Total time: {:.2f}, Best Loss: {:.3f}, Best Accuracy: {:.3f}".format(time.time() - t_begin, best_loss, 
                                                                                best_accuracy))
    
        return model, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc


    if required_training:
        model, epoch_train_loss, epoch_train_acc, epoch_test_loss, epoch_test_acc = main()