Training an ELU network with PyTorch Ignite

Hello, I am trying to train this network with PyTorch Ignite, but the loss doesn't move a single point after more than 20 epochs…

I am working with deep neural networks and I've been trying to reproduce the ELU network from the publication:

class CIFARClassifierELUPaper(nn.Module):
    def __init__(self, num_classes=100):
        super(CIFARClassifierELUPaper, self).__init__()
        self.main = nn.Sequential(
            # Block 1
            nn.Conv2d(3, 384, 3, padding=1), # 32
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 16
            # Block 2
            nn.Conv2d(384, 384, 1, padding=0), # 16
            nn.ELU(),
            nn.Conv2d(384, 384, 2, padding=1), # 17
            nn.ELU(),
            nn.Conv2d(384, 640, 2, padding=1), # 18
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 9
            nn.Dropout2d(0.1),
            # Block 3
            nn.Conv2d(640, 640, 1, padding=0), # 9
            nn.ELU(),
            nn.Conv2d(640, 768, 2, padding=1), # 10
            nn.ELU(),
            nn.Conv2d(768, 768, 2, padding=1), # 11
            nn.ELU(),
            nn.Conv2d(768, 768, 2, padding=1), # 12
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 6
            nn.Dropout2d(0.2),
            # Block 4
            nn.Conv2d(768, 768, 1, padding=0), # 6
            nn.ELU(),
            nn.Conv2d(768, 896, 2, padding=1), # 7
            nn.ELU(),
            nn.Conv2d(896, 896, 2, padding=1), # 8
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 4
            nn.Dropout2d(0.3),
            # Block 5
            nn.Conv2d(896, 896, 1, padding=0), # 4
            nn.ELU(),
            nn.Conv2d(896, 1024, 2, padding=1), # 5
            nn.ELU(),
            nn.Conv2d(1024, 1024, 2, padding=1), # 6
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 3
            nn.Dropout2d(0.4),
            # Block 6
            nn.Conv2d(1024, 1024, 1, padding=0), # 3
            nn.ELU(),
            nn.Conv2d(1024, 1152, 2, padding=0), # 2
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 1
            nn.Dropout2d(0.5),
            # Block 7
            nn.Conv2d(1152, 1152, 1, padding=0), #  1
            nn.ELU(),
            nn.Dropout2d(0.0),
            # Block 8
            nn.Conv2d(1152, num_classes, 1, padding=0), # 1
        )
 
    def forward(self, x):
        return self.main(x).view(x.size(0),-1)

model=CIFARClassifierELUPaper()
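
As a quick sanity check (just a sketch, assuming the 32x32 CIFAR input that the size comments track), the feature map does collapse to 1x1 before the final 1x1 convolution, so the flattened output is (batch, num_classes):

import torch  # not shown in the snippet above

x = torch.randn(2, 3, 32, 32)  # dummy CIFAR-sized batch
print(model(x).shape)          # expected: torch.Size([2, 100])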

I am trying to train it on CIFAR-100, and I am training with:

SGD
learning rate 0.1
ZCA whitening and all the other preprocessing steps (random horizontal flip and random crops)

but both the eval and train losses never increase or decrease (I tested 20 of the 80 epochs)…

These are the metrics:
The metrics on training are {'Loss': 4.6055209875106815, 'Accuracy': 0.0082, 'Precision': 8.2e-05, 'Recall': 0.01, 'Top-5 Accuracy': 0.0474}
The metrics on testing are {'Loss': 4.6053833532333375, 'Accuracy': 0.01, 'Precision': 0.0001, 'Recall': 0.01, 'Top-5 Accuracy': 0.05}

Should I wait longer, or is there something wrong with my network? I trained this before and got 64%, but in that run the only difference was that I added batch normalization… I want to train it from the beginning and try to get the 73%… Also, the publication suggests that BN is not necessary, so it shouldn't be a problem, right?

Hi @Tanya_Boone

could you please share the complete training script so that it can be run locally or in Colab?

Input data normalization and a high learning rate may be responsible for that…

Sure,

from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import os
import numpy as np
import torch.optim as optim
import random
import torch
import torch.nn as nn
from torch.autograd import Variable
from collections import OrderedDict
import math
from contextlib import redirect_stdout
from itertools import chain
import torch.nn.functional as F
from torch.optim.lr_scheduler import ExponentialLR,MultiStepLR
from torchvision.datasets.cifar import CIFAR100, CIFAR10
from torchvision.transforms import Compose, RandomCrop, Pad, RandomHorizontalFlip, Resize, RandomAffine,LinearTransformation
from torchvision.transforms import ToTensor, Normalize
from torch.utils.data import Subset
from PIL.Image import BICUBIC
from torch.utils.data import DataLoader
import matplotlib.pylab as plt
from apex import amp
from ignite.utils import convert_tensor
from ignite.engine import Engine, Events, create_supervised_evaluator
from ignite.metrics import RunningAverage, Accuracy, Precision, Recall, Loss, TopKCategoricalAccuracy
#from ignite.contrib.handlers import TensorboardLogger
#from ignite.contrib.handlers.tensorboard_logger import OutputHandler, OptimizerParamsHandler
from datetime import datetime
from ignite.contrib.handlers import CustomPeriodicEvent
import logging
from ignite.handlers import ModelCheckpoint, EarlyStopping, TerminateOnNan
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver
from ignite.contrib.handlers import ProgressBar
import dill
import pickle 
from ignite.engine import create_supervised_trainer
assert torch.cuda.is_available()
assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
torch.backends.cudnn.benchmark = True

device = "cuda"

path = "."
image_size = 32

train_transform = Compose([
    RandomCrop(32),
    RandomHorizontalFlip(),
    Pad(4),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    #LinearTransformation(transformation_matrix, mean_vector), 
 ])


test_transform = Compose([ 
    Pad(4),   
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])

train_dataset = CIFAR100(root=path, train=True, transform=train_transform, download=True)
test_dataset = CIFAR100(root=path, train=False, transform=test_transform, download=True)

train_eval_indices = [random.randint(0, len(train_dataset) - 1) for i in range(len(test_dataset))]
train_eval_dataset = Subset(train_dataset, train_eval_indices)


batch_size = 100
num_workers = 10

train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, 
                          shuffle=True, drop_last=True, pin_memory=True)

test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, 
                         shuffle=False, drop_last=False, pin_memory=True)

eval_train_loader = DataLoader(train_eval_dataset, batch_size=batch_size, num_workers=num_workers, 
                               shuffle=False, drop_last=False, pin_memory=True)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(17)

class CIFARClassifierELUPaper(nn.Module):

    def __init__(self, num_classes=100):
        super(CIFARClassifierELUPaper, self).__init__()
        
        self.main = nn.Sequential(
            # Block 1
            nn.Conv2d(3, 384, 3, padding=1), # 32
            nn.ELU(),
            #nn.BatchNorm2d(384,eps=1e-4,momentum=0.1),
            nn.MaxPool2d(2, 2), # 16
            # Block 2
            nn.Conv2d(384, 384, 1, padding=0), # 16
            nn.ELU(),
           # nn.BatchNorm2d(384,eps=1e-4,momentum=0.1),
            nn.Conv2d(384, 384, 2, padding=1), # 17
            nn.ELU(),
            #nn.BatchNorm2d(384,eps=1e-4,momentum=0.1),
            nn.Conv2d(384, 640, 2, padding=1), # 18
            nn.ELU(),
            #nn.BatchNorm2d(640,eps=1e-4,momentum=0.1),
            nn.MaxPool2d(2, 2), # 9
            nn.Dropout2d(0.1),
            # Block 3
            nn.Conv2d(640, 640, 1, padding=0), # 9
            nn.ELU(),
           # nn.BatchNorm2d(640,eps=1e-4,momentum=0.1),
            nn.Conv2d(640, 768, 2, padding=1), # 10
            nn.ELU(),
            #nn.BatchNorm2d(768,eps=1e-4,momentum=0.1),
            nn.Conv2d(768, 768, 2, padding=1), # 11
            nn.ELU(),
            #nn.BatchNorm2d(768,eps=1e-4,momentum=0.1),
            nn.Conv2d(768, 768, 2, padding=1), # 12
            nn.ELU(),
            #nn.BatchNorm2d(768,eps=1e-4,momentum=0.1),
            nn.MaxPool2d(2, 2), # 6
            nn.Dropout2d(0.2),
            # Block 4
            nn.Conv2d(768, 768, 1, padding=0), # 6
            nn.ELU(),
            #nn.BatchNorm2d(768,eps=1e-4,momentum=0.1),
            nn.Conv2d(768, 896, 2, padding=1), # 7
            nn.ELU(),
            #nn.BatchNorm2d(896,eps=1e-4,momentum=0.1),
            nn.Conv2d(896, 896, 2, padding=1), # 8
            nn.ELU(),
            nn.MaxPool2d(2, 2), # 4
            nn.Dropout2d(0.3),
            # Block 5
            nn.Conv2d(896, 896, 1, padding=0), # 4
            nn.ELU(),
           # nn.BatchNorm2d(896,eps=1e-4,momentum=0.1),
            nn.Conv2d(896, 1024, 2, padding=1), # 5
            nn.ELU(),
            #nn.BatchNorm2d(1024,eps=1e-4,momentum=0.1),
            nn.Conv2d(1024, 1024, 2, padding=1), # 6
            nn.ELU(),
            #nn.BatchNorm2d(1024,eps=1e-4,momentum=0.1),
            nn.MaxPool2d(2, 2), # 3
            nn.Dropout2d(0.4),
            # Block 6
            nn.Conv2d(1024, 1024, 1, padding=0), # 3
            nn.ELU(),
            #nn.BatchNorm2d(1024,eps=1e-4,momentum=0.1),
            nn.Conv2d(1024, 1152, 2, padding=0), # 2
            nn.ELU(),
            #nn.BatchNorm2d(1152,eps=1e-4,momentum=0.1),
            nn.MaxPool2d(2, 2), # 1
            nn.Dropout2d(0.5),
            # Block 7
            nn.Conv2d(1152, 1152, 1, padding=0), #  1
            nn.ELU(),
            #nn.BatchNorm2d(1152,eps=1e-4,momentum=0.1),
            nn.Dropout2d(0.0),
            # Block 8
            nn.Conv2d(1152, num_classes, 1, padding=0) # 1
        )
 
    def forward(self, x):
        return self.main(x).view(x.size(0),-1)

model=CIFARClassifierELUPaper()

print(model)
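
# NOTE: print_num_params is not defined anywhere in the post; the helper below is a
# minimal stand-in (an assumption) so the script runs as shared.
def print_num_params(model):
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Number of trainable parameters: {}".format(n_params))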
    
print_num_params(model)

def ZCA_whitening(data):
    
    data=data.view(data.size(0),-1)
    X_norm=data/255
    X_norm.mean(axis=0).shape
    cov = np.cov(X_norm.cpu(), rowvar=True)   
    U,S,V = np.linalg.svd(cov)
    epsilon = 0.1
    X_ZCA = U.dot(np.diag(1.0/np.sqrt(S + epsilon))).dot(U.T).dot(X_norm.cpu())   
    X_ZCA_rescaled = (X_ZCA - X_ZCA.min()) / (X_ZCA.max() - X_ZCA.min())   
    
    return X_ZCA_rescaled


def zca_prepare_batch(batch, device=None, **kwargs):
    
    
    data, target = batch
    zca_data = ZCA_whitening(data).reshape(data.shape)
    
    zca_data = Variable(torch.from_numpy(zca_data))
    
    zca_data = zca_data.to(device)

    target = target.to(device)

    return zca_data, target


model = model.cuda()

criterion = nn.CrossEntropyLoss()
lr = 0.01

optimizer = optim.SGD(model.parameters(),lr,momentum=0.9,weight_decay=0.0005,nesterov=True)

#lr_scheduler = ExponentialLR(optimizer, gamma=0.975)
lr_scheduler = MultiStepLR(optimizer, milestones=[70], gamma=0.1)

use_amp = True

#Load state dict
filename='checkpoint_testxxx.pth'
if os.path.isfile(filename):
    print("=> loading checkpoint '{}'".format(filename))
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    #lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    #print('lr scheduler is')
    #print(checkpoint['lr_scheduler'])
    print("=> loaded checkpoint ")



# Initialize Amp
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", num_losses=1)
   
    
batch = next(iter(train_loader))

batch = None
torch.cuda.empty_cache()

trainer = create_supervised_trainer(
    model, 
    optimizer, 
    criterion, 
    device="cuda",
    prepare_batch=zca_prepare_batch
)


#def output_transform(out):
#    return out['batchloss']

#RunningAverage(output_transform=output_transform).attach(trainer, "batchloss")

print('Start experiment')

trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda engine: lr_scheduler.step())


resume_epoch = 69 # zero-based


def resume_training(engine):
    engine.state.iteration = resume_epoch * len(engine.state.dataloader)
    engine.state.epoch = resume_epoch
    print('The current iteration is')
    print(engine.state.iteration)
    print('The current epoch is')
    print(engine.state.epoch)

#trainer.add_event_handler(Events.STARTED, resume_training)

metrics = {
    'Loss': Loss(criterion),
    'Accuracy': Accuracy(),
    'Precision': Precision(average=True),
    'Recall': Recall(average=True),
    'Top-5 Accuracy': TopKCategoricalAccuracy(k=5)
}

evaluator = create_supervised_evaluator(model, metrics=metrics, device=device,prepare_batch=zca_prepare_batch, non_blocking=True)
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device,prepare_batch=zca_prepare_batch, non_blocking=True)


cpe = CustomPeriodicEvent(n_epochs=3)
cpe.attach(trainer)


def run_evaluation(engine):
    train_evaluator.run(eval_train_loader)
    evaluator.run(test_loader)

def save_model_and_metrics(engine):
#    print('the current epoch number is')
#    print(trainer.state.epoch)
    epoch=engine.state.epoch
    if len(str(epoch))==1:
        epoch='00'+str(epoch)
    if len(str(epoch))==2:
        epoch='0'+str(epoch)
        

trainer.add_event_handler(cpe.Events.EPOCHS_3_STARTED, run_evaluation)
trainer.add_event_handler(Events.COMPLETED, run_evaluation)


trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

training_saver = ModelCheckpoint("checkpoint_190520",
                                 filename_prefix='checkpoint',
                                 save_interval=None,  # saving frequency is driven by the EPOCH_STARTED(every=3) handler below
                                 n_saved=None,
                                 atomic=True,
                                 save_as_state_dict=True,
                                 require_empty=False,
                                 create_dir=True)
                                 
# Changed from Events.ITERATION_COMPLETED to Events.EPOCH_COMPLETED. EDIT: changed to EPOCH_STARTED every 3
trainer.add_event_handler(Events.EPOCH_STARTED(every=3), 
                          training_saver, 
                          {
                              "model": model,
                              "optimizer": optimizer,
                              "lr_scheduler": lr_scheduler
                          })
# Store the best model
def default_score_fn(engine):
    score = engine.state.metrics['Accuracy']
    return score


# Add early stopping
es_patience = 10
es_handler = EarlyStopping(patience=es_patience, score_function=default_score_fn, trainer=trainer)
#evaluator.add_event_handler(Events.COMPLETED, es_handler)
#setup_logger(es_handler._logger)

# Clear cuda cache between training/testing
def empty_cuda_cache(engine):
    torch.cuda.empty_cache()
    import gc
    gc.collect()

trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache)
evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache)
#train_evaluator.add_event_handler(Events.COMPLETED, empty_cuda_cache)  

num_epochs =80

ProgressBar(persist=True).attach(trainer)

trainer.run(train_loader, max_epochs=num_epochs)

print('The results are')
print(train_evaluator.state.metrics) 
print(evaluator.state.metrics) 

# Dill routine
    
model_copy=dill.dumps(model)
torch.save(model_copy,'complete_model_final.pt')
torch.save(train_evaluator.state.metrics,'metrics_final.pt')


As you can find in various sources on how to debug CNNs and their training:

  • try to overfit the model on a single batch

It seems like this model without BN has difficulty even overfitting a single batch.
I tried a larger LR -> 0.1 and the loss on a single batch got stuck at ~4.0.
With BNs it seems like it can overfit a batch and can train (as you also observed).
If you want to remove the BNs, you probably need to check the conv weight initialization scheme and find an appropriate one (see the sketch below).
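
For example, here is a minimal single-batch overfit check, written as a sketch against the script above (model, criterion and train_loader are the objects defined there; run it on a fresh model, before the Amp/Ignite setup). The Kaiming/He initialization is just one common choice for ELU-style activations, not necessarily what the paper used:

import torch
import torch.nn as nn

def init_conv(m):
    # He/Kaiming init as a reasonable default for ELU-like activations (an assumption)
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)

model.apply(init_conv)

x, y = next(iter(train_loader))
x, y = x.to("cuda"), y.to("cuda")
opt = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

model.train()
for step in range(200):
    opt.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    opt.step()
    if step % 20 == 0:
        print(step, loss.item())  # should drop well below ln(100) ~ 4.6 if the net can learn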

It seems like you have some bugs:

  1. input data is 40x40
train_transform = Compose([
    RandomCrop(32),
    RandomHorizontalFlip(),
    Pad(4),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    #LinearTransformation(transformation_matrix, mean_vector), 
 ])

usually, one pads first and then takes a random crop of 32
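
For example (just a sketch, reusing the mean/std values already in the script):

train_transform = Compose([
    Pad(4),                  # 32 -> 40
    RandomCrop(32),          # random 32x32 crop out of the padded image
    RandomHorizontalFlip(),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])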

  2. the data is already normalized -> no need to divide by 255 in the whitening (see the corrected sketch after this snippet)
def ZCA_whitening(data):
    
    data=data.view(data.size(0),-1)
    X_norm=data/255  # <--- ???
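
A minimal sketch of the corrected function (same math as in the post, only the /255 and the stray .mean() line are dropped; it still returns a NumPy array so zca_prepare_batch keeps working unchanged):

def ZCA_whitening(data):
    # data already comes from ToTensor()/Normalize(), so it is a float tensor -- no /255
    X = data.view(data.size(0), -1).cpu().numpy()
    cov = np.cov(X, rowvar=True)
    U, S, V = np.linalg.svd(cov)
    epsilon = 0.1
    X_ZCA = U.dot(np.diag(1.0 / np.sqrt(S + epsilon))).dot(U.T).dot(X)
    # same min-max rescaling as the original version
    return (X_ZCA - X_ZCA.min()) / (X_ZCA.max() - X_ZCA.min())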

HTH
