Unusual epoch and iteration losses

I am using a U-Net with 3D convolutions on data of shape (batch_size, 3, 32, 32, 32). My loss plot after training and validation looks unusual. I am using a StepLR scheduler (step size of 4 epochs, gamma 0.9) and the Adam optimizer with a learning rate of 8e-5, for a total of 10 epochs. The loss plot looks as follows:

It looks as if each new epoch behaves as though it has no memory of the previous learning. I am using the following script:

model = UNet(in_channels=3, out_channels=3)
torch.cuda.empty_cache()

param_file  = sys.argv[1]
kind        = sys.argv[2]
stream      = int(sys.argv[3])
params      = yadoc.parameters('/home/nkaushal/my_PARAMS/params_{}.yml'.format(param_file)) 
device      = torch.device("cuda:{}".format(stream))

if kind == 'p':
    a           = sys.argv[3] 
    b           = sys.argv[4] 
    c           = sys.argv[5]
    d           = sys.argv[6]
    print('PARALLELIZATION: TRUE')
    if torch.cuda.device_count() > 1:
        print('DISTRIBUTING MODEL ON DEVICE {}, {}, {} and {} ...'.format(a,b,c,d))
        model   = nn.DataParallel(model, device_ids=[int(a), int(b), int(c), int(d)])
        model.to(f'cuda:{model.device_ids[0]}')
else:
    print('PARALLELIZATION: FALSE')
    print('MODEL ONLY ON DEVICE {}'.format(stream))


optimizer = torch.optim.Adam(model.parameters(), lr=params.net_params.init_lr, weight_decay=params.net_params.weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, params.scheduler.step_size, params.scheduler.gamma) #, verbose=True)
myLoss    = torch.nn.MSELoss()


def train(model, trainloader, validloader, loss_criterion, params):
    if not os.path.exists(params.out_path):
        os.makedirs(params.out_path) #---create the output path to save best model

    iter_train_losses  = []
    iter_valid_losses  = []
    epoch_train_losses = []
    epoch_valid_losses = []
    x_valid, x_train   = [],[]

    iterations      = 0
    best_valid_loss = 0.7 

    for epoch in range(params.train.num_epochs):
        get_ini_time = time.time()
        epoch_train_loss = []
        epoch_valid_loss = []
        print('=='*70,'\n')
        model.train()
    
        for train_batch, train_data in enumerate(trainloader):
            iterations += 1
            inputs            = torch.autograd.Variable(train_data[0], requires_grad=False).to(device).permute(0,4,1,2,3) #INPUT : BS,3,32,32,32 
            labels            = torch.autograd.Variable(train_data[1], requires_grad=False).to(device).permute(0,4,1,2,3) #LABEL : BS,3,32,32,32
            optimizer.zero_grad() 
            with torch.set_grad_enabled(True):
                outputs           = model(inputs)
                loss              = torch.sqrt(loss_criterion(outputs,labels)) #----Root Mean Square Loss
                loss.backward() 
                optimizer.step()

            x_train.append(iterations); iter_train_losses.append(loss.item()); epoch_train_loss.append(loss.item())
            np.savetxt('/home/nkaushal/my_LOSSES/iter_train_loss_p{}.txt'.format(param_file), np.array([x_train, iter_train_losses]))
            if (train_batch+1)%2000==0: 
                print('Epoch {} of {} : Train Batch {} of {} : {:.3f}'.format(epoch+1, params.train.num_epochs, train_batch+1, train_batches, loss.item()))
            

            #--VALIDATION
            if (train_batch+1)%params.train.eval_frequency==0 or (train_batch+1)==len(trainloader):
                atime = time.time() 
                model.eval() 
                running_loss = 0.0
                counts       = 0
                for valid_data in validloader:
                    with torch.no_grad():
                        inputs            = torch.autograd.Variable(valid_data[0], requires_grad=False).cuda(stream).permute(0,4,1,2,3)
                        labels            = torch.autograd.Variable(valid_data[1], requires_grad=False).cuda(stream).permute(0,4,1,2,3)
                        outputs           = model(inputs)
                        batch_loss        = torch.sqrt(loss_criterion(outputs,labels))
                        running_loss     += batch_loss
                        counts           += 1
                valid_loss = running_loss/counts
                x_valid.append(iterations); iter_valid_losses.append(valid_loss.item()); epoch_valid_loss.append(valid_loss.item())
                np.savetxt('/home/nkaushal/my_LOSSES/iter_valid_loss_p{}.txt'.format(param_file), np.array([x_valid, iter_valid_losses]))
                if (valid_loss < best_valid_loss): 
                    best_valid_loss = valid_loss
                    torch.save(model, params.out_path + 'model_p{}_e{}_i{}.pt'.format(param_file, epoch, iterations)) #------------------save complete model. Requires more storage.
                    torch.save(model.state_dict(), params.out_path + 'modelDict_p{}_e{}_i{}.pt'.format(param_file, epoch, iterations)) #----save model parameters only. 
                tot_time = divmod( (time.time()-atime), 60) 
                print('Epoch {} of {} : Validation loss checked after Train Batch {} : {:.3f} in {} min, {} sec'.format(epoch+1, params.train.num_epochs, train_batch+1, valid_loss.item(), tot_time[0], round(tot_time[1],1)))

        epoch_train_losses.append(np.array(epoch_train_loss).mean())
        epoch_valid_losses.append(np.array(epoch_valid_loss).mean())
        np.savetxt('/home/nkaushal/my_LOSSES/epoch_train_loss_p{}.txt'.format(param_file), np.array(epoch_train_losses))
        np.savetxt('/home/nkaushal/my_LOSSES/epoch_valid_loss_p{}.txt'.format(param_file), np.array(epoch_valid_losses))

        print("EPOCH ", epoch+1, " TIME: ~", (time.time()-get_ini_time)//3600, " hours" )
        scheduler.step()

    return None

if params.is_train:
    train(model, trainloader, validloader, myLoss, params)

Also, a side note: torch.optim.lr_scheduler.StepLR() is not recognizing the verbose argument.
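If I remember correctly, the verbose argument was only added to the scheduler constructors in a newer PyTorch release, which would explain the error here. As a workaround I can just print the learning rate manually after each scheduler step, e.g.:

scheduler.step()
print('current lr:', optimizer.param_groups[0]['lr'])  # manual replacement for verbose=True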

Are you shuffling the data inside the DataLoader? If not, could you add it and check whether the loss plots change?

Yes, I am!

if params.is_train:
    print("MODE: Training")
    #----create trainloader
    trainset     = high_dimensional_dataset(params.data, params.train.lIndex, params.train.hIndex, params.train.aug, params.train.get_info) #--make sure that get_info is false
    trainloader  = DataLoader(trainset, batch_size=params.train.batch_size, shuffle=True, num_workers=params.train.num_workers) #-----shuffle
    train_batches = int(len(trainset)/params.train.batch_size)  #----(= 4096*realizations/batch_size)
    print("Trainloader created.")

    #----create validloader
    validset     = high_dimensional_dataset(params.data, params.valid.lIndex, params.valid.hIndex, params.valid.aug, params.valid.get_info) #---AGAIN, get_info should be false
    validloader  = DataLoader(validset, batch_size=params.valid.batch_size, shuffle=True, num_workers=params.valid.num_workers) #----shuffle
    print("Validloader created.")

    #----train
    train(model, trainloader, validloader, myLoss, params)

I cannot see any obvious issues besides the fact that Variables have been deprecated since PyTorch 0.4 and shouldn’t be used anymore.
Could you remove the learning rate scheduler and check the loss plots again?

It looks similar without the scheduler as well, just slightly changed, but the fluctuations are still unusual. Could the placement of the scheduler, optimizer, and backpropagation calls in the script be a problem? Also, what should I use instead of Variable if it is deprecated? Maybe a link?

The placement of the gradient calculation and the optimizer calls looks correct.
I still can’t find any issues. As a next debugging step, you could use a small subset of the original data and see if this loss behavior is still visible in each epoch.
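A minimal sketch, assuming the trainset, validloader, params, myLoss, and train names from your script, could look like this:

from torch.utils.data import DataLoader, Subset

small_trainset    = Subset(trainset, list(range(512)))  # small fixed subset; the size is arbitrary
small_trainloader = DataLoader(small_trainset, batch_size=params.train.batch_size,
                               shuffle=True, num_workers=params.train.num_workers)
train(model, small_trainloader, validloader, myLoss, params)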

You can use tensors without wrapping them into Variables instead.

inputs = torch.autograd.Variable(train_data[0], requires_grad=False).to(device).permute(0,4,1,2,3)

would be now:

inputs = train_data[0].to(device).permute(0,4,1,2,3)
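The same applies to the labels. Newly created tensors don’t require gradients by default, so the requires_grad=False argument was a no-op, and in recent releases Variable simply returns the tensor itself.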

So, I fixed the loss behavior; the error was in my script. Earlier, I was enabling the gradients inside each training batch as follows:

for epoch in range(params.train.num_epochs):
    for train_batch, train_data in enumerate(trainloader):
        model.train()
        optimizer.zero_grad()
        inputs = torch.autograd.Variable(train_data[0], requires_grad=False).to(device)
        labels = torch.autograd.Variable(train_data[1], requires_grad=False).to(device)
        with torch.set_grad_enabled(True):
            outputs = model(inputs)
            loss = torch.sqrt(loss_criterion(outputs,labels))
            loss.backward()
            optimizer.step()

So this time, I enabled them outside the loop over the training batches:

for epoch in range(params.train.num_epochs):
    torch.set_grad_enabled(True)
    for train_batch, train_data in enumerate(trainloader):
        model.train()
        optimizer.zero_grad()
        inputs = torch.autograd.Variable(train_data[0], requires_grad=False).to(device)
        labels = torch.autograd.Variable(train_data[1], requires_grad=False).to(device)
        outputs = model(inputs)
        loss = torch.sqrt(loss_criterion(outputs,labels))
        loss.backward()
        optimizer.step()

And it worked!