I am using a U-Net with 3D convolutions on data of shape (batch_size, 3, 32, 32, 32). My training and validation loss curves look unusual. I am training for a total of 10 epochs with the Adam optimizer (learning rate 8e-5) and a StepLR scheduler with a step size of 4 epochs and a gamma of 0.9. The loss plot looks as follows:
It looks as if each new epoch begins with no memory of the previous learning.
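For reference, this schedule only shrinks the learning rate by 10% every 4 epochs, so I don't think the scheduler itself explains the jumps. A minimal standalone check of what StepLR with these numbers should produce (the dummy parameter is just there so the optimizer has something to hold):

import torch

dummy = [torch.zeros(1, requires_grad=True)]  # placeholder parameter
opt = torch.optim.Adam(dummy, lr=8e-5)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=4, gamma=0.9)
for epoch in range(10):
    print(epoch + 1, sched.get_last_lr())  # 8e-5 for epochs 1-4, 7.2e-5 for 5-8, 6.48e-5 for 9-10
    opt.step()
    sched.step()

I am using the following script: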
import os
import sys
import time

import numpy as np
import torch
import torch.nn as nn

import yadoc  # my YAML parameter loader; UNet is defined elsewhere

model = UNet(in_channels=3, out_channels=3)
torch.cuda.empty_cache()

param_file = sys.argv[1]
kind = sys.argv[2]
stream = int(sys.argv[3])
params = yadoc.parameters('/home/nkaushal/my_PARAMS/params_{}.yml'.format(param_file))
device = torch.device("cuda:{}".format(stream))

if kind == 'p':
    a = sys.argv[3]
    b = sys.argv[4]
    c = sys.argv[5]
    d = sys.argv[6]
    print('PARALLELIZATION: TRUE')
    if torch.cuda.device_count() > 1:
        print('DISTRIBUTING MODEL ON DEVICES {}, {}, {} and {} ...'.format(a, b, c, d))
        model = nn.DataParallel(model, device_ids=[int(a), int(b), int(c), int(d)])
        model.to(f'cuda:{model.device_ids[0]}')
else:
    print('PARALLELIZATION: FALSE')
    print('MODEL ONLY ON DEVICE {}'.format(stream))
    model.to(device)  # move the model onto the single target device

optimizer = torch.optim.Adam(model.parameters(), lr=params.net_params.init_lr, weight_decay=params.net_params.weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, params.scheduler.step_size, params.scheduler.gamma)  # verbose=True is not recognized here (see note at the end)
myLoss = torch.nn.MSELoss()
def train(model, trainloader, validloader, loss_criterion, params):
    if not os.path.exists(params.out_path):
        os.makedirs(params.out_path)  # create the output path to save the best model
    iter_train_losses = []
    iter_valid_losses = []
    epoch_train_losses = []
    epoch_valid_losses = []
    x_valid, x_train = [], []
    iterations = 0
    best_valid_loss = 0.7  # initial threshold below which checkpoints are saved
    for epoch in range(params.train.num_epochs):
        get_ini_time = time.time()
        epoch_train_loss = []
        epoch_valid_loss = []
        print('==' * 70, '\n')
        model.train()
        for train_batch, train_data in enumerate(trainloader):
            iterations += 1
            # permute channels-last to channels-first: INPUT/LABEL -> (BS, 3, 32, 32, 32)
            inputs = train_data[0].to(device).permute(0, 4, 1, 2, 3)
            labels = train_data[1].to(device).permute(0, 4, 1, 2, 3)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = torch.sqrt(loss_criterion(outputs, labels))  # root-mean-square loss
            loss.backward()
            optimizer.step()
            x_train.append(iterations); iter_train_losses.append(loss.item()); epoch_train_loss.append(loss.item())
            np.savetxt('/home/nkaushal/my_LOSSES/iter_train_loss_p{}.txt'.format(param_file), np.array([x_train, iter_train_losses]))
            if (train_batch + 1) % 2000 == 0:
                print('Epoch {} of {} : Train Batch {} of {} : {:.3f}'.format(epoch + 1, params.train.num_epochs, train_batch + 1, len(trainloader), loss.item()))
            # --VALIDATION
            if (train_batch + 1) % params.train.eval_frequency == 0 or (train_batch + 1) == len(trainloader):
                atime = time.time()
                model.eval()
                running_loss = 0.0
                counts = 0
                with torch.no_grad():
                    for valid_data in validloader:
                        inputs = valid_data[0].to(device).permute(0, 4, 1, 2, 3)
                        labels = valid_data[1].to(device).permute(0, 4, 1, 2, 3)
                        outputs = model(inputs)
                        batch_loss = torch.sqrt(loss_criterion(outputs, labels))
                        running_loss += batch_loss.item()
                        counts += 1
                valid_loss = running_loss / counts
                x_valid.append(iterations); iter_valid_losses.append(valid_loss); epoch_valid_loss.append(valid_loss)
                np.savetxt('/home/nkaushal/my_LOSSES/iter_valid_loss_p{}.txt'.format(param_file), np.array([x_valid, iter_valid_losses]))
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model, params.out_path + 'model_p{}_e{}_i{}.pt'.format(param_file, epoch, iterations))  # save the complete model; requires more storage
                    torch.save(model.state_dict(), params.out_path + 'modelDict_p{}_e{}_i{}.pt'.format(param_file, epoch, iterations))  # save the model parameters only
                tot_time = divmod(time.time() - atime, 60)
                print('Epoch {} of {} : Validation loss checked after Train Batch {} : {:.3f} in {} min, {} sec'.format(epoch + 1, params.train.num_epochs, train_batch + 1, valid_loss, tot_time[0], round(tot_time[1], 1)))
                model.train()  # switch back to training mode after a mid-epoch validation pass
        epoch_train_losses.append(np.array(epoch_train_loss).mean())
        epoch_valid_losses.append(np.array(epoch_valid_loss).mean())
        np.savetxt('/home/nkaushal/my_LOSSES/epoch_train_loss_p{}.txt'.format(param_file), np.array(epoch_train_losses))
        np.savetxt('/home/nkaushal/my_LOSSES/epoch_valid_loss_p{}.txt'.format(param_file), np.array(epoch_valid_losses))
        print("EPOCH ", epoch + 1, " TIME: ~", (time.time() - get_ini_time) // 3600, " hours")
        scheduler.step()
    return None
if params.is_train:
    train(model, trainloader, validloader, myLoss, params)  # trainloader / validloader are built elsewhere in the script
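One more thing I am unsure about: when the model is wrapped in nn.DataParallel, model.state_dict() prefixes every key with module., so the checkpoint cannot be loaded directly into a plain UNet later. A minimal sketch of the unwrap-before-saving pattern I mean (the file name here is just a placeholder):

# save the underlying module so the checkpoint loads into an unwrapped UNet
to_save = model.module if isinstance(model, nn.DataParallel) else model
torch.save(to_save.state_dict(), params.out_path + 'modelDict_best.pt')  # placeholder file name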
Also, as a side note: torch.optim.lr_scheduler.StepLR() is not recognizing the verbose argument (presumably my PyTorch version predates it).
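Since verbose is unavailable, I log the learning rate by hand at the end of each epoch instead. A minimal sketch (scheduler.get_last_lr() needs a reasonably recent PyTorch; reading optimizer.param_groups should work everywhere):

scheduler.step()
print('epoch {} done, lr = {}'.format(epoch + 1, optimizer.param_groups[0]['lr']))
# equivalently, on newer PyTorch: scheduler.get_last_lr()[0]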