What am I doing wrong here? Before, I didn't have a scheduler; the learning rate was updated by a simple function that decreased it at predefined epochs. Now I've added a ReduceLROnPlateau scheduler, and when I run training it just freezes after the first epoch:

```
[ Fri Dec 27 19:28:22 2019 ] Training epoch: 1
```

Am I calling the scheduler wrong?
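For context, the old update was a plain step decay, roughly like this (a simplified sketch, not my exact code; assume `self.arg.step` is the list of epochs at which the LR should drop):

```python
def adjust_learning_rate(self, epoch):
    # Drop the base LR by 10x for every milestone in self.arg.step already passed.
    lr = self.arg.base_lr * (0.1 ** sum(epoch >= s for s in self.arg.step))
    for param_group in self.optimizer.param_groups:
        param_group['lr'] = lr
    return lr
```

Here is the relevant part of my training class: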
```python
# (elsewhere in the file: import torch, numpy as np, torch.optim as optim,
#  from torch.autograd import Variable, from collections import OrderedDict)
def __init__(self, arg):
    self.arg = arg
    self.save_arg()
    self.load_data()
    self.load_model()
    self.load_optimizer()
    self.load_scheduler()
```

…
```python
def load_optimizer(self):
    if self.arg.optimizer == 'SGD':
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=self.arg.base_lr,
            momentum=0.9,
            nesterov=self.arg.nesterov,
            weight_decay=self.arg.weight_decay)
        optimor = optim.SGD
    elif self.arg.optimizer == 'Adam':
        self.optimizer = optim.Adam(
            self.model.parameters(),
            lr=self.arg.base_lr,
            weight_decay=self.arg.weight_decay)
    else:
        raise ValueError()

def load_scheduler(self):
    self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, patience=5, verbose=True)
```

…
```python
def train(self, epoch, save_model=False):
    self.model.train()
    self.print_log('Training epoch: {}'.format(epoch + 1))
    loader = self.data_loader['train']
    # lr = self.adjust_learning_rate(epoch)
    loss_value = []
    self.record_time()
    timer = dict(dataloader=0.001, model=0.001, statistics=0.001)
    for batch_idx, (data, label) in enumerate(loader):
        # get data
        data = Variable(
            data.float().cuda(self.output_device), requires_grad=False)
        label = Variable(
            label.long().cuda(self.output_device), requires_grad=False)
        timer['dataloader'] += self.split_time()

        # forward
        output = self.model(data)
        loss = self.loss(output, label)

        # backward
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        loss_value.append(loss.data[0])
        timer['model'] += self.split_time()

        # statistics
        if batch_idx % self.arg.log_interval == 0:
            # read the current LR from the optimizer; the old `lr` local
            # no longer exists now that adjust_learning_rate is commented out
            lr = self.optimizer.param_groups[0]['lr']
            self.print_log(
                '\tBatch({}/{}) done. Loss: {:.4f} lr:{:.6f}'.format(
                    batch_idx, len(loader), loss.data[0], lr))
        timer['statistics'] += self.split_time()

    # statistics of time consumption and loss
    proportion = {
        k: '{:02d}%'.format(int(round(v * 100 / sum(timer.values()))))
        for k, v in timer.items()
    }
    self.print_log(
        '\tMean training loss: {:.4f}.'.format(np.mean(loss_value)))
    self.print_log(
        '\tTime consumption: [Data]{dataloader}, [Network]{model}'.format(
            **proportion))

    if save_model:
        model_path = '{}/epoch{}_model.pt'.format(
            self.arg.work_dir, epoch + 1)
        state_dict = self.model.state_dict()
        weights = OrderedDict(
            [[k.split('module.')[-1], v.cpu()]
             for k, v in state_dict.items()])
        torch.save(weights, model_path)
```
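From the PyTorch docs, my understanding is that `ReduceLROnPlateau` is meant to be stepped once per epoch with the metric it should monitor, something like this (schematic; `train_one_epoch`, `validate`, and `num_epochs` are placeholders, not from my code):

```python
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5)
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer)  # placeholder training step
    val_loss = validate(model)         # placeholder validation metric
    scheduler.step(val_loss)           # pass the monitored value once per epoch
```

Is that the pattern I should be using instead of calling `self.lr_scheduler.step()` on every batch with no metric?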