Current Learning Rate and Cosine Annealing

Hi everyone,

I am trying to implement the cosine LR annealing paper (SGDR: Stochastic Gradient Descent with Warm Restarts), but changing the LR in the optimizer doesn't seem to make any difference to the final model. The end result is the same as keeping the LR constant.

I am updating the LR with this function:

# NOTE(review): Rprop maintains its own adaptive per-parameter step sizes, so the
# param_group 'lr' mainly seeds/caps those steps — presumably why changing it
# mid-training has little visible effect here; confirm against torch.optim.Rprop docs.
optimizer = torch.optim.Rprop( MyModel.parameters(), lr=INITIAL_LR )

class CosLR():
    """Cosine-annealing learning-rate helper (SGDR, Loshchilov & Hutter, 2017)."""

    @staticmethod
    def UpdateLR(epoch, optimizer, lr_min=0.0, lr_max=0.1, period=10):
        """Set every param group's LR to the cosine-annealed value and return it.

        Fixes vs. the posted snippet: the method had no `self` parameter (now an
        explicit @staticmethod, so `CosLR.UpdateLR(epoch, optimizer)` still works)
        and the placeholder assignment was a syntax error; the SGDR equation is
        filled in, parameterized with backward-compatible defaults.

        Args:
            epoch: current epoch (restarts every `period` epochs).
            optimizer: any torch-style optimizer exposing `param_groups`.
            lr_min / lr_max: annealing bounds.
            period: restart period T in epochs.
        """
        import math  # local import keeps the snippet self-contained

        # SGDR eq. (5): eta = eta_min + 0.5*(eta_max - eta_min)*(1 + cos(pi*t/T))
        NewLR = lr_min + 0.5 * (lr_max - lr_min) * (
            1.0 + math.cos(math.pi * (epoch % period) / period))
        for param_group in optimizer.param_groups:
            param_group['lr'] = NewLR
        return NewLR

Training loop: every iteration calls the UpdateLR() function.

My question is: How can I find out current LR inside an optimizer? Am I updating the LR correctly without disturbing optimizer state info and recreating it from scratch?


In your training loop, read and print the LR from your optimizer:

 # Example setup: SGD over trainable parameters only; StepLR takes (iteration, rate)
 # pairs, with rate -1 marking the point where training should stop.
 optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001, momentum=0.9, weight_decay=0.0005)
 LR = StepLR([ (0, 0.001),  (41000,0.0001),  (51000,0.00001),  (61000,-1)])

 ### in your training loop ####

        # learning rate scheduler -------
        lr = LR.get_rate(i)                     # i = current iteration counter
        if lr<0 : break                         # negative rate signals end of schedule
        adjust_learning_rate(optimizer, lr)     # write lr into every param group
        rate = get_learning_rate(optimizer)[0]  # read lr back for debugging
        print(i, rate)


## rates  ------------------------------
def get_learning_rate(optimizer):
    """Return the current learning rate of every param group as a list.

    Bug fix: the original never initialized `lr`, so `lr += [...]` raised
    NameError on the first iteration.
    """
    lr = []
    for param_group in optimizer.param_groups:
        lr += [param_group['lr']]
    return lr

def adjust_learning_rate(optimizer, lr):
    """Overwrite the learning rate of all of `optimizer`'s param groups with `lr`."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

Examples of LR schedulers:

	## simple stepping rates
	class StepLR():
		def __init__(self, pairs):
			super(StepLR, self).__init__()

			for n in range(N):
				s, r = pairs[n]
				if r <0: s= s+1

			self.rates = rates
			self.steps = steps

		def get_rate(self, epoch=None):

			N = len(self.steps)
			lr = -1
			for n in range(N):
				if epoch >= self.steps[n]:
					lr = self.rates[n]
			return lr

		def __str__(self):
			string = 'Step Learning Rates\n' \
					+ 'rates=' + str(['%7.4f' % i for i in self.rates]) + '\n' \
					+ 'steps=' + str(['%7.0f' % i for i in self.steps]) + ''
			return string

	class DecayLR():
		def __init__(self, base_lr, decay, step):
			super(DecayLR, self).__init__()
			self.step  = step
			self.decay = decay
			self.base_lr = base_lr

		def get_rate(self, epoch=None, num_epoches=None):
			lr = self.base_lr * (self.decay**(epoch // self.step))
			return lr

		def __str__(self):
			string = '(Exp) Decay Learning Rates\n' \
					+ 'base_lr=%0.3f, decay=%0.3f, step=%0.3f'%(self.base_lr, self.decay, self.step)
			return string

	# 'Cyclical Learning Rates for Training Neural Networks'- Leslie N. Smith, arxiv 2017

	class CyclicLR():

		def __init__(self, base_lr=0.001, max_lr=0.006, step=2000., mode='triangular',
					 gamma=1., scale_fn=None, scale_mode='cycle'):
			super(CyclicLR, self).__init__()

			self.base_lr = base_lr
			self.max_lr = max_lr
			self.step = step
			self.mode = mode
			self.gamma = gamma
			if scale_fn == None:
				if self.mode == 'triangular':
					self.scale_fn = lambda x: 1.
					self.scale_mode = 'cycle'
				elif self.mode == 'triangular2':
					self.scale_fn = lambda x: (0.5)**(x-1)
					self.scale_mode = 'cycle'
				elif self.mode == 'exp_range':
					self.scale_fn = lambda x: gamma**(x)
					self.scale_mode = 'iterations'
				self.scale_fn = scale_fn
				self.scale_mode = scale_mode
			self.clr_iterations = 0.
			self.trn_iterations = 0.
			self.history = {}


		def _reset(self, new_base_lr=None, new_max_lr=None,
			"""Resets cycle iterations.
			Optional boundary/step size adjustment.
			if new_base_lr != None:
				self.base_lr = new_base_lr
			if new_max_lr != None:
				self.max_lr = new_max_lr
			if new_step != None:
				self.step = new_step
			self.clr_iterations = 0.

		def clr(self):
			cycle = np.floor(1+self.clr_iterations/(2*self.step))
			x = np.abs(self.clr_iterations/self.step - 2*cycle + 1)
			if self.scale_mode == 'cycle':
				return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
				return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)

		def get_rate(self, epoch=None, num_epoches=None):

			self.trn_iterations += 1
			self.clr_iterations += 1
			lr = self.clr()

			return lr

		def __str__(self):
			string = 'Cyclical Learning Rates\n' \
					+ 'base_lr=%0.3f, max_lr=%0.3f'%(self.base_lr, self.max_lr)
			return string

I believe you have to re-instantiate a new optimizer every time you change your LR if you want to achieve this. That also means you'll have to store the optimizer's state_dict() and then load it into the new optimizer on each iteration.