How to set up warmup followed by ReduceLROnPlateau?

I want to linearly increase my learning rate using LinearLR and then switch to ReduceLROnPlateau.
I assumed I could use SequentialLR to achieve this, as below:

warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
    self.model_optim, start_factor=0.05, end_factor=1, total_iters=3
)
reduce_lrop_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    self.model_optim,
    "min",
    patience=self.reduce_lr_on_plateau_patience,
    threshold=self.reduce_lr_on_plateau_threshold,
)
lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
    self.model_optim,
    schedulers=[warmup_scheduler, reduce_lrop_scheduler],
    milestones=[5],
)

As expected, the LR goes up from almost 0 and reaches a constant value, but then when I hit the first milestone, I get an error saying ReduceLROnPlateau doesn’t have an attribute called get_last_lr.
Full error below:

  File "/home/<user>/miniconda3/envs/pyt/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1560, in lr_scheduler_step
    scheduler.step()
  File "/home/<user>/miniconda3/envs/pyt/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 647, in step
    self._last_lr = self._schedulers[idx].get_last_lr()
AttributeError: 'ReduceLROnPlateau' object has no attribute 'get_last_lr'

I’m using PyTorch Lightning to handle the optimisation, but I assume the problem lies in the incompatibility of ReduceLROnPlateau with SequentialLR. I looked around in different forums but couldn’t find a satisfactory answer.
Side note: I’d like the final learning rate to be 3e-5 after the warmup, so I set the initial LR to 3e-5 and end_factor to 1, with start_factor being 0.05. This results in the LR after warmup being 1.5e-6, which is off by a factor of 20. I don’t quite understand why this happens; help on that would also be appreciated.
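For reference, here is how I’d expect LinearLR to scale the LR with those settings (a small sketch based on my reading of the LinearLR docs; the formula below is my assumption, not the scheduler’s actual source):

base_lr = 3e-5
start_factor, end_factor, total_iters = 0.05, 1.0, 3

for step in range(5):
    # linear interpolation of the multiplicative factor up to total_iters
    factor = start_factor + (end_factor - start_factor) * min(step, total_iters) / total_iters
    print(step, base_lr * factor)
# step 0 -> 1.5e-06 (0.05 * 3e-5), steps 3 and later -> 3e-05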
Thanks.

I’m on torch version 1.11.0+cu113, and pytorch-lightning version 1.6.4


I could achieve this using two optimizers and two LR schedulers: the first one runs for warmup_batches batches, the second one for train_batches - warmup_batches.
Screenshots from TensorBoard (not reproduced here).

Here are the relevant code snippets from LightningModule class:

def create_optimizer(self):
    return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=0.1)
        
def lr_warmup_config(self):
    def warmup(step):
        """
        This function will be called ceil(warmup_batches / accum_grad_batches) times;
        warmup_steps has been adjusted accordingly.
        """
        if self.warmup_steps <= 0:
            factor = 1
        else:
            factor = min(step / self.warmup_steps, 1)
        return factor

    opt1 = self.create_optimizer()
    return {
        'frequency': self.warmup_batches,
        'optimizer': opt1,
        'lr_scheduler': {
            'scheduler': torch.optim.lr_scheduler.LambdaLR(opt1, warmup),
            'interval': 'step',
            'frequency': 1,
            'name': 'lr/warmup'
        },
    }


def lr_decay_config(self):
    opt2 = self.create_optimizer()
    return {
        'frequency': self.train_batches - self.warmup_batches,
        'optimizer': opt2,
        'lr_scheduler': {
            # threshold=0.01, threshold_mode='rel' means :
            #   metric should be improving by at least 1% within patience steps
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(
                opt2, 'min', factor=self.args.lrdecay_factor, patience=self.args.lrdecay_patience,
                verbose=False, threshold=self.args.lrdecay_threshold, threshold_mode='rel',
                cooldown=self.args.lrdecay_cooldown, min_lr=self.args.lrdecay_min_lr),
            'interval': 'epoch',
            'frequency': 1,
            'monitor': self.args.lrdecay_monitor,
            'strict': False,
            'name': 'lr/reduce_on_plateau',
        }
    }


def configure_optimizers(self):
    return (
        self.lr_warmup_config(),
        self.lr_decay_config()
    )

I’ve managed to do it with one optimizer and two schedulers: an automatic step for ReduceLROnPlateau and a manual step for the warmup in training_step.

def training_step(self, train_batch, batch_idx):
    x, y = train_batch
    logits = self.forward(x)
    loss = self.cross_entropy_loss(logits, y)
    self.train_loss.append(loss.item())
    self.train_metrics(logits, y)
    self.log_dict(self.train_metrics, on_step=False, on_epoch=True)
    self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=False)

    if self.global_step < self.warmup_steps:
        _, scheduler = self.lr_schedulers()
        scheduler.step()
    return {"loss": loss}

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr_rate, betas=(0.9, 0.99), weight_decay=0.1)

    warmup_duration = self.warmup_steps

    warmup = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=0.0001,
        end_factor=1.0,
        total_iters=warmup_duration,
    )
    red_plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=0.5,
        patience=1,
        min_lr=1e-7,
        verbose=True,
    )

    lr_scheduler = {
        "scheduler": red_plateau,
        "interval": "epoch",
        "frequency": 1,
        "monitor": "val_loss",
    }
    return [optimizer], [lr_scheduler, {"scheduler": warmup}]

That seems to cause an issue, as scheduler.step() is called before optimizer.step()…

I am guessing this is a better example (from the Lightning team):


# Learning rate warm-up
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
    # update params
    optimizer.step(closure=optimizer_closure)

    # manually warm up lr without a scheduler
    if self.trainer.global_step < 500:
        lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
        for pg in optimizer.param_groups:
            pg["lr"] = lr_scale * self.learning_rate