Memory leak on optimizer step

I am training an autoencoder architecture, using AdamW and a custom optimizer wrapper set up like this:

import torch
from torch.optim import Optimizer


class custom_Optimizer(torch.optim.Optimizer):
    def __init__(self, params, gen_optimizer, disc_optimizer, gen_scheduler=None, disc_scheduler=None):

        self.gen_optimizer: Optimizer = gen_optimizer
        self.disc_optimizer: Optimizer = disc_optimizer

        self.gen_scheduler = gen_scheduler
        self.disc_scheduler = disc_scheduler

        super().__init__(params, {})

    def step(self, loss, gen_loss, disc_loss):
        """Optimizer step during training.

        Args:
            loss: combined loss used for backpropagation.
            gen_loss: generator loss (currently unused in this method).
            disc_loss: discriminant loss (currently unused in this method).

        Returns:
            Updated learning rates of the optimizers.
        """
        learning_rates = []
        # call optimizers
        # TODO: check if two backward calls are needed and if retain_graph=True is necessary

        loss.backward()
        learning_rates.append(self.step_disc())
        learning_rates.append(self.step_gen())
        return learning_rates

    def step_gen(self):
        """Generator optimisation step.

        Returns:
            Updated learning rate of the generator optimizer, or None if there is none.
        """
        if self.gen_optimizer:
            self.gen_optimizer.step()       # update weights
            self.gen_optimizer.zero_grad()  # reset gradients

            # step the learning rate
            if self.gen_scheduler is not None:
                self.gen_scheduler.step()
            return self.get_lr(self.gen_optimizer)
        return None

    def step_disc(self):
        """Discriminant optimisation step.

        Returns:
            Updated learning rate of the discriminant optimizer, or None if there is none.
        """
        if self.disc_optimizer:
            self.disc_optimizer.step()       # update weights
            self.disc_optimizer.zero_grad()  # reset gradients

            # step the learning rate
            if self.disc_scheduler is not None:
                self.disc_scheduler.step()
            return self.get_lr(self.disc_optimizer)
        return None

    def get_lr(self, optimizer):
        # helper returning the current learning rates of the optimizer's param groups
        return [group['lr'] for group in optimizer.param_groups]

I also have a simple custom model built as follows:

import torch.nn as nn


class Custom_Model(nn.Module):
    def __init__(self, gen, discriminant):
        super().__init__()

        self.gen = gen
        self.discriminant = discriminant

    def forward(self, x):
        generated = self.gen(x)
        score = self.discriminant(generated)

        return generated, score

To begin training, I initialize the optimizers and schedulers with this code:

# initialize optimizers
gen_optimizer = torch.optim.AdamW(model.gen.parameters(), lr=config.train_config.gen_max_lr,
                                  weight_decay=config.train_config.gen_weight_decay)
disc_optimizer = torch.optim.AdamW(model.discriminant.parameters(), lr=config.train_config.disc_max_lr,
                                   weight_decay=config.train_config.disc_weight_decay)
# initialize schedulers
gen_sched = torch.optim.lr_scheduler.OneCycleLR(gen_optimizer, config.train_config.gen_max_lr,
                                                epochs=config.train_config.epoch,
                                                steps_per_epoch=len(train_loader))
disc_sched = torch.optim.lr_scheduler.OneCycleLR(disc_optimizer, config.train_config.disc_max_lr,
                                                 epochs=config.train_config.epoch,
                                                 steps_per_epoch=len(train_loader))
optimizer = custom_Optimizer(model.parameters(), gen_optimizer, disc_optimizer, gen_sched, disc_sched)
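
For context, the training loop then calls the wrapper once per batch, roughly like this (a simplified sketch: the loss functions, labels and device handling are placeholders, not my exact code):

for epoch in range(config.train_config.epoch):
    for x, _ in train_loader:
        x = x.to(device)

        generated, score = model(x)

        # placeholder criteria: reconstruction loss for the generator,
        # adversarial loss on the discriminant score
        gen_loss = reconstruction_criterion(generated, x)
        disc_loss = adversarial_criterion(score, real_labels)
        loss = gen_loss + disc_loss

        # one backward pass, then both sub-optimizers step and zero their grads
        learning_rates = optimizer.step(loss, gen_loss, disc_loss)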

On the first pass of loss.backward(), in the step function of the custom optimizer, some memory is lost, but this only occurs once.

However, every call to self.gen_optimizer.step() seems to leak memory on each iteration of the epoch.

It is a few megabytes at first but gradually increases. This does not happen for disc_optimizer.step(), which is odd since both are built the same way, and I am having trouble figuring out where the leak comes from, since I call zero_grad on both my model and my optimizers during training.
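
The growth is visible with a simple allocation check around the call, something like this (illustrative, not my exact logging code):

before = torch.cuda.memory_allocated()
learning_rates = optimizer.step(loss, gen_loss, disc_loss)
after = torch.cuda.memory_allocated()
print(f"allocated delta after step: {(after - before) / 1024 ** 2:.2f} MiB")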

My knowledge of autograd graph construction and retention is not the best, so any idea where this issue might come from would help.

Update: my mistake was accumulating a tensor accuracy metric in the main loop, which kept the computation graph in memory. I still do not understand, however, why the memory increase showed up on the optimizer.step call.
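
For anyone hitting the same thing, the problematic pattern and the fix looked roughly like this (simplified; compute_accuracy is a placeholder for my metric):

# problematic: the accuracy tensor is still attached to the autograd graph,
# so accumulating it keeps every iteration's graph (and activations) alive
epoch_accuracy = 0
for x, _ in train_loader:
    generated, score = model(x)
    acc = compute_accuracy(score)        # placeholder metric, returns a tensor
    epoch_accuracy += acc                # leak: graph-attached tensors pile up

# fixed: cut the graph before accumulating
epoch_accuracy = 0
for x, _ in train_loader:
    generated, score = model(x)
    acc = compute_accuracy(score)
    epoch_accuracy += acc.item()         # or acc.detach() if a tensor is needed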