Some tensors are not getting deleted after the first epoch

Hi, I’ve run into a problem with the PIDNet model. I’ve modified it for my purposes, but some tensors are not freed even after calling gc.collect() at the end of the first training epoch. After that, memory usage seems to stay stable, i.e. my GPU RAM usage only increases after the first epoch. Can you identify which step might still be holding a reference to some tensors?
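
For reference, debug_util.print_tensors essentially counts the CUDA tensors that are still reachable via gc.get_objects(); a minimal sketch of that kind of counter (simplified, not the exact code):

import gc
import torch

def print_tensors():
    # count every tensor that is currently alive on the GPU
    count = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) and obj.is_cuda:
                count += 1
        except Exception:
            # some objects raise during attribute access; skip them
            pass
    print('count of tensors on gpu', count)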

Output from a run on my debug dataset:

count of tensors on gpu 958
Epoch: [0/2] Iter:[0/15], Time: 3.64, lr: [0.00025910131519070223], Loss: 19.601892, Acc:0.023638, Semantic loss: 11.114876, BCE loss: 0.710389, SB loss: 7.776628
Epoch: [0/2] Iter:[10/15], Time: 0.42, lr: [0.00017988190752092693], Loss: 13.189713, Acc:0.052125, Semantic loss: 7.945725, BCE loss: 0.768364, SB loss: 4.475624
count of tensors on gpu 1926
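
Here is the train function; besides project helpers like AverageMeter, adjust_learning_rate and debug_util, the imports it relies on are the usual ones:

import gc
import logging
import time

import torch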

def train(config, epoch, num_epoch, epoch_iters, base_lr,
          num_iters, trainloader, optimizer, model,
          writer_dict, cosine_decay_scheduler=None):
    # Training
    print('training start')
    debug_util.print_tensors()

    batch_time = AverageMeter()
    ave_loss = AverageMeter()
    ave_acc = AverageMeter()
    avg_sem_loss = AverageMeter()
    avg_bce_loss = AverageMeter()
    tic = time.time()
    cur_iters = epoch * epoch_iters
    writer = writer_dict['writer']
    sum_iters = len(trainloader)
    global_steps = writer_dict['train_global_steps']

    print('train loader')
    debug_util.print_tensors()

    for i_iter, batch in enumerate(trainloader, 0):
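        # unpack the batch, move it to the GPU and run the forward pass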
        images, labels, bd_gts, _, _ = batch
        losses, _, acc, loss_list = model(images.cuda(), labels.long().cuda(), bd_gts.float().cuda())
        loss = losses.mean()
        acc = acc.mean()

        model.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - tic)
        tic = time.time()

        # update average loss
        ave_loss.update(loss.item())
        ave_acc.update(acc.item())
        avg_sem_loss.update(loss_list[0].mean().item())
        avg_bce_loss.update(loss_list[1].mean().item())


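        # per-iteration lr schedule: cosine decay if provided, otherwise the default adjustment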
        if cosine_decay_scheduler is not None:
            cosine_decay_scheduler.step(epoch + i_iter / sum_iters)
        else:
            lr = adjust_learning_rate(optimizer,
                                      base_lr,
                                      num_iters,
                                      i_iter + cur_iters)

        if i_iter % config.PRINT_FREQ == 0:
            msg = 'Epoch: [{}/{}] Iter:[{}/{}], Time: {:.2f}, ' \
                  'lr: {}, Loss: {:.6f}, Acc:{:.6f}, Semantic loss: {:.6f}, BCE loss: {:.6f}, SB loss: {:.6f}'.format(
                epoch, num_epoch, i_iter, epoch_iters,
                batch_time.average(), [x['lr'] for x in optimizer.param_groups], ave_loss.average(),
                ave_acc.average(), avg_sem_loss.average(), avg_bce_loss.average(),
                ave_loss.average() - avg_sem_loss.average() - avg_bce_loss.average())
            logging.info(msg)
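        # explicitly drop the references to this iteration's tensors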
        del images, labels, bd_gts, loss_list, loss, losses, acc, _, batch
    del msg
    debug_util.print_tensors()
    writer.add_scalar('train_loss', ave_loss.average(), global_steps)
    writer_dict['train_global_steps'] = global_steps + 1
    del ave_loss, global_steps, writer, ave_acc, avg_sem_loss, avg_bce_loss, writer_dict

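    # collect garbage and release cached GPU memory after the epoch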
    gc.collect()
    torch.cuda.empty_cache()
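
To narrow this down further, the same counting approach can be extended to diff the live CUDA tensors before and after one epoch and print the shapes of the survivors; a rough sketch (snapshot_cuda_tensors is just a hypothetical name):

import gc
import torch

def snapshot_cuda_tensors():
    # map id(tensor) -> (shape, dtype) for every CUDA tensor currently alive
    snap = {}
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) and obj.is_cuda:
                snap[id(obj)] = (tuple(obj.shape), obj.dtype)
        except Exception:
            pass
    return snap

before = snapshot_cuda_tensors()
# ... run train(...) for one epoch here ...
after = snapshot_cuda_tensors()

# tensors that appeared during the epoch and are still referenced afterwards
for tensor_id in set(after) - set(before):
    print(after[tensor_id])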