‘CUDA out of memory’ after two training epochs

Hi @ptrblck, I’m having some trouble during training. This is the traceback:

Traceback (most recent call last):
  File "3D_train.py", line 242, in <module>
    trainer.train()
  File "3D_train.py", line 235, in train
    self.fit_func.fit(epoch)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/utils/fit.py", line 360, in fit
    all_loss = self.training_step(epoch)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/utils/fit.py", line 152, in training_step
    output = self.model(lidar_pillar, self.opts) # time cost : 0.03377 sec
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 619, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/model/FSD_module.py", line 144, in forward
    p3,p4,p5 = self.yolo_precat(pesu_img)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/model/backbone/yolo.py", line 119, in forward
    x2, x1, x0 = self.backbone(x)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/model/backbone/CSPdarknet.py", line 177, in forward
    out5 = self.stages[4](out4)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/model/backbone/CSPdarknet.py", line 105, in forward
    x = self.downsample_conv(x)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/lianghao/lidar_and_4D_imaging_radar_fusion_demo/3D-MAN-reproduction/model/backbone/CSPdarknet.py", line 32, in forward
    x = self.conv(x)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 423, in forward
    return self._conv_forward(input, self.weight)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/conv.py", line 420, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: CUDA out of memory. Tried to allocate 18.00 MiB (GPU 1; 31.75 GiB total capacity; 30.05 GiB already allocated; 11.50 MiB free; 30.43 GiB reserved in total by PyTorch)

And the training code is structured as follows:

def training_step(self, epoch):
        all_loss_train = torch.zeros((1,10)).cuda()
        iter_now=0
        for self.iter, batch in enumerate(self.train_loader): 
            if self.iter >= self.train_all_iter: 
                break
            iter_now+=1
            self.model.train() 
            lidar_pillar, boxes = batch[0], batch[1]
            lidar_pillar = torch.from_numpy(lidar_pillar).type(torch.FloatTensor).cuda()
            boxes = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in boxes]
            self.optimizer.zero_grad()
            output = self.model(lidar_pillar, self.opts) # time cost : 0.03377 sec
            # -------------------
            # loss
            # -------------------
            # loss all, loss heatmap, loss offset, loss size, loss orientation, loss class, loss iou, ...
            loss_train_arr = torch.zeros([1, 10]).cuda()
            
            if self.opts.using_multi_scale == 1:
                num_output = len(output)
            else:
                num_output = 1
            
            for i in range(num_output):
                loss_train_arr += self.fusionLoss(output[i], boxes, self.opts)#time cost : 22.08293 sec
            
            loss_train_arr = loss_train_arr / num_output
            
            #loss_train_arr([1,10])
            #import pdb;pdb.set_trace()
            # ----------------------#
            #   backward
            # ----------------------#
            if loss_train_arr[0, 0].item() != 0: #time cost : 0.05921 sec
                loss_train_arr[0, 0].backward()
               
            self.loss_log(loss_train_arr)
            if self.opts.local_rank == self.opts.main_gpuid:
                progress_str = "epoch: {}/{}, iter: {}/{} optim_lr: {}, sche_lr: {}".format(
                    epoch + 1, self.end_epoch, self.iter + 1, self.train_all_iter, self.optimizer.param_groups[0]['lr'], self.get_lr()
                )
                # import pdb;pdb.set_trace()
                logger.info(self.optimizer.param_groups[0]['params'][0])
                logger.info("{}".format(progress_str))
                
                #print("{}".format(progress_str))
            self.optimizer.step()
            self.lr_scheduler.step()
            print('Memory cached in GPU:' ,{torch.cuda.memory_cached()})
            print('Memory allocated in GPU:' ,{torch.cuda.memory_allocated()})
            if np.isnan(loss_train_arr[0,0].item()):
                continue
            # collect all loss
            all_loss_train += loss_train_arr
        all_loss_train = all_loss_train/self.train_all_iter
        self.loss_log(all_loss_train)
        return all_loss_train[0,0]

I guess self.loss_log stores the result without detaching its graph.
If you intended loss_train_arr to hold only the values, not the graph, then:

loss_train_arr += self.fusionLoss(output[i], boxes, self.opts).item()
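
For example, a minimal sketch of that idea (illustrative only; if fusionLoss returns a [1, 10] tensor rather than a scalar, detach the whole row instead of calling .item() on it, and keep the graph only on the scalar you actually back-propagate):

loss_train_arr = torch.zeros([1, 10]).cuda()           # detached buffer, values only
loss_for_backward = torch.zeros([], device='cuda')     # scalar that keeps the graph

for i in range(num_output):
    cur = self.fusionLoss(output[i], boxes, self.opts)  # [1, 10] tensor with grad_fn
    loss_for_backward = loss_for_backward + cur[0, 0]   # differentiable total loss
    loss_train_arr += cur.detach()                       # values only, no graph kept

loss_for_backward = loss_for_backward / num_output
loss_train_arr = loss_train_arr / num_output

if loss_train_arr[0, 0].item() != 0:
    loss_for_backward.backward()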

OK, I’ll try. But one thing that bothers me is that my code worked fine before, and after I increased the number of training samples (maybe that is the cause) it always runs out of memory after a few epochs. I’m pretty sure my input sizes are consistent, so does the number of training samples affect GPU memory usage?

Sure. If I am right, the memory held by your loss_train_arr (through its attached graph) is proportional to the size of the dataset.
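
To make that concrete, here is a small standalone sketch (not from your code) showing why accumulating a loss tensor that still carries its graph keeps every iteration's activations alive, while accumulating a detached copy lets them be freed:

import torch

model = torch.nn.Linear(1024, 1024).cuda()
running = torch.zeros(1, device='cuda')

for step in range(100):
    x = torch.randn(64, 1024, device='cuda')
    loss = model(x).pow(2).mean()

    # Variant A: every iteration's graph stays referenced by `running`,
    # so memory grows with the number of iterations (i.e. with dataset size).
    # running = running + loss

    # Variant B: store the value only; this iteration's graph can be freed.
    running = running + loss.detach()

    print(step, torch.cuda.memory_allocated() // 1024 ** 2, 'MiB allocated')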

OK, thank you, I will try it and report back.

I tried it, but I found that I need to keep the loss array's graph in order to do this:

loss_train_arr[0, 0].backward()

The format of my loss array is the following:

tensor([[20.4471,  1.4436,  2.4890,  0.0752,  1.1610,  3.2490,  0.9942, 11.0352,
          0.0000,  0.0000]], device='cuda:0', grad_fn=<AddBackward0>)

I tried detach() and item(), and the code is the following:

for i in range(num_output):
      for loss_arr in (self.fusionLoss(output[i], boxes, self.opts)[0,:]):
             loss_train_arr += loss_arr.detach().item().requires_grad_(True)

The loss_train_arr I get looks like the following:

tensor([[70.9797, 70.9797, 70.9797, 70.9797, 70.9797, 70.9797, 70.9797, 70.9797,
         70.9797, 70.9797]], device='cuda:1')

And on loss_train_arr[0, 0].backward() I get the error:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

So how do I deal with that?

Oops, my mistake.
What does self.loss_log do?

It records the loss of each iteration.

Could you provide the code?

OK. I use 6 losses to get a fusion loss, and the fusion loss code is the following:

class fusion_loss_base(nn.Module):
    def __init__(self):
        super().__init__()
        self.heatmap_loss = FocalLoss()
        self.offset_loss = RegLossL1()
        self.size_loss = RegLossL1()
        self.orientation_loss = multi_bin_loss()
        self.cls_loss = cls_loss()
        self.iou_loss = iou_loss()
        pass

    def forward(self, prediction, groundTruth, opt):
        # prediction: (location, x_offset, y_offset, W, H, orientation[bin1,bin2,sin,cos], class)
        #label bbox: [id, center_x, center_y, width, length, angle,class]
        # 0 bus
        # 1 car
        # 2 bicycle
        # 3 pedestrian
        # 4 tricycle
        # 5 semitrailer
        # 6 truck
        [_,_,width,_] =  prediction.shape #80 40 20 channel=16
        bs = len(groundTruth)
        scale = 640/width #8
        # loss all, loss heatmap, loss offset, loss size, loss orientation, loss class, loss iou, ...
        loss_arr = torch.zeros([1,10]).cuda()
        #import pdb;pdb.set_trace()
        heatmap_num=0
        for i in range(bs):
            batch_target = groundTruth[i]
            
            [gt_map, gt_mask] = gene_multiScaleGTmap(batch_target, scale,opt) #17sec
            # gt_map=[ 0 gaussian_map
            #          1 offset_x
            #          2 offset_y
            #          3 z
            #          4 width
            #          5 length
            #          6 height
            #          7 angle <pi or not,if <pi value=0 else value=1
            #          8 angle
            #          9 class       ]
            gt_map = gt_map.cuda()
            gt_mask = gt_mask.cuda()
            if gt_mask.sum() == 0:
                break
            # -----------------------------#
            # key point heatmap loss
            # -----------------------------#
            pred_heatmap = torch.sigmoid(prediction[i,0,:,:])
            gt_heatmap = gt_map[0,0,:,:]
            #import pdb;pdb.set_trace()
            loss_arr[0,1] += self.heatmap_loss(pred_heatmap, gt_heatmap)
            # -----------------------------#
            # x,y offset loss 
            # -----------------------------#
            pred_offset_map = scale*torch.sigmoid(prediction[i,1:3,:,:])
            gt_offset_map = gt_map[0,1:3,:,:]
            loss_arr[0,2] += 0.01*self.offset_loss(pred_offset_map, gt_offset_map, gt_mask)
            # -----------------------------#
            # z offset loss 
            # -----------------------------#
            pred_offset_map = prediction[i,3,:,:]
            gt_offset_map = gt_map[0,3,:,:]
            loss_arr[0,3] += 0.01*self.offset_loss(pred_offset_map, gt_offset_map, gt_mask)
            # -----------------------------#
            # height, width loss
            # -----------------------------#
            if opt.chooseLoss == 0:
                pred_size_map = prediction[i,4:7,:,:]
            if opt.chooseLoss == 0 or opt.chooseLoss == 1 or opt.chooseLoss == 3:
                gt_size_map = gt_map[0,4:7,:,:]
                loss_arr[0,4] += 0.01*self.size_loss(pred_size_map, gt_size_map, gt_mask)
            # -----------------------------#
            # orientation loss
            # [bin1,bin2,offset]
            # -----------------------------#
            pred_orientation_map = prediction[i,7:9,:,:]
            gt_orientation_map = gt_map[0,7:9,:,:]
            angle_bin_loss, angle_offset_loss = self.orientation_loss(pred_orientation_map, gt_orientation_map, gt_mask)
            loss_arr[0,5] += 0.1*angle_bin_loss
            loss_arr[0,6] += 0.1*angle_offset_loss
            # -----------------------------#
            # class loss
            # [bin1,bin2,bin3,bin4,bin5,bin6,bin7]
            # -----------------------------#
            pred_cls_map = prediction[i,9:,:,:]
            gt_cls_map = gt_map[0,9,:,:]
            try:
                loss_arr[0,7] += 0.1*self.cls_loss(pred_cls_map, gt_cls_map, gt_mask)
                #import pdb;pdb.set_trace()
            except:
                logger.error(pred_cls_map[...,gt_mask==1])
                print(gt_cls_map[...,gt_mask==1].shape)
                import pdb;pdb.set_trace()

        loss_arr[0, 0] += loss_arr[0, 1:].sum()
        loss_arr = loss_arr/bs
        # loss all, loss heatmap, loss offset xy, loss z, loss size, loss orientation, loss class
        return loss_arr

And this is the complete code of fit.py:

# -*- coding: utf-8 -*-
from turtle import pd
from tqdm import tqdm
import numpy as np
import torch
import matplotlib.pyplot as plt
from evaluator.utils_mAP import multi_channel_object_decode
from pycocotools.coco import COCO
from collections import defaultdict
import copy
from evaluator.MVDNet_mAPtools import RobotCarCOCOeval
from loguru import logger
from utils.allreduce_norm import all_reduce_norm
from utils.dist import gather, synchronize
import os
import itertools
import time

class fit_func():
    def __init__(self,
                 model,
                 fusionLoss,
                 optimizer,
                 train_all_iter,
                 val_all_iter,
                 train_loader,
                 val_loader,
                 lr_scheduler,
                 opts,
    ):
        self.model = model
        self.fusionLoss = fusionLoss
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.train_all_iter = train_all_iter
        self.val_all_iter = val_all_iter
        self.iter = 0
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.opts = opts
        self.log_path = os.path.join(opts.output_dir, opts.experiment_name)
        self.start_epoch = opts.start_epoch
        self.end_epoch = opts.end_epoch

        self.bbox_generator = multi_channel_object_decode()

        logger.info("args: {}".format(opts))

    def save_checkpoint(self, state, save_dir, model_name="final_model"):
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        filename = os.path.join(save_dir, model_name + "_ckpt.pth")
        print('save checkpoint '+filename)
        torch.save(state, filename)

    def save_model(self, model_name, epoch, ap_50):
        state = {
            "epoch": epoch + 1,
            "model": self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'ap_50': ap_50
            
        }
        self.save_checkpoint(state, self.log_path, model_name)

    def append_loss(self, train_loss, val_mAP):
        with open(os.path.join(self.opts.output_dir, self.opts.experiment_name, "epoch_loss" + ".txt"), 'a') as f:
            f.write(str(train_loss))
            f.write("\n")
        with open(os.path.join(self.opts.output_dir, self.opts.experiment_name, "epoch_val_mAP" + ".txt"), 'a') as f:
            f.write(str(val_mAP))
            f.write("\n")

    def loss_log(self, loss_arr):
        loss_dict = {
            'loss_all': loss_arr[0, 0].item(),
            'heatmap_loss': loss_arr[0, 1].item(),
            'xy_offset_loss': loss_arr[0, 2].item(),
            'z_offset_loss': loss_arr[0, 3].item(),
            'wlh_loss': loss_arr[0, 4].item(),
            'angle_bin': loss_arr[0, 5].item(),
            'angle_offset': loss_arr[0, 6].item(),
            'cls_loss': loss_arr[0, 7].item()
        }
        loss_str = ", ".join(
            ["{}: {:.3f}".format(k, v) for k, v in loss_dict.items()]
        )
        if self.opts.local_rank == self.opts.main_gpuid:
            logger.info(loss_str)
            #print(loss_str)

    def creat_coco(self):
        # ----------------------#
        # coco mAP
        # ----------------------#
        coco_gt = COCO()
        coco_gt.dataset = dict()
        coco_gt.anns = dict()
        coco_gt.cats = dict()
        coco_gt.imgs = dict()
        coco_gt.imgToAnns = defaultdict(list)
        coco_gt.catToImgs = defaultdict(list)
        coco_gt.dataset["images"] = []

        coco_gt.dataset["categories"] = []
        category = dict()
        category["supercategory"] = "vehicle"
        category["id"] = 1
        category["name"] = "car"
        coco_gt.dataset["categories"].append(category)
        coco_gt.dataset["annotations"] = []

        return coco_gt

    def get_lr(self):
        for param_group in self.optimizer.param_groups:
            return param_group['lr']

    @property
    def progress_in_iter(self):
        return self.epoch * self.train_all_iter + self.iter

    def training_step(self, epoch):
        # batch*pillars*N*D
        
        all_loss_train = torch.zeros((1,10)).cuda()
        
        iter_now=0
        for self.iter, batch in enumerate(self.train_loader): 
            if self.iter >= self.train_all_iter: 
                break
            iter_now+=1
            # start_time = time.time()
            self.model.train() 
            #lidar_pillar list(numpy(9,49,)
            #boxes        list(numpy(box),numpy(box))
            lidar_pillar, boxes = batch[0], batch[1]
            lidar_pillar = torch.from_numpy(lidar_pillar).type(torch.FloatTensor).cuda()
            boxes = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in boxes]
            # 
            self.optimizer.zero_grad()
            # forward
            #outpu tuple(out3(2,11,88,88),ou4(2,11,44,44),out5(2,11,22,22))
            
            output = self.model(lidar_pillar, self.opts) # time cost : 0.03377 sec

            #import pdb;pdb.set_trace()
            # -------------------
            # loss
            # -------------------
            # loss all, loss heatmap, loss offset, loss size, loss orientation, loss class, loss iou, ...
            loss_train_arr = torch.zeros([1, 10]).cuda()
            
           
            
            if self.opts.using_multi_scale == 1:
                num_output = len(output)
            else:
                num_output = 1
            
            for i in range(num_output):
                loss_train_arr += self.fusionLoss(output[i], boxes, self.opts)#time cost : 22.08293 sec
           
            loss_train_arr = loss_train_arr / num_output
            
            #loss_train_arr([1,10])
            #import pdb;pdb.set_trace()
            # ----------------------#
            #   backward
            # ----------------------#
           
            if loss_train_arr[0, 0].item() != 0: #time cost : 0.05921 sec
                loss_train_arr[0, 0].backward()
                #torch.nn.utils.clip_grad_norm_(parameters=self.model.parameters(), max_norm=1, norm_type=2)
            self.loss_log(loss_train_arr)
            if self.opts.local_rank == self.opts.main_gpuid:
                progress_str = "epoch: {}/{}, iter: {}/{} optim_lr: {}, sche_lr: {}".format(
                    epoch + 1, self.end_epoch, self.iter + 1, self.train_all_iter, self.optimizer.param_groups[0]['lr'], self.get_lr()
                )
                # import pdb;pdb.set_trace()
                logger.info(self.optimizer.param_groups[0]['params'][0])
                logger.info("{}".format(progress_str))
                
                #print("{}".format(progress_str))
            self.optimizer.step()
            self.lr_scheduler.step()
         
            if np.isnan(loss_train_arr[0,0].item()):
                continue
            # collect all loss
            
            all_loss_train += loss_train_arr
            # end = time.time()
            # running_time = end-start_time
            # print('time cost : %.5f sec' %running_time)
        # get average loss
        all_loss_train = all_loss_train/self.train_all_iter
        self.loss_log(all_loss_train)
        
        return all_loss_train[0,0]

    
    def fit(self, epoch):
        self.epoch = epoch
        # ----------------------#
        #   training step
        # ----------------------#
        if self.opts.local_rank == self.opts.main_gpuid:
            logger.info('start training!')
        all_loss = self.training_step(epoch)
        logger.info('Finish Train')
        
        if self.opts.local_rank == self.opts.main_gpuid:
            self.append_loss(all_loss.item(), 0)
            model_name = "epoch{:03d}-train_{:.3f}-val_{:.3f}".format(epoch, all_loss, 0)
            self.save_model(model_name, epoch, 0)

The complete code of 3D_train.py is as follows:

import os
from turtle import pd
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

from calendar import EPOCH
from random import shuffle
from data.wj_dataset.data_loader import MultFrame_Dataset
from torch.utils.data import DataLoader
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from loguru import logger
from utils.opts import opts
from model.FSD_module import FSD_module
from model.loss.loss_factory import get_fusionloss
import torch.optim as optim
from utils.fit import fit_func
from utils.utils import *
import time
class trainer: #36s
    def __init__(self,opts):

        self.opts = opts
        self.log_dir = os.path.join(opts.output_dir, opts.experiment_name)
        set_log_dir(self.log_dir)
        #------------------------------------------------------#
        #   path
        #------------------------------------------------------#
        self.train_path = opts.training_data
        self.val_path = opts.validation_data
        #------------------------------------------------------#
        #   gpu
        #------------------------------------------------------#
        self.ngpus_per_node = 2#torch.cuda.device_count()
        if opts.distributed:
            #import pdb;pdb.set_trace()
            torch.cuda.set_device(opts.local_rank)
            dist.init_process_group(backend='nccl')
            self.global_rank = dist.get_rank()
            if opts.local_rank == opts.main_gpuid:
                logger.info( "[{os.getpid()}] (rank={self.global_rank}, local_rank={opts.local_rank}) training ...")
                logger.info("GPU device count : ", self.ngpus_per_node)
        else:
            opts.local_rank = opts.main_gpuid
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
         #------------------------------------------------------#
        #   dataload 
        #------------------------------------------------------#
        self.trainloader,self.valloader, self.max_epoch = \
            self.get_data_loader(self.train_path,self.val_path,opts,self.ngpus_per_node)
        
        #------------------------------------------------------#
        #   train model
        #------------------------------------------------------#
        self.model = FSD_module(opts)
        self.weights_init(self.model)
        #------------------------------------------------------#
        # resume training or not
        # ------------------------------------------------------#
        self.optimizer = optim.Adam(self.model.parameters(), opts.base_lr, weight_decay=opts.weight_decay)
        if opts.resume:
            model_path = os.path.join(opts.output_dir,opts.experiment_name,opts.model_path)
            self.resume_train(model_path, opts)
        else:
            self.start_epoch = opts.start_epoch
            
             
        if opts.distributed:
            self.model = DDP(self.model.to(opts.local_rank), device_ids=[opts.local_rank], output_device=opts.local_rank, find_unused_parameters=True, broadcast_buffers=False)
        else:
            self.model = self.model.to(self.device)
        
        cudnn.benchmark = True
        
        self.lr_scheduler = self.get_lr_scheduler(opts, self.max_epoch,self.start_epoch)
        #------------------------------------------------------#
        #    loss   log 
        #------------------------------------------------------#
        self.fusionLoss = get_fusionloss(opts.fusion_loss_arch)
        self.fit_func = fit_func(
            self.model,
            self.fusionLoss,
            self.optimizer,
            self.max_epoch,
            self.max_epoch,
            self.trainloader,
            self.valloader,
            self.lr_scheduler,
            opts,
        )
        
    def resume_train(self, model_path, opts):
        # load model
        if opts.local_rank == opts.main_gpuid:
            logger.info('Load weights {}.'.format(model_path))
        model_dict = self.model.state_dict()
        stat = torch.load(model_path)
        pretrained_dict = stat['model']
        load_key, no_load_key, temp_dict = [], [], {}
        for k, v in pretrained_dict.items():
            if opts.distributed:
                new_key = k[7:]
            else:
                new_key = k
            if np.shape(model_dict[new_key]) == np.shape(v):
                temp_dict[new_key] = v
                load_key.append(new_key)
            else:
                no_load_key.append(k)
        model_dict.update(temp_dict)
        self.model.load_state_dict(model_dict,False)
        # load epoch
        self.start_epoch = stat['epoch']
        if opts.local_rank == opts.main_gpuid:
            logger.info("\nSuccessful Load Key:", str(load_key)[:500], ".......\nSuccessful Load Key Num:", len(load_key))
            logger.info("\nFail To Load Key:", str(no_load_key)[:500], ".......\nFail To Load Key num:", len(no_load_key))
            logger.info("\n current epoch:{}".format(self.start_epoch))
     
        # load optimizer
        optimizer = stat['optimizer']
        
        self.optimizer.load_state_dict(optimizer)
        
        
        for state in self.optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
                    
    def get_lr_scheduler(self, opts, iters_per_epoch, start_epoch):
        from utils.lr_scheduler import WarmupMultiStepLR
        if start_epoch == 0:
            last_epoch_param = -1
        else:
            last_epoch_param = start_epoch*iters_per_epoch
        warmup_epochs = opts.warmup_epochs
        lr_scheduler = WarmupMultiStepLR(
            self.optimizer,
            milestones=[10, 20],
            warmup_factor=0.001,
            warmup_epoch=warmup_epochs,
            iters_per_epoch=iters_per_epoch,
            last_epoch=last_epoch_param
        )
        return lr_scheduler

    def get_data_loader(self,train_path,val_path,opts,ngpus_per_node):
        train_list = os.listdir(train_path+'/bin/')
        val_list = os.listdir(val_path+'/bin/')
        batch = opts.batch_size
        num_train = len(train_list)
        num_val = len(val_list)
        # for i in range(num_train):
        #     train_list_queue = train_path+'/bin/' + '%d.bin'%i
        #     print('train name:',train_list)
        # for i in range(num_val):
        #     val_list_queue = val_path+'/bin/' + '%d.bin'%i
        max_epoch_step = num_train // batch
        #import pdb;pdb.set_trace()
        max_epoch_step_val = num_val // batch

        train_dataset=MultFrame_Dataset(train_path,train_list,opts)
        val_dataset = MultFrame_Dataset(val_path,val_list,opts)

        if opts.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
            val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
            batch_size_train = opts.batch_size // ngpus_per_node
            batch_size_val = opts.batch_size // ngpus_per_node
            shuffle = False
        else:
            train_sampler = None
            val_sampler = None
            batch_size_train = batch
            batch_size_val = opts.batch_size
            shuffle = True

        Train_loader = DataLoader(
            train_dataset,
            shuffle=shuffle,
            batch_size=batch_size_train,
            #num_workers=2,
            pin_memory=True,
            drop_last=True,
            collate_fn=MultFrame_Dataset.dataset_collate,
            sampler=train_sampler
        )
        Val_loader = DataLoader(
            val_dataset,
            shuffle=shuffle,
            batch_size=batch_size_val,
            #num_workers=2,
            pin_memory=True,
            drop_last=True,
            collate_fn=MultFrame_Dataset.dataset_collate,
            sampler=val_sampler
        )

        return Train_loader ,Val_loader, max_epoch_step


    def weights_init(self, net, init_type='normal', init_gain=0.02):
        def init_func(m):
            classname = m.__class__.__name__
            if hasattr(m, 'weight') and classname.find('Conv') != -1:
                if init_type == 'normal':
                    torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
                elif init_type == 'xavier':
                    torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
                elif init_type == 'kaiming':
                    torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
                elif init_type == 'orthogonal':
                    torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
                else:
                    raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
            elif classname.find('BatchNorm2d') != -1:
                torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
                torch.nn.init.constant_(m.bias.data, 0.0)
        logger.info('initialize network with %s type' % init_type)
        #print('initialize network with %s type' % init_type)
        net.apply(init_func)
    def train(self):
        #------------------------------------------------------#
        #   fit
        #------------------------------------------------------#
        for epoch in range(self.start_epoch, self.opts.end_epoch):
            if self.opts.distributed:
                self.trainloader.sampler.set_epoch(epoch)
            self.fit_func.fit(epoch)


        
if __name__ == "__main__":
    opt = opts().parse()
    trainer = trainer(opt)
    trainer.train()

What I actually need is the definition of self.loss_log called in self.loss_log(all_loss_train).
Additionally, in your code it is better to detach the tensor from the computational graph and move it to the CPU with tensor.detach().cpu() for tensors that are not needed for backpropagation.

For example

logger.error(pred_cls_map[...,gt_mask==1].detach().cpu())
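
Applied to the accumulation in training_step, a minimal sketch of the same idea (only the scalar you back-propagate needs the graph; anything stored across iterations is detached first) might look like this:

if loss_train_arr[0, 0].item() != 0:
    loss_train_arr[0, 0].backward()

self.optimizer.step()
self.lr_scheduler.step()

# Detach before logging/accumulating so no graph survives this iteration.
loss_train_arr = loss_train_arr.detach()
self.loss_log(loss_train_arr)
if not np.isnan(loss_train_arr[0, 0].item()):
    all_loss_train += loss_train_arr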

Problem solved; it turned out to be a data problem.