OK — I combined six losses into a single fusion loss. The fusion-loss code is the following:
class fusion_loss_base(nn.Module):
    """Fuses the per-task detection losses (heatmap, x/y offset, z offset,
    size, orientation bin + offset, class) into one weighted loss vector.

    forward() returns a (1, 10) CUDA tensor:
      [0] total, [1] heatmap, [2] xy offset, [3] z offset, [4] w/l/h size,
      [5] angle bin, [6] angle offset, [7] class, [8..9] reserved/unused.
    """

    def __init__(self):
        super().__init__()
        self.heatmap_loss = FocalLoss()           # center-point heatmap
        self.offset_loss = RegLossL1()            # x/y and z offset regression
        self.size_loss = RegLossL1()              # width/length/height regression
        self.orientation_loss = multi_bin_loss()  # multi-bin angle (bin + offset)
        self.cls_loss = cls_loss()                # per-location classification
        self.iou_loss = iou_loss()                # instantiated but not used in forward()

    def forward(self, prediction, groundTruth, opt):
        """Compute the fused loss for one feature-map scale.

        Args:
            prediction: (B, C, H, W) raw head output for this scale; channel
                layout: 0 heatmap, 1:3 x/y offset, 3 z, 4:7 w/l/h,
                7:9 orientation [bin, offset], 9: class scores.
            groundTruth: list (length B) of per-sample label tensors
                [id, center_x, center_y, width, length, angle, class].
                Classes: 0 bus, 1 car, 2 bicycle, 3 pedestrian, 4 tricycle,
                5 semitrailer, 6 truck.
            opt: options namespace; `chooseLoss` selects the size-loss variant.

        Returns:
            (1, 10) CUDA tensor of batch-averaged losses (see class docstring).
        """
        [_, _, width, _] = prediction.shape  # feature width: 80 / 40 / 20
        bs = len(groundTruth)
        # NOTE(review): 640 is the assumed network input resolution - confirm.
        scale = 640 / width  # stride of this scale (8 / 16 / 32)
        loss_arr = torch.zeros([1, 10]).cuda()
        for i in range(bs):
            batch_target = groundTruth[i]
            # gt_map channels: 0 gaussian heatmap, 1 offset_x, 2 offset_y, 3 z,
            # 4 width, 5 length, 6 height, 7 angle-bin (0 if angle < pi else 1),
            # 8 angle offset, 9 class
            [gt_map, gt_mask] = gene_multiScaleGTmap(batch_target, scale, opt)
            gt_map = gt_map.cuda()
            gt_mask = gt_mask.cuda()
            if gt_mask.sum() == 0:
                # BUGFIX: was `break`, which silently dropped the loss of every
                # remaining sample in the batch; skip only this sample instead.
                continue
            # -----------------------------#
            #   key-point heatmap loss
            # -----------------------------#
            pred_heatmap = torch.sigmoid(prediction[i, 0, :, :])
            gt_heatmap = gt_map[0, 0, :, :]
            loss_arr[0, 1] += self.heatmap_loss(pred_heatmap, gt_heatmap)
            # -----------------------------#
            #   x, y offset loss
            # -----------------------------#
            pred_offset_map = scale * torch.sigmoid(prediction[i, 1:3, :, :])
            gt_offset_map = gt_map[0, 1:3, :, :]
            loss_arr[0, 2] += 0.01 * self.offset_loss(pred_offset_map, gt_offset_map, gt_mask)
            # -----------------------------#
            #   z offset loss
            # -----------------------------#
            pred_z_map = prediction[i, 3, :, :]
            gt_z_map = gt_map[0, 3, :, :]
            loss_arr[0, 3] += 0.01 * self.offset_loss(pred_z_map, gt_z_map, gt_mask)
            # -----------------------------#
            #   size (w/l/h) loss
            # -----------------------------#
            if opt.chooseLoss == 0 or opt.chooseLoss == 1 or opt.chooseLoss == 3:
                # BUGFIX: the original assigned pred_size_map only when
                # chooseLoss == 0, so modes 1 and 3 hit an unbound local.
                # NOTE(review): assumes modes 1/3 use the same 4:7 slice - confirm.
                pred_size_map = prediction[i, 4:7, :, :]
                gt_size_map = gt_map[0, 4:7, :, :]
                loss_arr[0, 4] += 0.01 * self.size_loss(pred_size_map, gt_size_map, gt_mask)
            # -----------------------------#
            #   orientation loss [bin, offset]
            # -----------------------------#
            pred_orientation_map = prediction[i, 7:9, :, :]
            gt_orientation_map = gt_map[0, 7:9, :, :]
            angle_bin_loss, angle_offset_loss = self.orientation_loss(pred_orientation_map, gt_orientation_map, gt_mask)
            loss_arr[0, 5] += 0.1 * angle_bin_loss
            loss_arr[0, 6] += 0.1 * angle_offset_loss
            # -----------------------------#
            #   class loss
            # -----------------------------#
            pred_cls_map = prediction[i, 9:, :, :]
            gt_cls_map = gt_map[0, 9, :, :]
            try:
                loss_arr[0, 7] += 0.1 * self.cls_loss(pred_cls_map, gt_cls_map, gt_mask)
            except Exception:
                # BUGFIX: was a bare `except:` that dropped into pdb; log the
                # offending tensors and re-raise so failures are not swallowed.
                logger.error(pred_cls_map[..., gt_mask == 1])
                logger.error(gt_cls_map[..., gt_mask == 1].shape)
                raise
        # total = sum of the individual (already weighted) terms, then batch mean
        loss_arr[0, 0] += loss_arr[0, 1:].sum()
        loss_arr = loss_arr / bs
        return loss_arr
And this is the complete code of `fit.py`:
# -- coding: utf-8 --
# NOTE(review): `pd` here is turtle's pen-down alias - almost certainly an
# accidental IDE auto-import (pandas was probably intended); it is unused.
from turtle import pd
from tqdm import tqdm
import numpy as np
import torch
import matplotlib.pyplot as plt
from evaluator.utils_mAP import multi_channel_object_decode
from pycocotools.coco import COCO
from collections import defaultdict
import copy
from evaluator.MVDNet_mAPtools import RobotCarCOCOeval
from loguru import logger
from utils.allreduce_norm import all_reduce_norm
from utils.dist import gather, synchronize
import os
import itertools
import time
class fit_func():
    """Epoch/iteration training driver.

    Wraps the model, fused loss, optimizer and LR scheduler; runs the
    per-epoch training loop, logs per-iteration losses, and saves
    checkpoints on the main GPU.
    """

    def __init__(self,
                 model,
                 fusionLoss,
                 optimizer,
                 train_all_iter,
                 val_all_iter,
                 train_loader,
                 val_loader,
                 lr_scheduler,
                 opts,
                 ):
        self.model = model
        self.fusionLoss = fusionLoss
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        # nominal iterations per epoch for the train / val loaders
        self.train_all_iter = train_all_iter
        self.val_all_iter = val_all_iter
        self.iter = 0  # current iteration index within the running epoch
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.opts = opts
        self.log_path = os.path.join(opts.output_dir, opts.experiment_name)
        self.start_epoch = opts.start_epoch
        self.end_epoch = opts.end_epoch
        self.bbox_generator = multi_channel_object_decode()
        logger.info("args: {}".format(opts))

    def save_checkpoint(self, state, save_dir, model_name="final_model"):
        """Serialize `state` to <save_dir>/<model_name>_ckpt.pth, creating the dir if needed."""
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        filename = os.path.join(save_dir, model_name + "_ckpt.pth")
        print('save checkpoint ' + filename)
        torch.save(state, filename)

    def save_model(self, model_name, epoch, ap_50):
        """Bundle model/optimizer state plus metadata and write a checkpoint."""
        state = {
            "epoch": epoch + 1,  # next epoch to resume from
            "model": self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'ap_50': ap_50
        }
        self.save_checkpoint(state, self.log_path, model_name)

    def append_loss(self, train_loss, val_mAP):
        """Append the epoch's train loss and val mAP to their respective text logs."""
        with open(os.path.join(self.opts.output_dir, self.opts.experiment_name, "epoch_loss" + ".txt"), 'a') as f:
            f.write(str(train_loss))
            f.write("\n")
        with open(os.path.join(self.opts.output_dir, self.opts.experiment_name, "epoch_val_mAP" + ".txt"), 'a') as f:
            f.write(str(val_mAP))
            f.write("\n")

    def loss_log(self, loss_arr):
        """Pretty-print the (1, 10) loss tensor; only the main GPU actually logs."""
        loss_dict = {
            'loss_all': loss_arr[0, 0].item(),
            'heatmap_loss': loss_arr[0, 1].item(),
            'xy_offset_loss': loss_arr[0, 2].item(),
            'z_offset_loss': loss_arr[0, 3].item(),
            'wlh_loss': loss_arr[0, 4].item(),
            'angle_bin': loss_arr[0, 5].item(),
            'angle_offset': loss_arr[0, 6].item(),
            'cls_loss': loss_arr[0, 7].item()
        }
        loss_str = ", ".join(
            ["{}: {:.3f}".format(k, v) for k, v in loss_dict.items()]
        )
        if self.opts.local_rank == self.opts.main_gpuid:
            logger.info(loss_str)

    def creat_coco(self):
        """Build an empty in-memory COCO ground-truth container for mAP evaluation.

        (Keeps the existing `creat_coco` spelling since callers may depend on it.)
        Currently registers a single category: id 1, "car" / "vehicle".
        """
        coco_gt = COCO()
        coco_gt.dataset = dict()
        coco_gt.anns = dict()
        coco_gt.cats = dict()
        coco_gt.imgs = dict()
        coco_gt.imgToAnns = defaultdict(list)
        coco_gt.catToImgs = defaultdict(list)
        coco_gt.dataset["images"] = []
        coco_gt.dataset["categories"] = []
        category = dict()
        category["supercategory"] = "vehicle"
        category["id"] = 1
        category["name"] = "car"
        coco_gt.dataset["categories"].append(category)
        coco_gt.dataset["annotations"] = []
        return coco_gt

    def get_lr(self):
        """Return the learning rate of the first optimizer param group."""
        for param_group in self.optimizer.param_groups:
            return param_group['lr']

    @property
    def progress_in_iter(self):
        # Global iteration counter across epochs.
        # NOTE(review): reads self.epoch, which is only set in fit() - do not
        # access this property before fit() has run at least once.
        return self.epoch * self.train_all_iter + self.iter

    def training_step(self, epoch):
        """Run one training epoch; return the epoch-averaged total loss (0-dim tensor)."""
        # accumulator layout matches fusion loss: [all, heatmap, xy, z, size, bin, offset, cls, ...]
        all_loss_train = torch.zeros((1, 10)).cuda()
        iter_now = 0
        for self.iter, batch in enumerate(self.train_loader):
            if self.iter >= self.train_all_iter:
                break
            iter_now += 1
            self.model.train()
            # lidar_pillar: numpy batch of pillar features; boxes: list of per-sample label arrays
            lidar_pillar, boxes = batch[0], batch[1]
            lidar_pillar = torch.from_numpy(lidar_pillar).type(torch.FloatTensor).cuda()
            boxes = [torch.from_numpy(ann).type(torch.FloatTensor).cuda() for ann in boxes]
            self.optimizer.zero_grad()
            # output: tuple of per-scale heads, e.g. (B,11,88,88)/(B,11,44,44)/(B,11,22,22)
            output = self.model(lidar_pillar, self.opts)
            loss_train_arr = torch.zeros([1, 10]).cuda()
            if self.opts.using_multi_scale == 1:
                num_output = len(output)
            else:
                num_output = 1
            # average the fused loss over the scales that were evaluated
            for i in range(num_output):
                loss_train_arr += self.fusionLoss(output[i], boxes, self.opts)
            loss_train_arr = loss_train_arr / num_output
            # backward only when at least one labelled object produced a loss
            if loss_train_arr[0, 0].item() != 0:
                loss_train_arr[0, 0].backward()
            self.loss_log(loss_train_arr)
            if self.opts.local_rank == self.opts.main_gpuid:
                progress_str = "epoch: {}/{}, iter: {}/{} optim_lr: {}, sche_lr: {}".format(
                    epoch + 1, self.end_epoch, self.iter + 1, self.train_all_iter, self.optimizer.param_groups[0]['lr'], self.get_lr()
                )
                # NOTE(review): logs a full parameter tensor every iteration -
                # looks like a debug leftover; confirm before removing.
                logger.info(self.optimizer.param_groups[0]['params'][0])
                logger.info("{}".format(progress_str))
            self.optimizer.step()
            self.lr_scheduler.step()
            # skip accumulating NaN losses (note: the optimizer step above has
            # already been applied for this iteration)
            if np.isnan(loss_train_arr[0, 0].item()):
                continue
            all_loss_train += loss_train_arr
        # NOTE(review): averages by the nominal iteration count, not iter_now,
        # so skipped NaN iterations still dilute the average - confirm intended.
        all_loss_train = all_loss_train / self.train_all_iter
        self.loss_log(all_loss_train)
        return all_loss_train[0, 0]

    def fit(self, epoch):
        """Train one epoch on every rank; log and checkpoint on the main GPU only."""
        self.epoch = epoch
        if self.opts.local_rank == self.opts.main_gpuid:
            logger.info('start training!')
        all_loss = self.training_step(epoch)
        logger.info('Finish Train')
        if self.opts.local_rank == self.opts.main_gpuid:
            # validation mAP is not computed here; 0 is written as a placeholder
            self.append_loss(all_loss.item(), 0)
            model_name = "epoch{:03d}-train_{:.3f}-val_{:.3f}".format(epoch, all_loss, 0)
            self.save_model(model_name, epoch, 0)
The complete code of `3D_train.py` is as follows:
import os
# NOTE(review): turtle's `pd` (pen-down alias) looks like an accidental IDE
# auto-import (pandas was probably intended); it is unused in this script.
from turtle import pd
# Must be set before torch initializes CUDA so only physical device 1 is visible.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
from calendar import EPOCH  # NOTE(review): unused auto-import
from random import shuffle  # NOTE(review): shadowed by local `shuffle` variables below
from data.wj_dataset.data_loader import MultFrame_Dataset
from torch.utils.data import DataLoader
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from loguru import logger
from utils.opts import opts
from model.FSD_module import FSD_module
from model.loss.loss_factory import get_fusionloss
import torch.optim as optim
from utils.fit import fit_func
from utils.utils import *
import time
class trainer:
    """End-to-end training harness.

    Builds data loaders, model, optimizer, LR scheduler and the fused loss,
    then drives epoch training through `fit_func`. Supports single-GPU and
    NCCL-based distributed (DDP) training, plus checkpoint resume.
    """

    def __init__(self, opts):
        self.opts = opts
        self.log_dir = os.path.join(opts.output_dir, opts.experiment_name)
        set_log_dir(self.log_dir)
        # ------------------------------------------------------#
        #   paths
        # ------------------------------------------------------#
        self.train_path = opts.training_data
        self.val_path = opts.validation_data
        # ------------------------------------------------------#
        #   gpu / distributed setup
        # ------------------------------------------------------#
        # NOTE(review): GPU count is hard-coded (was torch.cuda.device_count()).
        self.ngpus_per_node = 2
        if opts.distributed:
            torch.cuda.set_device(opts.local_rank)
            dist.init_process_group(backend='nccl')
            self.global_rank = dist.get_rank()
            if opts.local_rank == opts.main_gpuid:
                # BUGFIX: the original string lacked its f-prefix, so the
                # placeholders were logged literally.
                logger.info(f"[{os.getpid()}] (rank={self.global_rank}, local_rank={opts.local_rank}) training ...")
                # BUGFIX: loguru does not join print-style positional args;
                # format the message explicitly.
                logger.info("GPU device count : {}".format(self.ngpus_per_node))
        else:
            opts.local_rank = opts.main_gpuid
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # ------------------------------------------------------#
        #   data loaders
        # ------------------------------------------------------#
        self.trainloader, self.valloader, self.max_epoch = \
            self.get_data_loader(self.train_path, self.val_path, opts, self.ngpus_per_node)
        # ------------------------------------------------------#
        #   model
        # ------------------------------------------------------#
        self.model = FSD_module(opts)
        self.weights_init(self.model)
        # ------------------------------------------------------#
        #   optimizer / resume
        # ------------------------------------------------------#
        self.optimizer = optim.Adam(self.model.parameters(), opts.base_lr, weight_decay=opts.weight_decay)
        if opts.resume:
            model_path = os.path.join(opts.output_dir, opts.experiment_name, opts.model_path)
            self.resume_train(model_path, opts)
        else:
            self.start_epoch = opts.start_epoch
        if opts.distributed:
            self.model = DDP(self.model.to(opts.local_rank), device_ids=[opts.local_rank], output_device=opts.local_rank, find_unused_parameters=True, broadcast_buffers=False)
        else:
            self.model = self.model.to(self.device)
        cudnn.benchmark = True
        # self.max_epoch is iterations-per-epoch (see get_data_loader)
        self.lr_scheduler = self.get_lr_scheduler(opts, self.max_epoch, self.start_epoch)
        # ------------------------------------------------------#
        #   fused loss + fit driver
        # ------------------------------------------------------#
        self.fusionLoss = get_fusionloss(opts.fusion_loss_arch)
        self.fit_func = fit_func(
            self.model,
            self.fusionLoss,
            self.optimizer,
            self.max_epoch,
            self.max_epoch,
            self.trainloader,
            self.valloader,
            self.lr_scheduler,
            opts,
        )

    def resume_train(self, model_path, opts):
        """Restore model weights, start epoch and optimizer state from a checkpoint.

        Only parameters whose (possibly "module."-stripped) name exists in the
        current model with a matching shape are loaded; the rest are reported.
        """
        if opts.local_rank == opts.main_gpuid:
            logger.info('Load weights {}.'.format(model_path))
        model_dict = self.model.state_dict()
        stat = torch.load(model_path)
        pretrained_dict = stat['model']
        load_key, no_load_key, temp_dict = [], [], {}
        for k, v in pretrained_dict.items():
            # DDP checkpoints prefix every key with "module." - strip it.
            if opts.distributed:
                new_key = k[7:]
            else:
                new_key = k
            # BUGFIX: guard against keys absent from the current model, which
            # previously raised KeyError instead of being reported as skipped.
            if new_key in model_dict and np.shape(model_dict[new_key]) == np.shape(v):
                temp_dict[new_key] = v
                load_key.append(new_key)
            else:
                no_load_key.append(k)
        model_dict.update(temp_dict)
        self.model.load_state_dict(model_dict, False)
        # resume epoch counter
        self.start_epoch = stat['epoch']
        if opts.local_rank == opts.main_gpuid:
            # BUGFIX: loguru takes one pre-formatted message, not print-style
            # positional args (which it silently dropped before).
            logger.info("\nSuccessful Load Key: {} .......\nSuccessful Load Key Num: {}".format(str(load_key)[:500], len(load_key)))
            logger.info("\nFail To Load Key: {} .......\nFail To Load Key num: {}".format(str(no_load_key)[:500], len(no_load_key)))
            logger.info("\n current epoch:{}".format(self.start_epoch))
        # restore optimizer state and move its tensors back onto the GPU
        self.optimizer.load_state_dict(stat['optimizer'])
        for state in self.optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()

    def get_lr_scheduler(self, opts, iters_per_epoch, start_epoch):
        """Create the warmup multi-step LR scheduler, resuming mid-run if needed."""
        from utils.lr_scheduler import WarmupMultiStepLR
        # last_epoch is measured in iterations, hence the multiplication
        if start_epoch == 0:
            last_epoch_param = -1
        else:
            last_epoch_param = start_epoch * iters_per_epoch
        warmup_epochs = opts.warmup_epochs
        lr_scheduler = WarmupMultiStepLR(
            self.optimizer,
            milestones=[10, 20],
            warmup_factor=0.001,
            warmup_epoch=warmup_epochs,
            iters_per_epoch=iters_per_epoch,
            last_epoch=last_epoch_param
        )
        return lr_scheduler

    def get_data_loader(self, train_path, val_path, opts, ngpus_per_node):
        """Build train/val DataLoaders.

        Returns (train_loader, val_loader, iters_per_epoch), where
        iters_per_epoch = num_train_samples // batch_size.
        """
        train_list = os.listdir(train_path + '/bin/')
        val_list = os.listdir(val_path + '/bin/')
        batch = opts.batch_size
        num_train = len(train_list)
        num_val = len(val_list)
        max_epoch_step = num_train // batch
        max_epoch_step_val = num_val // batch
        train_dataset = MultFrame_Dataset(train_path, train_list, opts)
        val_dataset = MultFrame_Dataset(val_path, val_list, opts)
        if opts.distributed:
            # the sampler shards data across ranks; shuffling is delegated to it
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
            val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
            batch_size_train = opts.batch_size // ngpus_per_node
            batch_size_val = opts.batch_size // ngpus_per_node
            shuffle = False
        else:
            train_sampler = None
            val_sampler = None
            batch_size_train = batch
            batch_size_val = opts.batch_size
            shuffle = True
        Train_loader = DataLoader(
            train_dataset,
            shuffle=shuffle,
            batch_size=batch_size_train,
            pin_memory=True,
            drop_last=True,
            collate_fn=MultFrame_Dataset.dataset_collate,
            sampler=train_sampler
        )
        Val_loader = DataLoader(
            val_dataset,
            shuffle=shuffle,
            batch_size=batch_size_val,
            pin_memory=True,
            drop_last=True,
            collate_fn=MultFrame_Dataset.dataset_collate,
            sampler=val_sampler
        )
        return Train_loader, Val_loader, max_epoch_step

    def weights_init(self, net, init_type='normal', init_gain=0.02):
        """Initialize Conv weights with the chosen scheme and BatchNorm2d to N(1, 0.02)."""
        def init_func(m):
            classname = m.__class__.__name__
            if hasattr(m, 'weight') and classname.find('Conv') != -1:
                if init_type == 'normal':
                    torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
                elif init_type == 'xavier':
                    torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
                elif init_type == 'kaiming':
                    torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
                elif init_type == 'orthogonal':
                    torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
                else:
                    raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
            elif classname.find('BatchNorm2d') != -1:
                torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
                torch.nn.init.constant_(m.bias.data, 0.0)
        logger.info('initialize network with %s type' % init_type)
        net.apply(init_func)

    def train(self):
        """Run the training loop from start_epoch to end_epoch."""
        for epoch in range(self.start_epoch, self.opts.end_epoch):
            if self.opts.distributed:
                # reshuffle the distributed shards each epoch
                self.trainloader.sampler.set_epoch(epoch)
            self.fit_func.fit(epoch)
if __name__ == "__main__":
    # Parse CLI options and launch training.
    opt = opts().parse()
    # Use a distinct name so the `trainer` class is not shadowed by its instance.
    app = trainer(opt)
    app.train()