I am using this code for object detection (source).
The code is written to be run on one gpu, but it has the option of multiple gpus via nn.Dataparallel as well.
Unfortunately even when i uncomment the nn.Dataparallel it still use one GPU.
Can anyone give me any suggestion on what can be wrong
I uncomment the DataParallel command in the following codes:
from __future__ import print_function
import numpy as np
import torch
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from lib.layers import *
from lib.utils.timer import Timer
from lib.utils.data_augment import preproc
from lib.modeling.model_builder import create_model
from lib.utils.config_parse import cfg
class ObjectDetector:
def __init__(self, viz_arch=False):
self.cfg = cfg
# Build model
print('===> Building model')
self.model, self.priorbox = create_model(cfg.MODEL)
self.priors = Variable(self.priorbox.forward(), volatile=True)
# Print the model architecture and parameters
if viz_arch is True:
print('Model architectures:\n{}\n'.format(self.model))
# Utilize GPUs for computation
self.use_gpu = torch.cuda.is_available()
self.half = False
if self.use_gpu:
print('Utilize GPUs for computation')
print('Number of GPU available', torch.cuda.device_count())
self.model.cuda()
self.priors.cuda()
cudnn.benchmark = True
self.model = torch.nn.DataParallel(self.model).module
# Utilize half precision
self.half = cfg.MODEL.HALF_PRECISION
if self.half:
self.model = self.model.half()
self.priors = self.priors.half()
# Build preprocessor and detector
self.preprocessor = preproc(cfg.MODEL.IMAGE_SIZE, cfg.DATASET.PIXEL_MEANS, -2)
self.detector = Detect(cfg.POST_PROCESS, self.priors)
# Load weight:
if cfg.RESUME_CHECKPOINT == '':
AssertionError('RESUME_CHECKPOINT can not be empty')
print('=> loading checkpoint {:s}'.format(cfg.RESUME_CHECKPOINT))
checkpoint = torch.load(cfg.RESUME_CHECKPOINT)
# checkpoint = torch.load(cfg.RESUME_CHECKPOINT, map_location='gpu' if self.use_gpu else 'cpu')
self.model.load_state_dict(checkpoint)
# test only
self.model.eval()
def predict(self, img, threshold=0.6, check_time=False):
# make sure the input channel is 3
assert img.shape[2] == 3
scale = torch.Tensor([img.shape[1::-1], img.shape[1::-1]])
_t = {'preprocess': Timer(), 'net_forward': Timer(), 'detect': Timer(), 'output': Timer()}
# preprocess image
_t['preprocess'].tic()
x = Variable(self.preprocessor(img)[0].unsqueeze(0))
if self.use_gpu:
x = x.cuda()
if self.half:
x = x.half()
preprocess_time = _t['preprocess'].toc()
# forward
_t['net_forward'].tic()
out = self.model(x) # forward pass
net_forward_time = _t['net_forward'].toc()
# detect
_t['detect'].tic()
detections = self.detector.forward(out)
detect_time = _t['detect'].toc()
# output
_t['output'].tic()
labels, scores, coords = [list() for _ in range(3)]
# for batch in range(detections.size(0)):
# print('Batch:', batch)
batch=0
for classes in range(detections.size(1)):
num = 0
while detections[batch,classes,num,0] >= threshold:
scores.append(detections[batch,classes,num,0])
labels.append(classes-1)
coords.append(detections[batch,classes,num,1:]*scale)
num+=1
output_time = _t['output'].toc()
total_time = preprocess_time + net_forward_time + detect_time + output_time
if check_time is True:
return labels, scores, coords, (total_time, preprocess_time, net_forward_time, detect_time, output_time)
# total_time = preprocess_time + net_forward_time + detect_time + output_time
# print('total time: {} \n preprocess: {} \n net_forward: {} \n detect: {} \n output: {}'.format(
# total_time, preprocess_time, net_forward_time, detect_time, output_time
# ))
return labels, scores, coords
from __future__ import print_function
import numpy as np
import os
import sys
import cv2
import random
import pickle
import torch
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.utils.data as data
import torch.nn.init as init
from tensorboardX import SummaryWriter
from lib.layers import *
from lib.utils.timer import Timer
from lib.utils.data_augment import preproc
from lib.modeling.model_builder import create_model
from lib.dataset.dataset_factory import load_data
from lib.utils.config_parse import cfg
from lib.utils.eval_utils import *
from lib.utils.visualize_utils import *
class Solver(object):
"""
A wrapper class for the training process
"""
def __init__(self):
self.cfg = cfg
# Load data
print('===> Loading data')
self.train_loader = load_data(cfg.DATASET, 'train') if 'train' in cfg.PHASE else None
self.eval_loader = load_data(cfg.DATASET, 'eval') if 'eval' in cfg.PHASE else None
self.test_loader = load_data(cfg.DATASET, 'test') if 'test' in cfg.PHASE else None
self.visualize_loader = load_data(cfg.DATASET, 'visualize') if 'visualize' in cfg.PHASE else None
# Build model
print('===> Building model')
self.model, self.priorbox = create_model(cfg.MODEL)
self.priors = Variable(self.priorbox.forward())
self.detector = Detect(cfg.POST_PROCESS, self.priors)
# Utilize GPUs for computation
self.use_gpu = torch.cuda.is_available()
if self.use_gpu:
print('Utilize GPUs for computation')
print('Number of GPU available', torch.cuda.device_count())
self.model.cuda()
self.priors.cuda()
cudnn.benchmark = True
if torch.cuda.device_count() > 1:
self.model = torch.nn.DataParallel(self.model,device_ids=[0, 1]).module
# Print the model architecture and parameters
print('Model architectures:\n{}\n'.format(self.model))
# print('Parameters and size:')
# for name, param in self.model.named_parameters():
# print('{}: {}'.format(name, list(param.size())))
# print trainable scope
print('Trainable scope: {}'.format(cfg.TRAIN.TRAINABLE_SCOPE))
trainable_param = self.trainable_param(cfg.TRAIN.TRAINABLE_SCOPE)
self.optimizer = self.configure_optimizer(trainable_param, cfg.TRAIN.OPTIMIZER)
self.exp_lr_scheduler = self.configure_lr_scheduler(self.optimizer, cfg.TRAIN.LR_SCHEDULER)
self.max_epochs = cfg.TRAIN.MAX_EPOCHS
# metric
self.criterion = MultiBoxLoss(cfg.MATCHER, self.priors, self.use_gpu)
# Set the logger
self.writer = SummaryWriter(log_dir=cfg.LOG_DIR)
self.output_dir = cfg.EXP_DIR
self.checkpoint = cfg.RESUME_CHECKPOINT
self.checkpoint_prefix = cfg.CHECKPOINTS_PREFIX
def save_checkpoints(self, epochs, iters=None):
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
if iters:
filename = self.checkpoint_prefix + '_epoch_{:d}_iter_{:d}'.format(epochs, iters) + '.pth'
else:
filename = self.checkpoint_prefix + '_epoch_{:d}'.format(epochs) + '.pth'
filename = os.path.join(self.output_dir, filename)
torch.save(self.model.state_dict(), filename)
with open(os.path.join(self.output_dir, 'checkpoint_list.txt'), 'a') as f:
f.write('epoch {epoch:d}: {filename}\n'.format(epoch=epochs, filename=filename))
print('Wrote snapshot to: {:s}'.format(filename))
# TODO: write relative cfg under the same page
def resume_checkpoint(self, resume_checkpoint):
if resume_checkpoint == '' or not os.path.isfile(resume_checkpoint):
print(("=> no checkpoint found at '{}'".format(resume_checkpoint)))
return False
print(("=> loading checkpoint '{:s}'".format(resume_checkpoint)))
checkpoint = torch.load(resume_checkpoint)
# print("=> Weigths in the checkpoints:")
# print([k for k, v in list(checkpoint.items())])
# remove the module in the parrallel model
if 'module.' in list(checkpoint.items())[0][0]:
pretrained_dict = {'.'.join(k.split('.')[1:]): v for k, v in list(checkpoint.items())}
checkpoint = pretrained_dict
resume_scope = self.cfg.TRAIN.RESUME_SCOPE
# extract the weights based on the resume scope
if resume_scope != '':
pretrained_dict = {}
for k, v in list(checkpoint.items()):
for resume_key in resume_scope.split(','):
if resume_key in k:
pretrained_dict[k] = v
break
checkpoint = pretrained_dict
pretrained_dict = {k: v for k, v in checkpoint.items() if k in self.model.state_dict()}
# print("=> Resume weigths:")
# print([k for k, v in list(pretrained_dict.items())])
checkpoint = self.model.state_dict()
unresume_dict = set(checkpoint)-set(pretrained_dict)
if len(unresume_dict) != 0:
print("=> UNResume weigths:")
print(unresume_dict)
checkpoint.update(pretrained_dict)
return self.model.load_state_dict(checkpoint)
def find_previous(self):
if not os.path.exists(os.path.join(self.output_dir, 'checkpoint_list.txt')):
return False
with open(os.path.join(self.output_dir, 'checkpoint_list.txt'), 'r') as f:
lineList = f.readlines()
epoches, resume_checkpoints = [list() for _ in range(2)]
for line in lineList:
epoch = int(line[line.find('epoch ') + len('epoch '): line.find(':')])
checkpoint = line[line.find(':') + 2:-1]
epoches.append(epoch)
resume_checkpoints.append(checkpoint)
return epoches, resume_checkpoints
def weights_init(self, m):
for key in m.state_dict():
if key.split('.')[-1] == 'weight':
if 'conv' in key:
init.kaiming_normal(m.state_dict()[key], mode='fan_out')
if 'bn' in key:
m.state_dict()[key][...] = 1
elif key.split('.')[-1] == 'bias':
m.state_dict()[key][...] = 0
def initialize(self):
if self.checkpoint:
print('Loading initial model weights from {:s}'.format(self.checkpoint))
self.resume_checkpoint(self.checkpoint)
start_epoch = 0
return start_epoch
def trainable_param(self, trainable_scope):
for param in self.model.parameters():
param.requires_grad = False
trainable_param = []
for module in trainable_scope.split(','):
if hasattr(self.model, module):
# print(getattr(self.model, module))
for param in getattr(self.model, module).parameters():
param.requires_grad = True
trainable_param.extend(getattr(self.model, module).parameters())
return trainable_param
def train_model(self):
previous = self.find_previous()
if previous:
start_epoch = previous[0][-1]
self.resume_checkpoint(previous[1][-1])
else:
start_epoch = self.initialize()
# export graph for the model, onnx always not works
# self.export_graph()
# warm_up epoch
warm_up = self.cfg.TRAIN.LR_SCHEDULER.WARM_UP_EPOCHS
for epoch in iter(range(start_epoch+1, self.max_epochs+1)):
#learning rate
sys.stdout.write('\rEpoch {epoch:d}/{max_epochs:d}:\n'.format(epoch=epoch, max_epochs=self.max_epochs))
if epoch > warm_up:
self.exp_lr_scheduler.step(epoch-warm_up)
if 'train' in cfg.PHASE:
self.train_epoch(self.model, self.train_loader, self.optimizer, self.criterion, self.writer, epoch, self.use_gpu)
if 'eval' in cfg.PHASE:
self.eval_epoch(self.model, self.eval_loader, self.detector, self.criterion, self.writer, epoch, self.use_gpu)
if 'test' in cfg.PHASE:
self.test_epoch(self.model, self.test_loader, self.detector, self.output_dir, self.use_gpu)
if 'visualize' in cfg.PHASE:
self.visualize_epoch(self.model, self.visualize_loader, self.priorbox, self.writer, epoch, self.use_gpu)
if epoch % cfg.TRAIN.CHECKPOINTS_EPOCHS == 0:
self.save_checkpoints(epoch)
def test_model(self):
previous = self.find_previous()
if previous:
for epoch, resume_checkpoint in zip(previous[0], previous[1]):
if self.cfg.TEST.TEST_SCOPE[0] <= epoch <= self.cfg.TEST.TEST_SCOPE[1]:
sys.stdout.write('\rEpoch {epoch:d}/{max_epochs:d}:\n'.format(epoch=epoch, max_epochs=self.cfg.TEST.TEST_SCOPE[1]))
self.resume_checkpoint(resume_checkpoint)
if 'eval' in cfg.PHASE:
self.eval_epoch(self.model, self.eval_loader, self.detector, self.criterion, self.writer, epoch, self.use_gpu)
if 'test' in cfg.PHASE:
self.test_epoch(self.model, self.test_loader, self.detector, self.output_dir , self.use_gpu)
if 'visualize' in cfg.PHASE:
self.visualize_epoch(self.model, self.visualize_loader, self.priorbox, self.writer, epoch, self.use_gpu)
else:
sys.stdout.write('\rCheckpoint {}:\n'.format(self.checkpoint))
self.resume_checkpoint(self.checkpoint)
if 'eval' in cfg.PHASE:
self.eval_epoch(self.model, self.eval_loader, self.detector, self.criterion, self.writer, 0, self.use_gpu)
if 'test' in cfg.PHASE:
self.test_epoch(self.model, self.test_loader, self.detector, self.output_dir , self.use_gpu)
if 'visualize' in cfg.PHASE:
self.visualize_epoch(self.model, self.visualize_loader, self.priorbox, self.writer, 0, self.use_gpu)
def train_epoch(self, model, data_loader, optimizer, criterion, writer, epoch, use_gpu):
model.train()
epoch_size = len(data_loader)
batch_iterator = iter(data_loader)
loc_loss = 0
conf_loss = 0
_t = Timer()
for iteration in iter(range((epoch_size))):
images, targets = next(batch_iterator)
if use_gpu:
images = Variable(images.cuda())
targets = [Variable(anno.cuda()) for anno in targets]
else:
images = Variable(images)
targets = [Variable(anno) for anno in targets]
_t.tic()
# forward
out = model(images, phase='train')
# backprop
optimizer.zero_grad()
loss_l, loss_c = criterion(out, targets)
# some bugs in coco train2017. maybe the annonation bug.
if loss_l.data[0] == float("Inf"):
continue
loss = loss_l + loss_c
loss.backward()
optimizer.step()
time = _t.toc()
loc_loss += loss_l.data[0]
conf_loss += loss_c.data[0]
# log per iter
log = '\r==>Train: || {iters:d}/{epoch_size:d} in {time:.3f}s [{prograss}] || loc_loss: {loc_loss:.4f} cls_loss: {cls_loss:.4f}\r'.format(
prograss='#'*int(round(10*iteration/epoch_size)) + '-'*int(round(10*(1-iteration/epoch_size))), iters=iteration, epoch_size=epoch_size,
time=time, loc_loss=loss_l.data[0], cls_loss=loss_c.data[0])
sys.stdout.write(log)
sys.stdout.flush()
# log per epoch
sys.stdout.write('\r')
sys.stdout.flush()
lr = optimizer.param_groups[0]['lr']
log = '\r==>Train: || Total_time: {time:.3f}s || loc_loss: {loc_loss:.4f} conf_loss: {conf_loss:.4f} || lr: {lr:.6f}\n'.format(lr=lr,
time=_t.total_time, loc_loss=loc_loss/epoch_size, conf_loss=conf_loss/epoch_size)
sys.stdout.write(log)
sys.stdout.flush()
# log for tensorboard
writer.add_scalar('Train/loc_loss', loc_loss/epoch_size, epoch)
writer.add_scalar('Train/conf_loss', conf_loss/epoch_size, epoch)
writer.add_scalar('Train/lr', lr, epoch)
def eval_epoch(self, model, data_loader, detector, criterion, writer, epoch, use_gpu):
model.eval()
epoch_size = len(data_loader)
batch_iterator = iter(data_loader)
loc_loss = 0
conf_loss = 0
_t = Timer()
label = [list() for _ in range(model.num_classes)]
gt_label = [list() for _ in range(model.num_classes)]
score = [list() for _ in range(model.num_classes)]
size = [list() for _ in range(model.num_classes)]
npos = [0] * model.num_classes
for iteration in iter(range((epoch_size))):
# for iteration in iter(range((10))):
images, targets = next(batch_iterator)
if use_gpu:
images = Variable(images.cuda())
targets = [Variable(anno.cuda()) for anno in targets]
else:
images = Variable(images)
targets = [Variable(anno) for anno in targets]
_t.tic()
# forward
out = model(images, phase='train')
# loss
loss_l, loss_c = criterion(out, targets)
out = (out[0], model.softmax(out[1].view(-1, model.num_classes)))
# detect
detections = detector.forward(out)
time = _t.toc()
# evals
label, score, npos, gt_label = cal_tp_fp(detections, targets, label, score, npos, gt_label)
size = cal_size(detections, targets, size)
loc_loss += loss_l.data[0]
conf_loss += loss_c.data[0]
# log per iter
log = '\r==>Eval: || {iters:d}/{epoch_size:d} in {time:.3f}s [{prograss}] || loc_loss: {loc_loss:.4f} cls_loss: {cls_loss:.4f}\r'.format(
prograss='#'*int(round(10*iteration/epoch_size)) + '-'*int(round(10*(1-iteration/epoch_size))), iters=iteration, epoch_size=epoch_size,
time=time, loc_loss=loss_l.data[0], cls_loss=loss_c.data[0])
sys.stdout.write(log)
sys.stdout.flush()
# eval mAP
prec, rec, ap = cal_pr(label, score, npos)
# log per epoch
sys.stdout.write('\r')
sys.stdout.flush()
log = '\r==>Eval: || Total_time: {time:.3f}s || loc_loss: {loc_loss:.4f} conf_loss: {conf_loss:.4f} || mAP: {mAP:.6f}\n'.format(mAP=ap,
time=_t.total_time, loc_loss=loc_loss/epoch_size, conf_loss=conf_loss/epoch_size)
sys.stdout.write(log)
sys.stdout.flush()
# log for tensorboard
writer.add_scalar('Eval/loc_loss', loc_loss/epoch_size, epoch)
writer.add_scalar('Eval/conf_loss', conf_loss/epoch_size, epoch)
writer.add_scalar('Eval/mAP', ap, epoch)
viz_pr_curve(writer, prec, rec, epoch)
viz_archor_strategy(writer, size, gt_label, epoch)
def test_epoch(self, model, data_loader, detector, output_dir, use_gpu):
model.eval()
dataset = data_loader.dataset
num_images = len(dataset)
num_classes = detector.num_classes
all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
empty_array = np.transpose(np.array([[],[],[],[],[]]),(1,0))
_t = Timer()
for i in iter(range((num_images))):
img = dataset.pull_image(i)
scale = [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]
if use_gpu:
images = Variable(dataset.preproc(img)[0].unsqueeze(0).cuda())
else:
images = Variable(dataset.preproc(img)[0].unsqueeze(0))
_t.tic()
# forward
out = model(images, phase='eval')
# detect
detections = detector.forward(out)
time = _t.toc()
# TODO: make it smart:
for j in range(1, num_classes):
cls_dets = list()
for det in detections[0][j]:
if det[0] > 0:
d = det.cpu().numpy()
score, box = d[0], d[1:]
box *= scale
box = np.append(box, score)
cls_dets.append(box)
if len(cls_dets) == 0:
cls_dets = empty_array
all_boxes[j][i] = np.array(cls_dets)
# log per iter
log = '\r==>Test: || {iters:d}/{epoch_size:d} in {time:.3f}s [{prograss}]\r'.format(
prograss='#'*int(round(10*i/num_images)) + '-'*int(round(10*(1-i/num_images))), iters=i, epoch_size=num_images,
time=time)
sys.stdout.write(log)
sys.stdout.flush()
# write result to pkl
with open(os.path.join(output_dir, 'detections.pkl'), 'wb') as f:
pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)
# currently the COCO dataset do not return the mean ap or ap 0.5:0.95 values
print('Evaluating detections')
data_loader.dataset.evaluate_detections(all_boxes, output_dir)
def visualize_epoch(self, model, data_loader, priorbox, writer, epoch, use_gpu):
model.eval()
img_index = random.randint(0, len(data_loader.dataset)-1)
# get img
image = data_loader.dataset.pull_image(img_index)
anno = data_loader.dataset.pull_anno(img_index)
# visualize archor box
viz_prior_box(writer, priorbox, image, epoch)
# get preproc
preproc = data_loader.dataset.preproc
preproc.add_writer(writer, epoch)
# preproc.p = 0.6
# preproc image & visualize preprocess prograss
images = Variable(preproc(image, anno)[0].unsqueeze(0))
if use_gpu:
images = images.cuda()
# visualize feature map in base and extras
base_out = viz_module_feature_maps(writer, model.base, images, module_name='base', epoch=epoch)
extras_out = viz_module_feature_maps(writer, model.extras, base_out, module_name='extras', epoch=epoch)
# visualize feature map in feature_extractors
viz_feature_maps(writer, model(images, 'feature'), module_name='feature_extractors', epoch=epoch)
model.train()
images.requires_grad = True
images.volatile=False
base_out = viz_module_grads(writer, model, model.base, images, images, preproc.means, module_name='base', epoch=epoch)
# TODO: add more...
def configure_optimizer(self, trainable_param, cfg):
if cfg.OPTIMIZER == 'sgd':
optimizer = optim.SGD(trainable_param, lr=cfg.LEARNING_RATE,
momentum=cfg.MOMENTUM, weight_decay=cfg.WEIGHT_DECAY)
elif cfg.OPTIMIZER == 'rmsprop':
optimizer = optim.RMSprop(trainable_param, lr=cfg.LEARNING_RATE,
momentum=cfg.MOMENTUM, alpha=cfg.MOMENTUM_2, eps=cfg.EPS, weight_decay=cfg.WEIGHT_DECAY)
elif cfg.OPTIMIZER == 'adam':
optimizer = optim.Adam(trainable_param, lr=cfg.LEARNING_RATE,
betas=(cfg.MOMENTUM, cfg.MOMENTUM_2), eps=cfg.EPS, weight_decay=cfg.WEIGHT_DECAY)
else:
AssertionError('optimizer can not be recognized.')
return optimizer
def configure_lr_scheduler(self, optimizer, cfg):
if cfg.SCHEDULER == 'step':
scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg.STEPS[0], gamma=cfg.GAMMA)
elif cfg.SCHEDULER == 'multi_step':
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=cfg.STEPS, gamma=cfg.GAMMA)
elif cfg.SCHEDULER == 'exponential':
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=cfg.GAMMA)
elif cfg.SCHEDULER == 'SGDR':
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.MAX_EPOCHS)
else:
AssertionError('scheduler can not be recognized.')
return scheduler
def export_graph(self):
self.model.train(False)
dummy_input = Variable(torch.randn(1, 3, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[1])).cuda()
# Export the model
torch_out = torch.onnx._export(self.model, # model being run
dummy_input, # model input (or a tuple for multiple inputs)
"graph.onnx", # where to save the model (can be a file or file-like object)
export_params=True) # store the trained parameter weights inside the model file
# if not os.path.exists(cfg.EXP_DIR):
# os.makedirs(cfg.EXP_DIR)
# self.writer.add_graph(self.model, (dummy_input, ))
def train_model():
s = Solver()
s.train_model()
return True
def test_model():
s = Solver()
s.test_model()
return True