Starting from this tutorial, I am trying to train a Faster R-CNN ResNet50 network on a custom dataset.
The train partition contains 26188 images that are 512x512 but, when loaded, they get resized at 240x240. To train on all the train data set for just one epoch it took 14 hours.
I’m trying to debug where the bottleneck(s) are.
I’m pretty sure everything is running on the gpu because when i do nvidia-smi Volatile GPU-Util
is always around 99%.
Benchmarking using time.time() with batch size 10, 8 workers and a dataset of 80 elements, for the first epoch I obtain:
LOAD TIME 0.6659998893737793
TRAIN TIME 7.46896505355835
LOSS TIME 5.605570554733276
LOAD TIME 4.123662710189819
TRAIN TIME 7.413051128387451
LOSS TIME 0.14319729804992676
LOAD TIME 9.588207483291626
TRAIN TIME 7.420511722564697
LOSS TIME 0.14394235610961914
LOAD TIME 9.581853866577148
TRAIN TIME 7.408280849456787
LOSS TIME 0.14383983612060547
So it takes 7 seconds to train on 10 images. Correct me if i’m wrong but I think it’s a bit too much time.
The relevant bit of my code is here:
def main(folder_path, csv_file, attempt_fd, resume=False, fromEpoch=0, num_epochs=2, re_evaluate=False, evaluate=False): device = torch.device('cuda') num_classes = 9 dataset = dld.DLDataset(csv_file, folder_path, transforms=get_transform(train=True)) dataset_test = dld.DLDataset(csv_file, folder_path, transforms=get_transform(train=False)) # split the dataset in train and test set indices = torch.randperm(len(dataset)).tolist() train_idx = np.load('train.npy') test_idx = np.load('test.npy') dataset = torch.utils.data.Subset(dataset, indices[0:80]) #np.asarray(train_idx) dataset_test = torch.utils.data.Subset(dataset_test, indices[0:5]) #np.asarray(test_idx) # define training and validation data loaders data_loader = torch.utils.data.DataLoader( dataset, batch_size=10, shuffle=True, num_workers=8, collate_fn=utils.collate_fn) data_loader_test = torch.utils.data.DataLoader( dataset_test, batch_size=1, shuffle=False, num_workers=8, collate_fn=utils.collate_fn) # get the model using our helper function model = get_model_instance_segmentation(num_classes) model.double() model.to(device) #model.cuda(device) # construct an optimizer params = [p for p in model.parameters() if p.requires_grad] optimizer = torch.optim.Adam(params, lr=0.001, weight_decay=0) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1) mAP = [] IoU = [] torch.cuda.empty_cache() #num_epochs = 2 if resume: fromEpoch, model, optimizer, lr_scheduler = loadState(attempt_fd, fromEpoch, model, optimizer, lr_scheduler) model.to(device) if fromEpoch>0: mAP = np.load(attempt_fd + 'mAP_' + str(fromEpoch) + '.npy') IoU = np.load(attempt_fd + 'IoU_'+ str(fromEpoch) + '.npy') fromEpoch += 1 if re_evaluate: t_mAP, t_iou = custom_evaluate(model, data_loader_test, device) print("EPOCH", fromEpoch-1, "mAP", t_mAP, "iou", t_iou) mAP = np.append(mAP, t_mAP) IoU = np.append(IoU, t_iou) np.save(attempt_fd + 'mAP_' + str(fromEpoch-1) + '.npy', mAP) np.save(attempt_fd + 'IoU_' + str(fromEpoch-1) + '.npy', IoU) for epoch in range(fromEpoch, num_epochs): # train for one epoch, printing every 10 iterations loss = train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=1) print("LOSS AT EPOCH", epoch, "IS", loss) # update the learning rate lr_scheduler.step() #save progress saveState(attempt_fd, epoch, model, optimizer, lr_scheduler, loss) #empty cache torch.cuda.empty_cache() if evaluate: # evaluate MYSELF t_mAP, t_iou = custom_evaluate(model, data_loader_test, device) print("EPOCH", epoch,"mAP", t_mAP, "iou", t_iou) mAP = np.append(mAP, t_mAP) IoU = np.append(IoU, t_iou) np.save(attempt_fd + 'mAP_' + str(epoch) + '.npy', mAP) np.save(attempt_fd + 'IoU_'+ str(epoch) + '.npy', IoU) #evaluate(model, data_loader_test, device=device) torch.cuda.empty_cache() print("That's it!")
I also modified the train_one_epoch
function:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): model.train() #metric_logger = utils.MetricLogger(delimiter=" ") #metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) header = 'Epoch: [{}]'.format(epoch) lr_scheduler = None if epoch == 0: warmup_factor = 1. / 1000 warmup_iters = min(1000, len(data_loader) - 1) lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) one = time.time() for images, targets in data_loader: #metric_logger.log_every(data_loader, print_freq, header): images = list(image.to(device) for image in images) targets = [{k: v.unsqueeze(0).to(device) for k, v in t.items()} for t in targets] loaded = time.time() print("LOAD TIME", loaded-one) loss_dict = model(images, targets) train_time = time.time() print("TRAIN TIME", train_time - loaded) losses = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) loss_value = losses_reduced.item() #losses_reduced = losses_reduced.detach() if not math.isfinite(loss_value): print("Loss is {}, stopping training".format(loss_value)) print(loss_dict_reduced) sys.exit(1) optimizer.zero_grad() losses.backward() optimizer.step() #if lr_scheduler is not None: #lr_scheduler.step() #metric_logger.update(loss=losses_reduced, **loss_dict_reduced) #metric_logger.update(lr=optimizer.param_groups[0]["lr"]) loss_time = time.time() print("LOSS TIME", loss_time-train_time) one = loss_time return loss_value
If anyone can give me tips, I would be very grateful, thank you!