import logging
import os
import argparse
import sys
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, models
from model import FlipkartDetector, NeuralDecisionForest, Backbone, Forest, MNet
from tensorboardX import SummaryWriter
from utils import TrainLoader, AverageMeter, Params, bbox_iou, save_checkpoint, set_logger, paramsforepoch, adjustlrwd, LocLoader
#from data_aug import Rotate,RandomHorizontalFlip
from locnet import LocNet
import torch.nn.functional as F
from object_localization.decode_bbox import *
warnings.filterwarnings('ignore')
parser = argparse.ArgumentParser(description='Training Script')
parser.add_argument('--tag', default='resnet', type=str)
# argparse's type=bool treats any non-empty string (including 'False') as True,
# so use a store_true flag instead
parser.add_argument('--resume', action='store_true')
parser.add_argument('--params_path', default='params.json', type=str)
parser.add_argument('--batch_size', default=16, type=int)


def main():
    args = parser.parse_args()
    assert os.path.isfile(args.params_path), 'params.json file not present'
    params = Params(args.params_path)
    global batch_size
    batch_size = args.batch_size
    global conf
    conf = {}
    conf['scale_ratio'] = 1.2
    conf['resolution'] = 128
    if not os.path.isdir(args.tag):
        os.mkdir(args.tag)
        os.mkdir(os.path.join(args.tag, 'Weights'))
    set_logger(os.path.join(args.tag, 'train.log'))
    logging.info('Dataset Loading Started')
    '''
    #train_transform = transforms.Compose([
    #    RandomHorizontalFlip(0.5),
    #    Rotate(90, 0.1),
    #])
    '''
    valid_transform = False
    df_target = pd.read_csv('training.csv')
    df_in = pd.read_csv('train_in_total.csv')
    df_train_trg, df_val_trg, df_train_in, df_val_in = train_test_split(df_target, df_in, test_size=0.2, random_state=100)
    train_dataset = LocLoader(df_train_in, df_train_trg, params.path, 480, 640)
    valid_dataset = LocLoader(df_val_in, df_val_trg, params.path, 480, 640)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=params.num_workers)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                              num_workers=params.num_workers)
    train_size = len(train_loader)
    val_size = len(valid_loader)
    # report exact image counts; len(loader) * params.batch_size over-counts and
    # mixes params.batch_size with the args.batch_size actually used by the loaders
    print('Number of training images: {}'.format(len(train_dataset)))
    print('Number of validation images: {}'.format(len(valid_dataset)))
    logging.info('Dataset Loaded')
    torch.backends.cudnn.benchmark = False
    model = LocNet(window_width=40, window_height=30, pretrained=True)
    model = nn.DataParallel(model)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([127]).float())
    optimizer = optim.SGD(model.parameters(), lr=params.lr, momentum=0.9, weight_decay=5e-4, nesterov=True)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    if args.resume:
        # checkpoints are saved as '<valid_loss>.pth.tar', so the first entry after
        # a numeric sort is the one with the lowest validation loss
        weight_path = sorted(os.listdir(os.path.join(args.tag, 'Weights')), key=lambda x: float(x[:-8]))[0]
        checkpoint = torch.load(os.path.join(args.tag, 'Weights', weight_path))
        model.load_state_dict(checkpoint['state_dict'])
        logging.info('Loaded checkpoint: {}'.format(weight_path))
        print('Loaded checkpoint: {}'.format(weight_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    is_gpu = torch.cuda.is_available()
    if is_gpu:
        logging.info('GPU found')
        model = model.cuda()
        criterion = criterion.cuda()
    writer = SummaryWriter('logs/{}'.format(args.tag))
    logging.info('Training Started')
    for epoch in tqdm(range(params.epochs)):
        prams = paramsforepoch(epoch + 1)
        print("Configuring optimizer with lr={:.5f} and weight_decay={:.4f}".format(prams['learning_rate'], prams['weight_decay']))
        adjustlrwd(prams, optimizer)
        train(model, train_loader, criterion, optimizer, scheduler, epoch, writer, is_gpu, train_size)
        print('')
        valid_loss = valid(model, valid_loader, criterion, epoch, writer, is_gpu, val_size)
        print('')
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, filename=os.path.join(args.tag, 'Weights', str(valid_loss) + '.pth.tar'))


def train(model, dataloader, criterion, optimizer, scheduler, epoch, writer, is_gpu, train_size):
    running_loss = AverageMeter()
    running_iou = AverageMeter()
    model = model.train()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #scheduler.step(epoch)
    for i, (inputs, prob_vectors, bbox_search_region, bbox_target) in enumerate(dataloader):
        # prepend the batch index to each search-region box so it can be used as an ROI
        indexes = torch.arange(bbox_search_region.shape[0]).view(bbox_search_region.shape[0], -1).float()
        #index = torch.LongTensor([0,1,4,3,2])
        bbox_search_region_for_roi = torch.cat((indexes, bbox_search_region), 1)
        #[:,index]
        #bbox_search_region_for_roi = bbox_search_region[:,index].clone()
        if is_gpu:
            # print(inputs.dtype)
            inputs = inputs.cuda()
            prob_vectors = prob_vectors.cuda()
            bbox_search_region = bbox_search_region.cuda()
            bbox_search_region_for_roi = bbox_search_region_for_roi.cuda()
            bbox_target = bbox_target.cuda()
        linear_x_l, linear_x_r, linear_y_t, linear_y_b = model(inputs, bbox_search_region_for_roi)
        loss_xl = criterion(linear_x_l, prob_vectors[:, :, 0])
        loss_xr = criterion(linear_x_r, prob_vectors[:, :, 1])
        loss_yb = criterion(linear_y_b, prob_vectors[:, :, 2])
        loss_yt = criterion(linear_y_t, prob_vectors[:, :, 3])
        loss = loss_xl + loss_xr + loss_yb + loss_yt
        # F.sigmoid is deprecated; torch.sigmoid is the drop-in replacement
        output_prob = torch.sigmoid(torch.stack([linear_x_l, linear_x_r, linear_y_b, linear_y_t]).permute((2, 0, 1)))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #prob_vectors = prob_vectors.permute((1,2,0))
        decoded_coords = decode_loc_probs_to_bbox_targets(bbox_search_region, output_prob, conf)
        iou = bbox_iou(decoded_coords.float(), bbox_target.float())
        running_loss.update(loss.item(), inputs.size(0))
        running_iou.update(iou, inputs.size(0))
        sys.stdout.write('\r')
        sys.stdout.write('Training Epoch: [{0}][{1}/{2}]\t'
                         'Loss ({loss.avg:.4f})\t'
                         'iou {iou.avg:.4f}'.format(epoch + 1, i, train_size, loss=running_loss, iou=running_iou))
        sys.stdout.flush()
        n_iter = (epoch * train_size) + i
        writer.add_scalar('Training_Loss', running_loss.avg, n_iter)
        #writer.add_scalar('Training_iou', running_iou.avg, n_iter)
    logging.info(
        'Training Epoch: {}\t Loss: {}\t'
        'Training IOU: {}'.format(epoch + 1, running_loss.avg, running_iou.avg))


def valid(model, dataloader, criterion, epoch, writer, is_gpu, val_size):
    running_loss = AverageMeter()
    running_iou = AverageMeter()
    model = model.eval()
    with torch.no_grad():
        for i, (inputs, prob_vectors, bbox_search_region, bbox_target) in enumerate(dataloader):
            indexes = torch.arange(bbox_search_region.shape[0]).view(bbox_search_region.shape[0], -1).float()
            #index = torch.LongTensor([0,1,4,3,2])
            bbox_search_region_for_roi = torch.cat((indexes, bbox_search_region), 1)
            #[:,index]
            #bbox_search_region_for_roi = bbox_search_region[:,index]
            if is_gpu:
                inputs, prob_vectors = inputs.cuda(), prob_vectors.cuda()
                bbox_search_region = bbox_search_region.cuda()
                bbox_search_region_for_roi = bbox_search_region_for_roi.cuda()
                bbox_target = bbox_target.cuda()
            linear_x_l, linear_x_r, linear_y_t, linear_y_b = model(inputs, bbox_search_region_for_roi)
            loss_xl = criterion(linear_x_l, prob_vectors[:, :, 0])
            loss_xr = criterion(linear_x_r, prob_vectors[:, :, 1])
            loss_yb = criterion(linear_y_b, prob_vectors[:, :, 2])
            loss_yt = criterion(linear_y_t, prob_vectors[:, :, 3])
            loss = loss_xl + loss_xr + loss_yb + loss_yt
            #prob_vectors = prob_vectors.permute((1,2,0))
            # stack in the same (x_l, x_r, y_b, y_t) order as in train() so the
            # decoded boxes use a consistent channel layout
            output_prob = torch.sigmoid(torch.stack([linear_x_l, linear_x_r, linear_y_b, linear_y_t]).permute((2, 0, 1)))
            decoded_coords = decode_loc_probs_to_bbox_targets(bbox_search_region, output_prob, conf)
            iou = bbox_iou(decoded_coords.float(), bbox_target.float())
            running_loss.update(loss.item(), inputs.size(0))
            running_iou.update(iou, inputs.size(0))
            sys.stdout.write('\r')
            sys.stdout.write('Validation Epoch: [{0}][{1}/{2}]\t'
                             'Loss ({loss.avg:.4f})\t'
                             'Val iou {iou.avg:.4f}'.format(epoch + 1, i, val_size, loss=running_loss, iou=running_iou))
            sys.stdout.flush()
            n_iter = (epoch * val_size) + i
            writer.add_scalar('Validation_Loss', running_loss.avg, n_iter)
            # writer.add_scalar('Validation_iou', running_iou.avg, n_iter)
    logging.info(
        'Validation Epoch: {}\t Loss: {}\t '
        'Validation IOU: {}'.format(epoch + 1, running_loss.avg, running_iou.avg))
    return running_loss.avg


if __name__ == '__main__':
    main()

Hi, my training hangs in the 1st epoch right after the training function, i.e. just before validation, when I train on a p3.8x (4 GPUs, 32 cores). I've tried different numbers of num_workers, including 0, and I've also tried pinning memory etc., but nothing works. GPU usage drops to 0 and the terminal hangs, with Ctrl + C not working… only Ctrl + Z. Initially, I got an error in the last two batches, maybe because they were smaller than batch_size (=256). I therefore set drop_last=True, after which I no longer hit that CUDA error in the last two batches, but the terminal still just hangs.
Note: it works fine on a p2x with a single K80 GPU.
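
For reference, here is roughly how the loaders look with the workarounds applied (just a sketch: it reuses train_dataset, valid_dataset and params from the script above; pin_memory, drop_last and the num_workers values are the settings I mentioned trying, and applying drop_last to the validation loader as well is an assumption):

# Hypothetical loader setup reflecting the workarounds described above.
# pin_memory=True and drop_last=True are what I tried; num_workers is the
# knob I've been varying (0 upwards).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=params.num_workers, pin_memory=True,
                          drop_last=True)
# drop_last=True on the validation loader too -- this part is an assumption
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                          num_workers=params.num_workers, pin_memory=True,
                          drop_last=True)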