PyTorch nn.DataParallel hang

import logging
import os
import argparse
import sys
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, models
from model import FlipkartDetector, NeuralDecisionForest, Backbone, Forest, MNet
from tensorboardX import SummaryWriter
from utils import TrainLoader, AverageMeter, Params, bbox_iou, save_checkpoint, set_logger, paramsforepoch, adjustlrwd, LocLoader
#from data_aug import Rotate,RandomHorizontalFlip
from locnet import LocNet
import torch.nn.functional as F
from object_localization.decode_bbox import *
warnings.filterwarnings('ignore')
parser = argparse.ArgumentParser(description='Training Script')
parser.add_argument('--tag', default='resnet', type=str)
parser.add_argument('--resume', action='store_true')  # type=bool would treat any non-empty string (even 'False') as True
parser.add_argument('--params_path', default='params.json', type=str)
parser.add_argument('--batch_size', default=16, type=int)

def main():
    args = parser.parse_args()
    assert os.path.isfile(args.params_path), 'params.json file not present'
    params = Params(args.params_path)
    global batch_size
    batch_size = args.batch_size
    global conf
    conf = {}
    conf['scale_ratio'] = 1.2
    conf['resolution'] = 128

    os.makedirs(os.path.join(args.tag, 'Weights'), exist_ok=True)

    set_logger(os.path.join(args.tag, 'train.log'))
    logging.info('Dataset Loading Started')
    # train_transform = transforms.Compose([
    #     RandomHorizontalFlip(0.5),
    #     Rotate(90, 0.1),
    # ])
    
    valid_transform = False
    df_target = pd.read_csv('training.csv')
    df_in = pd.read_csv('train_in_total.csv')
    df_train_trg, df_val_trg, df_train_in, df_val_in = train_test_split(
        df_target, df_in, test_size=0.2, random_state=100)

    train_dataset = LocLoader(df_train_in, df_train_trg, params.path, 480, 640)
    valid_dataset = LocLoader(df_val_in, df_val_trg, params.path, 480, 640)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=params.num_workers)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                              num_workers=params.num_workers)
    train_size = len(train_loader)
    val_size = len(valid_loader)
    print('Number of training images: {}'.format(train_size * batch_size))
    print('Number of validation images: {}'.format(val_size * batch_size))
    logging.info('Dataset Loaded')
    torch.backends.cudnn.benchmark = False
    model = LocNet(window_width=40, window_height=30, pretrained=True)
    model = nn.DataParallel(model)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([127]).float())
    optimizer = optim.SGD(model.parameters(), lr=params.lr, momentum=0.9, weight_decay=5e-4, nesterov=True)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)  # never stepped; adjustlrwd() sets the lr each epoch instead
    if args.resume:
        # checkpoints are saved as '<val_loss>.pth.tar'; pick the one with the lowest validation loss
        weight_path = sorted(os.listdir(os.path.join(args.tag, 'Weights')), key=lambda x: float(x[:-8]))[0]
        checkpoint = torch.load(os.path.join(args.tag, 'Weights', weight_path))
        model.load_state_dict(checkpoint['state_dict'])
        logging.info('Loaded Checkpoint of epoch: {}'.format(weight_path))
        print('Loaded Checkpoint of epoch: {}'.format(weight_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    is_gpu = torch.cuda.is_available()
    if is_gpu:
        logging.info('GPU found')
        model = model.cuda()
        criterion = criterion.cuda()

    writer = SummaryWriter('logs/{}'.format(args.tag))
    logging.info('Training Started')
    for epoch in tqdm(range(params.epochs)):
        epoch_params = paramsforepoch(epoch + 1)
        print("Configuring optimizer with lr={:.5f} and weight_decay={:.4f}".format(
            epoch_params['learning_rate'], epoch_params['weight_decay']))
        adjustlrwd(epoch_params, optimizer)
        train(model, train_loader, criterion, optimizer, scheduler, epoch, writer, is_gpu, train_size)
        print('')
        valid_loss = valid(model, valid_loader, criterion, epoch, writer, is_gpu, val_size)
        print('')
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, filename=os.path.join(args.tag, 'Weights', str(valid_loss) + '.pth.tar'))


def train(model, dataloader, criterion, optimizer, scheduler, epoch, writer, is_gpu, train_size):
    running_loss = AverageMeter()
    running_iou = AverageMeter()
    model.train()
    for i, (inputs, prob_vectors, bbox_search_region, bbox_target) in enumerate(dataloader):
        # prepend the batch index column expected by the RoI pooling layer
        indexes = torch.arange(bbox_search_region.shape[0]).view(-1, 1).float()
        bbox_search_region_for_roi = torch.cat((indexes, bbox_search_region), 1)
        if is_gpu:
            inputs = inputs.cuda()
            prob_vectors = prob_vectors.cuda()
            bbox_search_region = bbox_search_region.cuda()
            bbox_search_region_for_roi = bbox_search_region_for_roi.cuda()
            bbox_target = bbox_target.cuda()

        linear_x_l, linear_x_r, linear_y_t, linear_y_b = model(inputs, bbox_search_region_for_roi)

        # channel order in prob_vectors: 0 = x_left, 1 = x_right, 2 = y_bottom, 3 = y_top
        loss_xl = criterion(linear_x_l, prob_vectors[:, :, 0])
        loss_xr = criterion(linear_x_r, prob_vectors[:, :, 1])
        loss_yb = criterion(linear_y_b, prob_vectors[:, :, 2])
        loss_yt = criterion(linear_y_t, prob_vectors[:, :, 3])
        loss = loss_xl + loss_xr + loss_yb + loss_yt
        output_prob = torch.sigmoid(torch.stack([linear_x_l, linear_x_r, linear_y_b, linear_y_t]).permute((2, 0, 1)))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #prob_vectors = prob_vectors.permute((1,2,0))
        decoded_coords = decode_loc_probs_to_bbox_targets(bbox_search_region,output_prob, conf)
        iou = bbox_iou(decoded_coords.float(), bbox_target.float())
        running_loss.update(loss.item(), inputs.size(0))
        running_iou.update(iou, inputs.size(0))
        sys.stdout.write('\r')
        sys.stdout.write('Training Epoch: [{0}][{1}/{2}]\t'
                         'Loss  ({loss.avg:.4f})\t'
                         'iou {iou.avg:.4f}'.format(epoch + 1, i, train_size, loss=running_loss ,iou=running_iou))
                                                         
        sys.stdout.flush()
        n_iter = (epoch * train_size) + i
        writer.add_scalar('Training_Loss', running_loss.avg, n_iter)
        #writer.add_scalar('Training_iou', running_iou.avg, n_iter)
    logging.info(
        'Training Epoch: {}\t Loss: {}\t' 
        'Training IOU :{}'.format(epoch + 1, running_loss.avg,running_iou.avg))

def valid(model, dataloader, criterion, epoch, writer, is_gpu, val_size):
    running_loss = AverageMeter()
    running_iou = AverageMeter()
    model.eval()
    with torch.no_grad():
        for i, (inputs, prob_vectors, bbox_search_region, bbox_target) in enumerate(dataloader):
            # prepend the batch index column expected by the RoI pooling layer
            indexes = torch.arange(bbox_search_region.shape[0]).view(-1, 1).float()
            bbox_search_region_for_roi = torch.cat((indexes, bbox_search_region), 1)
            if is_gpu:
                inputs, prob_vectors = inputs.cuda(), prob_vectors.cuda()
                bbox_search_region = bbox_search_region.cuda()
                bbox_search_region_for_roi = bbox_search_region_for_roi.cuda()
                bbox_target = bbox_target.cuda()
            linear_x_l, linear_x_r, linear_y_t, linear_y_b = model(inputs, bbox_search_region_for_roi)
            loss_xl = criterion(linear_x_l, prob_vectors[:, :, 0])
            loss_xr = criterion(linear_x_r, prob_vectors[:, :, 1])
            loss_yb = criterion(linear_y_b, prob_vectors[:, :, 2])
            loss_yt = criterion(linear_y_t, prob_vectors[:, :, 3])
            loss = loss_xl + loss_xr + loss_yb + loss_yt
            # keep the same (x_l, x_r, y_b, y_t) stacking order as in train()
            output_prob = torch.sigmoid(torch.stack([linear_x_l, linear_x_r, linear_y_b, linear_y_t]).permute((2, 0, 1)))
            decoded_coords = decode_loc_probs_to_bbox_targets(bbox_search_region,output_prob, conf)
            iou = bbox_iou(decoded_coords.float(), bbox_target.float())
            running_loss.update(loss.item(), inputs.size(0))
            running_iou.update(iou, inputs.size(0))
            sys.stdout.write('\r')
            sys.stdout.write('Validation Epoch: [{0}][{1}/{2}]\t'
                             'Loss  ({loss.avg:.4f})\t'
                             'Val iou {iou.avg:.4f}'.format(epoch + 1, i, val_size, loss=running_loss,iou=running_iou))
                                                                 
            sys.stdout.flush()
            n_iter = (epoch * val_size) + i
            writer.add_scalar('Validation_Loss', running_loss.avg, n_iter)
            # writer.add_scalar('Validation_iou', running_iou.avg, n_iter)
    logging.info(
        'Validation Epoch: {}\t Loss: {}\t '
        'Validation IOU: {}'.format(epoch + 1, running_loss.avg,running_iou.avg))
    return running_loss.avg


if __name__ == '__main__':
    main()

Hi, my training hangs in the first epoch, right after the train function finishes and just before validation starts, when I run on a p3.8xlarge with 4 GPUs and 32 cores. I've tried different values of num_workers, including 0, and also pinning memory, but nothing helps. GPU usage drops to 0 and the terminal hangs; Ctrl+C does nothing and only Ctrl+Z works. Initially I hit a CUDA error in the last two batches, probably because they were smaller than the batch size (256), so I set drop_last=True. That got rid of the error in the last two batches, but now the terminal just hangs.
Note: it works fine on a p2.xlarge with a single K80 GPU.
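
For reference, here is roughly the loader configuration described above, as a minimal sketch (drop_last and pin_memory do not appear in the posted script, so the exact values are assumptions):

# Sketch only: builds on train_dataset / valid_dataset / batch_size from the script above.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=0,       # also tried several worker counts > 0
                          pin_memory=True,     # page-locked host memory for faster host-to-device copies
                          drop_last=True)      # drop the incomplete final batch
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                          num_workers=0, pin_memory=True, drop_last=True)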

@ptrblck @smth @fmassa @rasbt can you please help :slight_smile:

Your example uses nn.DataParallel and therefore is not related to distributed. It also doesn’t use multiprocessing. I recommend changing the title to something like PyTorch nn.DataParallel hang or similar to better convey what’s going on.

When the process hangs, can you acquire any more information, such as where it is hanging? Can you attach a debugger and get a stack trace, for example? Or identify the exact line in your Python script where the hang occurs?
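
One low-effort way to get that (a minimal sketch using only the standard library) is to register a faulthandler signal handler near the top of the script; when the process hangs, sending the signal from another shell prints every thread's Python stack to stderr:

import faulthandler
import signal

# Dump the Python stack of all threads when the process receives SIGUSR1.
# Trigger it from another terminal with:  kill -USR1 <pid>
faulthandler.register(signal.SIGUSR1, all_threads=True)

The dumped stacks usually make it clear whether the main process is stuck inside the DataParallel gather, a CUDA synchronize, or a DataLoader worker join.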

I think the process hangs just before the training loop completes, in the last two iterations. The first two lines of the training loop previously gave me a "Cannot join thread" error, but on subsequent runs the process just hangs. I'm not able to attach a debugger to my remote instance; I'll try connecting an IDE for remote debugging. Thanks!
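
If attaching anything to the remote instance stays impractical, the standard-library faulthandler module can also act as a self-firing watchdog (a sketch; the 600-second timeout and the sleep placeholder are assumptions, not part of the original script):

import faulthandler
import sys
import time

# Arm a watchdog: if the code below stalls for more than `timeout` seconds,
# every thread's Python stack is written to stderr, showing where it is stuck.
faulthandler.dump_traceback_later(timeout=600, repeat=False, file=sys.stderr)

time.sleep(1)  # stands in for the train(...) / valid(...) calls in the real script

faulthandler.cancel_dump_traceback_later()  # disarm once the hang-prone section finishes

Arming it at the start of each epoch and cancelling after valid() returns would show whether the hang happens at the end of train() or before validation starts.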

Try setting num_workers=0 and running again.
Also, please include your PyTorch and Python versions in the question.
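
For example, something along these lines collects the relevant details (the exact fields printed are just a suggestion):

import sys
import torch

print('Python :', sys.version.split()[0])
print('PyTorch:', torch.__version__)
print('CUDA   :', torch.version.cuda)
print('cuDNN  :', torch.backends.cudnn.version())
print('GPUs   :', torch.cuda.device_count())

PyTorch also ships python -m torch.utils.collect_env, which gathers most of this (plus driver and OS details) automatically.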