I am trying to run distributed data-parallel on a single node with 3 GPUs to maximise GPU utility which is currently very low.
After following multiple tutorials, the following is my code(I have tried to add a minimal example, let me know if anything is not clear and I’ll add more) but it is exiting without doing anything on running -
#: before any statement represents minimal code I have provided
#All the required imports
#setting of environment variables
def train(world_size, args):
#setting of all the seeds for deterministic behaviour
rank = args.nr * args.gpus + args.gpu
dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
gpu = args.world_size
#data_transform =
# root_dir =
#image_dataset = dataset.ImageFolder(train_dir)
train_loader = torch.utils.data.DataLoader(image_dataset['train'], batch_size=batch_size,shuffle = True,num_workers = 0,sampler = train_sampler)
valid_loader = torch.utils.data.DataLoader(image_dataset['valid'], batch_size=batch_size,shuffle = True,num_workers = 0,sampler= train_sampler)
model_transfer = models.resnet18(pretrained=True)
criterion = nn.CrossEntropyLoss().cuda(gpu)
optimizer = optim.SGD(model_transfer.fc.parameters(),lr = 0.001)
ddp_model_transfer = DDP(model_transfer, device_ids = [rank])
for epoch in range(1, args.epochs+1):
for batch_idx, (data, target) in enumerate(data_transfer['train']):
if use_cuda:
data, target = data.cuda(), target.cuda()
optimizer.zero_grad()
output = ddp_model_transfer(data)
loss = criterion(output,target)
loss.backward()
optimizer.step()
train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
##Vadlidation
model.eval()
for batch_idx, (data, target) in enumerate(data_transfer['valid']):
# move to GPU
if use_cuda:
data, target = data.cuda(), target.cuda()
## update the average validation loss
output = ddp_model_transfer(data).to(rank)
loss = criterion(output,target)
valid_loss += valid_loss + ((1 / (batch_idx + 1)) * (loss.data - valid_loss))
pred = output.data.max(1, keepdim=True)[1]
correct += np.sum(np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy())
total += data.size(0)
if valid_loss <= valid_loss_min:
print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
valid_loss_min,valid_loss))
torch.save(ddp_model_transfer.state_dict(), 'case_1_model.pt')
valid_loss_min = valid_loss
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', default=2, type=int,metavar = 'N', help='number of total epochs to run')
parser.add_argument('-g', '--gpus', default=1, type=int,help='number of gpus per node')
parser.add_argument('-nr', '--nr', default=0, type=int,help='ranking within the nodes')
args = parser.parse_args()
world_size = 3
mp.spawn(train,args = (world_size,args),nprocs = world_size,join=True)
if __name__=="main":
main()