Hi, I am new to PyTorch and I am running into problems trying to run on multiple GPUs with Horovod. torch.cuda.device_count() reports 6, but only one GPU is actually used, and if I print the local rank I always get 0. I would be very grateful for your help.
I submit the job with this command:
jsrun -bpacked:7 -g6 -a1 -c42 -r1 python ./pytorch_code.py
A brief version of my code:
import math
import os
import sys
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
# READ ALL THESE FROM A FILE
import rough_copy

DECAY_UPDATE_INTERVAL = 25
BATCH_SIZE = 2

# HOROVOD
print("HVD AREA ")
backend = 'horovod'
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])

# Horovod and DDL share the same usage mode
if backend in ['horovod', 'ddl']:
    print('using hvd')
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(local_rank)
    print("local rank " + str(local_rank))
else:
    import torch.distributed as dist
    # Initialize the NCCL backend with environment variables
    if backend == 'nccl':
        import subprocess
        get_master = 'echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)'.format(os.environ['LSB_DJOB_HOSTFILE'])
        os.environ['MASTER_ADDR'] = str(subprocess.check_output(get_master, shell=True))[2:-3]
        os.environ['MASTER_PORT'] = '23456'
        os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
        os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
    # NCCL and MPI initialization
    print('using nccl')
    dist.init_process_group(backend,
                            rank=world_rank, world_size=world_size)

print('NONE')
print("device ss " + str(torch.cuda.device_count()))
class my_dataset(data.Dataset):
    def initialize(self, _feat_path, _tr_ros, _ss, _intra, _label_path, _file_list, _max_len):
        ...

    def __getitem__(self, index):
        ...

    def __len__(self):
        return self.size
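
# (__getitem__ above returns a dict with 'feat' and 'ground_truth' tensors,
# which the training loop below pulls out batch by batch.)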
nThreads = 6

train_file_list = text_file_reader(train_file)
train_dataset = my_dataset()
train_dataset.initialize(_feat_path=feature_path, _label_path=label_path, _ss=ss_path, _intra=intra_path,
                         _tr_ros=tr_path,
                         _file_list=train_file_list, _max_len=MAX_LENGTH)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=nThreads)

val_file_list = text_file_reader(val_file)
val_dataset = my_dataset()
val_dataset.initialize(_feat_path=feature_path, _label_path=label_path, _ss=ss_path, _intra=intra_path, _tr_ros=tr_path,
                       _file_list=val_file_list, _max_len=MAX_LENGTH)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=nThreads)
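
# Note: both loaders use shuffle=True and no sampler, so every rank would see
# the full dataset. From the Horovod PyTorch examples my understanding is that
# the data is normally sharded with a DistributedSampler, roughly like this
# (left commented out until the rank problem is solved):
# train_sampler = torch.utils.data.distributed.DistributedSampler(
#     train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
# train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
#                                                sampler=train_sampler, num_workers=nThreads)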
print("MODELLOADER AREA ")
input = torch.randn((1, FEATURES, MAX_LENGTH, MAX_LENGTH)).cuda()
import rough_copy
model = rough_copy.ResNet_custom(img_channel=FEATURES, num_classes=MAX_LENGTH * MAX_LENGTH, _depth=RESNET_DEPTH).cuda()
output = model(input)
criterion = nn.BCELoss()
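# (nn.BCELoss expects the model output to already be in [0, 1], i.e. already
# passed through a sigmoid.)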
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=DECAY_UPDATE_INTERVAL, gamma=0.5)
print("HVD AREA ")
if backend in ['horovod', 'ddl']:
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
else:
    def average_gradients(model):
        """ Gradient averaging. """
        for param in model.parameters():
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size
print("Restart AREA ")
Current_EPOCH = 0
weights = list(filter(os.path.isfile, glob.glob(HISTORY_FILE + '*')))
weights.sort(key=lambda x: os.path.getmtime(x))
if len(weights) > 0:
    checkpoint = torch.load(weights[len(weights) - 1])
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    epoch = checkpoint['epoch']
    Current_EPOCH = epoch + 1
model.train()
print("EPOCH AREA ")
for epoch in range(Current_EPOCH, EPOCH):
    model.train()
    running_train_loss = 0.0
    print("TRAINING AREA ")
    for i, data in enumerate(train_dataloader):
        features = data['feat'].float().cuda()
        label = data['ground_truth'].float().cuda()
        optimizer.zero_grad()
        output_final = torch.squeeze(model(features))
        loss = criterion(torch.squeeze(output_final), torch.squeeze(label))
        print(loss)
        running_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    scheduler.step()
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()
    }, HISTORY_FILE + "/weighths_" + str(epoch))
print("VALIDATIOn AREA ")
with torch.no_grad():
"some custom codes"
Current_EPOCH = Current_EPOCH + 1
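To take my own model and data out of the picture, I also put together the stripped-down script below. It is my understanding of the minimal Horovod wiring (hvd.init, set_device on the local rank, DistributedSampler, DistributedOptimizer, broadcasts) based on the Horovod PyTorch examples; the dataset and model here are just random placeholders. If the launch command were right, shouldn't every rank print a different local rank here?

# minimal_hvd_check.py -- placeholder data/model, just to isolate the Horovod wiring
import torch
import torch.nn as nn
import torch.optim as optim
import horovod.torch as hvd

hvd.init()
torch.cuda.set_device(hvd.local_rank())
print("rank %d of %d, local rank %d, device_count %d"
      % (hvd.rank(), hvd.size(), hvd.local_rank(), torch.cuda.device_count()))

# random stand-ins for my real dataset and ResNet_custom
dataset = torch.utils.data.TensorDataset(torch.randn(64, 10), torch.rand(64, 1))
sampler = torch.utils.data.distributed.DistributedSampler(
    dataset, num_replicas=hvd.size(), rank=hvd.rank())
loader = torch.utils.data.DataLoader(dataset, batch_size=4, sampler=sampler)

model = nn.Sequential(nn.Linear(10, 1), nn.Sigmoid()).cuda()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

for epoch in range(2):
    sampler.set_epoch(epoch)   # reshuffle the shards differently each epoch
    for x, y in loader:
        x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
    if hvd.rank() == 0:
        print("epoch %d done, last loss %.4f" % (epoch, loss.item()))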
Output of horovodrun --check-build
Available Frameworks:
[X] TensorFlow
[X] PyTorch
[ ] MXNet
Available Controllers:
[X] MPI
[ ] Gloo
Available Tensor Operations:
[X] NCCL
[ ] DDL
[ ] CCL
[X] MPI
[ ] Gloo