Horovod using only one GPU instead of all available

Hi, I am new to PyTorch and I am facing issues when trying to run multi-GPU training with Horovod. Although `torch.cuda.device_count()` returns 6, only one GPU is used, and if I print the local rank I get 0. I would be very grateful for your help.
I use this command to submit the job :slight_smile:

```
jsrun -bpacked:7 -g6 -a1 -c42 -r1 python ./pytorch_code.py
```

Brief version of my code

```python
import math
import os
import sys
import glob
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

import rough_copy

# READ ALL THESE FROM A FILE

DECAY_UPDATE_INTERVAL = 25
BATCH_SIZE = 2

# HOROVOD

print("HVD AREA")
backend = 'horovod'
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])

# Horovod and DDL share the same usage mode
if backend in ['horovod', 'ddl']:
    print("using hvd")
    import horovod.torch as hvd
    hvd.init()
    torch.cuda.set_device(local_rank)
    print("local rank " + str(local_rank))
else:
    import torch.distributed as dist
    # Initialize NCCL backend with environment variables
    if backend == 'nccl':
        import subprocess
        get_master = "echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)".format(os.environ['LSB_DJOB_HOSTFILE'])
        os.environ['MASTER_ADDR'] = str(subprocess.check_output(get_master, shell=True))[2:-3]
        os.environ['MASTER_PORT'] = "23456"
        os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
        os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
    # NCCL and MPI initialization
    print("using nccl")
    dist.init_process_group(backend,
                            rank=world_rank, world_size=world_size)

print("NONE")
print("devices " + str(torch.cuda.device_count()))


class my_dataset(data.Dataset):
    def initialize(self, _feat_path, _tr_ros, _ss, _intra, _label_path, _file_list, _max_len):
        .
        .

    def __getitem__(self, index):
        .
        .
        .

    def __len__(self):
        return self.size


nThreads = 6

train_file_list = text_file_reader(train_file)
train_dataset = my_dataset()
train_dataset.initialize(_feat_path=feature_path, _label_path=label_path, _ss=ss_path, _intra=intra_path,
                         _tr_ros=tr_path,
                         _file_list=train_file_list, _max_len=MAX_LENGTH)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=nThreads)

val_file_list = text_file_reader(val_file)
val_dataset = my_dataset()
val_dataset.initialize(_feat_path=feature_path, _label_path=label_path, _ss=ss_path, _intra=intra_path, _tr_ros=tr_path,
                       _file_list=val_file_list, _max_len=MAX_LENGTH)

val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=nThreads)

print("MODELLOADER AREA")
input = torch.randn((1, FEATURES, MAX_LENGTH, MAX_LENGTH)).cuda()

model = rough_copy.ResNet_custom(img_channel=FEATURES, num_classes=MAX_LENGTH * MAX_LENGTH, _depth=RESNET_DEPTH).cuda()
output = model(input)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=DECAY_UPDATE_INTERVAL, gamma=0.5)
print("HVD AREA")

if backend in ['horovod', 'ddl']:
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
else:
    def average_gradients(model):
        """Gradient averaging."""
        for param in model.parameters():
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size

print("Restart AREA")

Current_EPOCH = 0
weights = list(filter(os.path.isfile, glob.glob(HISTORY_FILE + "*")))
weights.sort(key=lambda x: os.path.getmtime(x))
if len(weights) > 0:
    checkpoint = torch.load(weights[len(weights) - 1])
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    epoch = checkpoint['epoch']
    Current_EPOCH = epoch + 1
    model.train()

print("EPOCH AREA")
for epoch in range(Current_EPOCH, EPOCH):
    model.train()
    running_train_loss = 0.0
    print("TRAINING AREA")
    for i, data in enumerate(train_dataloader):
        features = data['feat'].float().cuda()
        label = data['ground_truth'].float().cuda()
        optimizer.zero_grad()
        output_final = torch.squeeze(model(features))
        loss = criterion(torch.squeeze(output_final), torch.squeeze(label))
        print(loss)
        running_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    scheduler.step()

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()
    }, HISTORY_FILE + "/weights_" + str(epoch))

    print("VALIDATION AREA")
    with torch.no_grad():
        "some custom codes"
    Current_EPOCH = Current_EPOCH + 1
```

Output of `horovodrun --check-build`:

```
Available Frameworks:
[X] TensorFlow
[X] PyTorch
[ ] MXNet

Available Controllers:
[X] MPI
[ ] Gloo

Available Tensor Operations:
[X] NCCL
[ ] DDL
[ ] CCL
[X] MPI
[ ] Gloo
```

Your code is currently a bit hard to read, so could you please format it by wrapping it in three backticks ```? :slight_smile:

Is the local_rank always set to 0, so that all processes are using the same device?
I’m not deeply familiar with Horovod, but I assume you’ve checked this doc already?
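If it helps, a minimal per-rank check like this (just a debugging sketch, assuming Horovod initializes the same way as in your script) would show which device each process ends up with:

```python
import torch
import horovod.torch as hvd

hvd.init()
# Each process should report a different local_rank and thus a different CUDA device.
torch.cuda.set_device(hvd.local_rank())
print("size={}, rank={}, local_rank={}, device={}".format(
    hvd.size(), hvd.rank(), hvd.local_rank(), torch.cuda.current_device()))
```

If every process prints `local_rank=0`, they will all use the same GPU.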

I apologize for the previously messy code.
`torch.cuda.device_count()` gives 6, but only one GPU is used, and if I print the local rank I get 0.
I use this command to submit the job:

```
jsrun -bpacked:7 -g6 -a1 -c42 -r1 python ./pytorch_code.py
```

My code:
```python
BATCH_SIZE = 2

backend = 'horovod'
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
if backend in ['horovod', 'ddl']:
    import horovod.torch as hvd
    hvd.init()
torch.cuda.set_device(local_rank)
import torch.distributed as dist
if backend == 'nccl':
    import subprocess
    get_master = "echo $(cat {} | sort | uniq | grep -v batch | grep -v login | head -1)".format(os.environ['LSB_DJOB_HOSTFILE'])
    os.environ['MASTER_ADDR'] = str(subprocess.check_output(get_master, shell=True))[2:-3]
    os.environ['MASTER_PORT'] = "23456"
    os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
    os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
    # NCCL and MPI initialization
    print("using nccl")
    dist.init_process_group(backend,
                            rank=world_rank, world_size=world_size)


class my_dataset(data.Dataset):
    def initialize(self, _feat_path, _tr_ros, _ss, _intra, _label_path, _file_list, _max_len):
        .
        .

    def __getitem__(self, index):
        .
        .
        .

    def __len__(self):
        return self.size


nThreads = 6
train_file_list = text_file_reader(train_file)
train_dataset = my_dataset()
train_dataset.initialize(..., _file_list=train_file_list, _max_len=MAX_LENGTH)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=nThreads)

val_file_list = text_file_reader(val_file)
val_dataset = my_dataset()
val_dataset.initialize(...)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=nThreads)

input = torch.randn((1, FEATURES, MAX_LENGTH, MAX_LENGTH)).cuda()
import rough_copy
model = rough_copy.ResNet_custom(img_channel=FEATURES, num_classes=MAX_LENGTH * MAX_LENGTH, _depth=RESNET_DEPTH).cuda()
output = model(input)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=DECAY_UPDATE_INTERVAL, gamma=0.5)


if backend in ['horovod', 'ddl']:
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
else:
    def average_gradients(model):
        """Gradient averaging."""
        for param in model.parameters():
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size

for epoch in range(Current_EPOCH, EPOCH):
    model.train()
    for i, data in enumerate(train_dataloader):
        features = data['feat'].float().cuda()
        label = data['ground_truth'].float().cuda()
        optimizer.zero_grad()
        output_final = torch.squeeze(model(features))
        loss = criterion(torch.squeeze(output_final), torch.squeeze(label))
        print(loss)
        running_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    scheduler.step()
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict()
    }, HISTORY_FILE + "/weights_" + str(epoch))

    Current_EPOCH = Current_EPOCH + 1
```


Thanks for the updated code! Is the linked example working fine for you?

Hi,
actually, the problem was with the train_sampler; after using it, my problem was fixed.
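For reference, the change is roughly the following (a minimal sketch based on the standard Horovod PyTorch pattern, reusing the variable names from my code above, not the exact code): partition the dataset with a DistributedSampler driven by hvd.size() and hvd.rank(), and pass it to the DataLoader instead of shuffle=True.

```python
import torch.utils.data.distributed

# Horovod: give every rank its own shard of the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=nThreads)

for epoch in range(Current_EPOCH, EPOCH):
    # Reshuffle the shards differently in every epoch.
    train_sampler.set_epoch(epoch)
    # ... training loop as before ...
```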