I have a single machine with two GPUs. This error occurred when I ran 'CUDA_VISIBLE_DEVICES=1,0 python -m torch.distributed.launch --nproc_per_node=2 train.py' to train my model in parallel.
Here's my code, could anyone help me?
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
torch.distributed.init_process_group(backend='nccl')
parser = argparse.ArgumentParser(description='param')
parser.add_argument('--iters', default=10,type=str)
parser.add_argument('--data_size', default=2048,type=int)
parser.add_argument('--batch_size', default=256,type=int)
parser.add_argument('--loss_name', default='KL',type=str)
parser.add_argument('--lr', default=0.01,type=int)
parser.add_argument('--reg_param', default=0.1,type=int)
parser.add_argument('--save_loss_path', default='./',type=str)
parser.add_argument('--use_gpu', type=bool, default=False)
def cleanup():
    dist.destroy_process_group()

def train(iters,
          data_size,
          batch_size,
          loss_name,
          lr,
          reg_param,
          save_loss_path,
          use_gpu):
    save_loss_csv = save_loss_path + loss_name + '.csv'
    create_csv_4_KL(path=save_loss_csv)
    atlas = np.load(atlas_file)
    if use_gpu:
        model = Model().to(device)
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model = Model()
    opt = Adam(model.parameters(), lr=lr)
    if loss_name == 'KL':
        from losses import KL_Divergence
        loss_fun = KL_Divergence
    elif loss_name == 'MSE':
        from losses import mse_loss
        loss_fun = mse_loss
    elif loss_name == 'NCC':
        from losses import ncc_loss
        loss_fun = ncc_loss
    else:
        print("There's no such a loss fuction {}".format(loss_name))
    import losses
    Grad_loss = losses.gradient_loss
    train_generator = DataGenerater(json_path=json_path, data_size=data_size)
    train_set = DataLoader(train_generator, batch_size=batch_size, shuffle=True, num_workers=16,
                           sampler=DistributedSampler(train_generator))
    reg_param = reg_param
    fixed = torch.Tensor(atlas)
    fixed.unsqueeze_(0)
    fixed.unsqueeze_(0)
    if use_gpu:
        fixed = fixed.expand(batch_size, 1, 128, 128, 128).cuda()
    fixed = fixed.expand(batch_size, 1, 128, 128, 128)
    fixed_norm = fixed / 255
    if use_gpu:
        fixed_norm = fixed_norm.to(device)
    for epoch in range(iters):
        start_time = time.time()
        loss_epoch = 0.0
        for i, batch_moving in enumerate(train_set):
            if use_gpu:
                batch_moving_cuda = batch_moving.cuda()
            else:
                batch_moving_cuda = batch_moving
            batch_moving_cuda_norm = batch_moving_cuda / 255
            wrap, flow = model(batch_moving_cuda_norm, fixed_norm)
            loss = loss_fun(wrap, fixed_norm) + reg_param * Grad_loss(flow)
            loss_epoch += loss.item()
            opt.zero_grad()
            loss.backward()
            opt.step()
        append_csv(save_loss_csv,
                   zip([[epoch + 1]], [loss_epoch]))
        end_time = time.time()
        loop_cost = end_time - start_time
        print("After [ {} ] seconds and {} epoches, selected the {} loss to train, the loss is [ {} ]."
              .format(loop_cost, epoch + 1, loss_name, loss_epoch / (2048 / batch_size)))
    para_save_file = save_loss_path + 'res/' + 'MyModel-slice-{}-{}-{}-{}.pth'.format(loss_name, iters, reg_param, now)
    if os.path.exists(para_save_file):
        os.remove(para_save_file)
    torch.save(model.state_dict(), para_save_file)
    print("The model saved in {}".format(para_save_file))

if __name__ == "__main__":
    args = parser.parse_args()
    now = datetime.now().date()
    json_path = '/home/mamingrui/code/MyModel/brain.json'
    atlas_file = '/home/mamingrui/data/atlas/atlas.npy',
    # initialize the process group
    dist.init_process_group("nccl")
    local_rank = torch.distributed.get_rank()
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    train(iters=args.iters,
          data_size=args.data_size,
          batch_size=args.batch_size,
          loss_name=args.loss_name,
          lr=args.lr,
          reg_param=args.reg_param,
          save_loss_path=args.save_loss_path,
          use_gpu=args.use_gpu)
    cleanup()
The error report is below:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
usage: train.py [-h] [--iters ITERS] [--data_size DATA_SIZE]
[--batch_size BATCH_SIZE] [--loss_name LOSS_NAME] [--lr LR]
[--reg_param REG_PARAM] [--save_loss_path SAVE_LOSS_PATH]
[--use_gpu USE_GPU]
train.py: error: unrecognized arguments: --local_rank=0
usage: train.py [-h] [--iters ITERS] [--data_size DATA_SIZE]
[--batch_size BATCH_SIZE] [--loss_name LOSS_NAME] [--lr LR]
[--reg_param REG_PARAM] [--save_loss_path SAVE_LOSS_PATH]
[--use_gpu USE_GPU]
train.py: error: unrecognized arguments: --local_rank=1
Traceback (most recent call last):
File "/home/mamingrui/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/mamingrui/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/mamingrui/anaconda3/lib/python3.7/site-packages/torch/distributed/launch.py", line 253, in <module>
main()
File "/home/mamingrui/anaconda3/lib/python3.7/site-packages/torch/distributed/launch.py", line 249, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/mamingrui/anaconda3/bin/python', '-u', 'train.py', '--local_rank=1']' returned non-zero exit status 2.
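From the usage lines it looks like torch.distributed.launch starts one process per GPU and appends --local_rank=<rank> to each worker's command line, but my parser never defines that flag, so argparse exits with status 2 and the launcher reports the CalledProcessError. Is the fix simply to accept that flag and use it to select the device? A minimal sketch of what I think is needed (assuming the launcher's default behaviour of passing the rank as --local_rank):

# Sketch: accept the --local_rank flag that torch.distributed.launch
# appends to every spawned worker, and use it to pick this process's GPU.
parser = argparse.ArgumentParser(description='param')
parser.add_argument('--local_rank', type=int, default=0)  # filled in by the launcher
# ... keep the existing --iters / --batch_size / ... arguments here ...
args = parser.parse_args()

dist.init_process_group(backend='nccl')        # rendezvous info (MASTER_ADDR/MASTER_PORT) comes from the launcher
torch.cuda.set_device(args.local_rank)         # bind this process to its own GPU
device = torch.device('cuda', args.local_rank)

Or, if my torch version supports it, would it be better to launch with the --use_env flag so the rank is delivered through the LOCAL_RANK environment variable instead of the command line?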