I have a single machine with two GPUs. This error occurred when I ran 'CUDA_VISIBLE_DEVICES=1,0 python -m torch.distributed.launch --nproc_per_node=2 train.py' to train my model in parallel.
Here's my code, could anyone help me?
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
torch.distributed.init_process_group(backend='nccl')
parser = argparse.ArgumentParser(description='param')
parser.add_argument('--iters', default=10,type=str)
parser.add_argument('--data_size', default=2048,type=int)
parser.add_argument('--batch_size', default=256,type=int)
parser.add_argument('--loss_name', default='KL',type=str)
parser.add_argument('--lr', default=0.01,type=int)
parser.add_argument('--reg_param', default=0.1,type=int)
parser.add_argument('--save_loss_path', default='./',type=str)
parser.add_argument('--use_gpu', type=bool, default=False)
def cleanup():
    dist.destroy_process_group()

def train(iters,
          data_size,
          batch_size,
          loss_name,
          lr,
          reg_param,
          save_loss_path,
          use_gpu):
    save_loss_csv = save_loss_path + loss_name + '.csv'
    create_csv_4_KL(path=save_loss_csv)
    atlas = np.load(atlas_file)
    if use_gpu:
        model = Model().to(device)
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model = Model()
    opt = Adam(model.parameters(), lr=lr)
    if loss_name == 'KL':
        from losses import KL_Divergence
        loss_fun = KL_Divergence
    elif loss_name == 'MSE':
        from losses import mse_loss
        loss_fun = mse_loss
    elif loss_name == 'NCC':
        from losses import ncc_loss
        loss_fun = ncc_loss
    else:
        print("There's no such a loss fuction {}".format(loss_name))
    import losses
    Grad_loss = losses.gradient_loss
    train_generator = DataGenerater(json_path=json_path, data_size=data_size)
    train_set = DataLoader(train_generator, batch_size=batch_size, shuffle=True, num_workers=16,
                           sampler=DistributedSampler(train_generator))
    reg_param = reg_param
    fixed = torch.Tensor(atlas)
    fixed.unsqueeze_(0)
    fixed.unsqueeze_(0)
    if use_gpu:
        fixed = fixed.expand(batch_size, 1, 128, 128, 128).cuda()
    fixed = fixed.expand(batch_size, 1, 128, 128, 128)
    fixed_norm = fixed / 255
    if use_gpu:
        fixed_norm = fixed_norm.to(device)
    for epoch in range(iters):
        start_time = time.time()
        loss_epoch = 0.0
        for i, batch_moving in enumerate(train_set):
            if use_gpu:
                batch_moving_cuda = batch_moving.cuda()
            else:
                batch_moving_cuda = batch_moving
            batch_moving_cuda_norm = batch_moving_cuda / 255
            wrap, flow = model(batch_moving_cuda_norm, fixed_norm)
            loss = loss_fun(wrap, fixed_norm) + reg_param * Grad_loss(flow)
            loss_epoch += loss.item()
            opt.zero_grad()
            loss.backward()
            opt.step()
        append_csv(save_loss_csv,
                   zip([[epoch + 1]], [loss_epoch]))
        end_time = time.time()
        loop_cost = end_time - start_time
        print("After [ {} ] seconds and {} epoches, selected the {} loss to train, the loss is [ {} ]."
              .format(loop_cost, epoch + 1, loss_name, loss_epoch / (2048 / batch_size)))
    para_save_file = save_loss_path + 'res/' + 'MyModel-slice-{}-{}-{}-{}.pth'.format(loss_name, iters, reg_param, now)
    if os.path.exists(para_save_file):
        os.remove(para_save_file)
    torch.save(model.state_dict(), para_save_file)
    print("The model saved in {}".format(para_save_file))

if __name__ == "__main__":
    args = parser.parse_args()
    now = datetime.now().date()
    json_path = '/home/mamingrui/code/MyModel/brain.json'
    atlas_file = '/home/mamingrui/data/atlas/atlas.npy',
    # initialize the process group
    dist.init_process_group("nccl")
    local_rank = torch.distributed.get_rank()
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    train(iters=args.iters,
          data_size=args.data_size,
          batch_size=args.batch_size,
          loss_name=args.loss_name,
          lr=args.lr,
          reg_param=args.reg_param,
          save_loss_path=args.save_loss_path,
          use_gpu=args.use_gpu)
    cleanup()
The error report is below:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
usage: train.py [-h] [--iters ITERS] [--data_size DATA_SIZE]
[--batch_size BATCH_SIZE] [--loss_name LOSS_NAME] [--lr LR]
[--reg_param REG_PARAM] [--save_loss_path SAVE_LOSS_PATH]
[--use_gpu USE_GPU]
train.py: error: unrecognized arguments: --local_rank=0
usage: train.py [-h] [--iters ITERS] [--data_size DATA_SIZE]
[--batch_size BATCH_SIZE] [--loss_name LOSS_NAME] [--lr LR]
[--reg_param REG_PARAM] [--save_loss_path SAVE_LOSS_PATH]
[--use_gpu USE_GPU]
train.py: error: unrecognized arguments: --local_rank=1
Traceback (most recent call last):
File "/home/mamingrui/anaconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/mamingrui/anaconda3/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/mamingrui/anaconda3/lib/python3.7/site-packages/torch/distributed/launch.py", line 253, in <module>
main()
File "/home/mamingrui/anaconda3/lib/python3.7/site-packages/torch/distributed/launch.py", line 249, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/mamingrui/anaconda3/bin/python', '-u', 'train.py', '--local_rank=1']' returned non-zero exit status 2.
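From the usage lines it looks like torch.distributed.launch starts one process per GPU and appends --local_rank=<rank> to each worker's command line, but my parser never defines that flag, so argparse exits with status 2 and the launcher reports the CalledProcessError. Is the fix simply to accept that flag and use it to select the device? A minimal sketch of what I think is needed (assuming the launcher's default behaviour of passing the rank as --local_rank):

# Sketch: accept the --local_rank flag that torch.distributed.launch
# appends to every spawned worker, and use it to pick this process's GPU.
parser = argparse.ArgumentParser(description='param')
parser.add_argument('--local_rank', type=int, default=0)  # filled in by the launcher
# ... keep the existing --iters / --batch_size / ... arguments here ...
args = parser.parse_args()

dist.init_process_group(backend='nccl')        # rendezvous info (MASTER_ADDR/MASTER_PORT) comes from the launcher
torch.cuda.set_device(args.local_rank)         # bind this process to its own GPU
device = torch.device('cuda', args.local_rank)

Or, if my torch version supports it, would it be better to launch with the --use_env flag so the rank is delivered through the LOCAL_RANK environment variable instead of the command line?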