AttributeError: 'NoneType' object has no attribute 'is_failed'

I am running a deep learning training job on a machine with 4 GPUs. This is the sbatch script:

#!/bin/sh
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem-per-cpu=8G
#SBATCH --gpus-per-node=4
#SBATCH --time=0-00:30:00
#SBATCH --output=logs/%x.out
#SBATCH --error=logs/%x.out
#SBATCH --export=NONE
echo "START TIME: $(date)"

common_torchrun_cmd="torchrun \
    --nnodes $NUM_NODES \
    --nproc_per_node 4 \
    --rdzv_id $RANDOM \
    --rdzv_endpoint $MASTER_NODE_IP:$MASTER_PORT \
    --rdzv_backend c10d \
    cifar10_lightning.py \
    --epochs=3 \
    --batch-size=$BATCH \
    --exp-name=$EXP_NAME \
    --num-nodes=$NUM_NODES \
    --strategy=$STRATEGY \
    --model=$MODEL \
    --precision=$PRECISION"

srun $common_torchrun_cmd
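
(The variables referenced in the torchrun command are exported earlier in the script, along these lines; values simplified here:)

NUM_NODES=$SLURM_JOB_NUM_NODES
MASTER_NODE_IP=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=29500
BATCH=128
EXP_NAME=cifar10_test
STRATEGY=ddp
MODEL=resnet18
PRECISION=32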

This is the code I am using (PyTorch Lightning):

import argparse

import torch.nn as nn
from pytorch_lightning import Trainer, seed_everything


def main():
    seed_everything(42)  # for reproducibility

    model = Model(loss_function=nn.CrossEntropyLoss(), num_classes=10)

    trainer = Trainer(
        max_epochs=args.epochs,
        strategy=strategies[args.strategy],
        accelerator="gpu",
        devices=4,
        logger=logger,
        enable_progress_bar=True,
        num_nodes=args.num_nodes,
        log_every_n_steps=1,
        enable_model_summary=True,
        detect_anomaly=False,
        enable_checkpointing=False,
        plugins=get_plugins(args)
    )

    trainer.fit(model)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--exp-name", type=str, default="test")
    parser.add_argument("--strategy", type=str, default="auto")
    parser.add_argument("--num-nodes", type=int, default=1)
    parser.add_argument("--precision", type=str, default="32")
    parser.add_argument("--model", type=str, default="resnet18")
    args = parser.parse_args()
    main()
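
(Model, logger, strategies, and get_plugins are defined elsewhere in the file. Simplified sketches of the last two, just to show what the Trainer receives; this is not the exact code:)

from pytorch_lightning.strategies import DDPStrategy, FSDPStrategy

# Maps the --strategy CLI flag to a Lightning strategy object (simplified)
strategies = {
    "auto": "auto",
    "ddp": DDPStrategy(),
    "fsdp": FSDPStrategy(),
}

def get_plugins(args):
    # No extra plugins in the simplified case; the Trainer falls back to its defaults
    return None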

And this is the error I am facing

Traceback (most recent call last):                                    
  File "/home/3458/pytorch/venv/bin/torchrun", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 879, in main
    run(args)
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 258, in launch_agent
    if result.is_failed():
       ^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'is_failed'

How can I fix it?

I am trying to understand whether this error comes from torchrun or from Lightning.

Would you mind replacing the training loop with native PyTorch code, just to verify that torchrun is set up correctly?

command: torchrun --standalone --nproc_per_node=2 test_torchrun.py

import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

# Example model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(10, 10)

    def forward(self, x):
        return self.fc(x)

def setup():
    dist.init_process_group(backend="nccl")

def cleanup():
    dist.destroy_process_group()

def main():
    # Global rank of this process and total number of processes (set by torchrun)
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])

    # Local rank selects the GPU on this node (equal to the global rank on a single node)
    local_rank = int(os.environ['LOCAL_RANK'])

    # Set up the distributed process group
    setup()

    # Assign the correct GPU to this process
    torch.cuda.set_device(local_rank)

    model = SimpleModel().to(local_rank)
    ddp_model = DDP(model, device_ids=[local_rank])

    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    # Dummy input and target for the training loop
    input = torch.randn(20, 10).to(local_rank)
    target = torch.randn(20, 10).to(local_rank)

    for epoch in range(10):
        optimizer.zero_grad()
        output = ddp_model(input)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        if rank == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")

    cleanup()

if __name__ == "__main__":
    main()

With this command

torchrun --standalone --nproc_per_node=2 test_torchrun.py

It ran fine.

But with this command

srun torchrun \
    --nnodes $NUM_NODES \
    --nproc_per_node 2 \
    --rdzv_id $RANDOM \
    --rdzv_endpoint $MASTER_NODE_IP:$MASTER_PORT \
    --rdzv_backend c10d \
    test_torchrun.py

it showed the same error.

Traceback (most recent call last):
  File "/home/3458/pytorch/venv/bin/torchrun", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 901, in main
    run(args)
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
    if result.is_failed():
       ^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'is_failed'
srun: error: r2v01: task 0: Exited with exit code 1

This indicates a Slurm setup problem. Could you follow GitHub - pytorch/torchtitan: A native PyTorch Library for large model training, which contains an example multinode_trainer.slurm, and check whether srun works correctly?

I followed the example and there is still an error.

Here is my sbatch script:

#!/bin/bash
#SBATCH --job-name=torchtitan_multi_node
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=1
#SBATCH --gpus-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --output=logs/%x.out
#SBATCH --error=logs/%x.out

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

echo Node IP: $head_node_ip
export LOGLEVEL=INFO
export NCCL_IB_DISABLE=1

export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
export CUDA_LAUNCH_BLOCKING=0

export NCCL_SOCKET_IFNAME="eth0"
export NCCL_BUFFSIZE=2097152

vpkg_devrequire anaconda/2024.02:python3
vpkg_devrequire cuda/12.4.1

source venv/bin/activate

srun torchrun --nnodes 1 --nproc_per_node 2 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" test.py

And here is my code: test.py is the same test script as test_torchrun.py above.

And it shows the same error:

I0821 12:00:00.491000 47676191742144 torch/distributed/elastic/agent/server/api.py:686] Rendezvous gracefully exited: The rendezvous '101' is closed, terminating pending rendezvous.
Traceback (most recent call last):
  File "/home/3458/pytorch/venv/bin/torchrun", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 348, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 901, in main
    run(args)
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in run
    elastic_launch(
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 133, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
    if result.is_failed():
       ^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'is_failed'
srun: error: r2v01: task 0: Exited with exit code 1

@weifengpy any news?