I run a DL code on a machine with 4 GPUs. This is the sbatch script
#!/bin/sh
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem-per-cpu=8G
#SBATCH --gpus-per-node=4
#SBATCH --time=0-00:30:00
#SBATCH --output=logs/%x.out
#SBATCH --error=logs/%x.out
#SBATCH --export=NONE
echo "START TIME: $(date)"
common_torchrun_cmd="torchrun \
--nnodes $NUM_NODES \
--nproc_per_node 4 \
--rdzv_id $RANDOM \
--rdzv_endpoint $MASTER_NODE_IP:$MASTER_PORT \
--rdzv_backend c10d \
cifar10_lightning.py \
--epochs=3 \
--batch-size=$BATCH \
--exp-name=$EXP_NAME \
--num-nodes=$NUM_NODES \
--strategy=$STRATEGY \
--model=$MODEL \
--precision=$PRECISION"
srun $common_torchrun_cmd
This is the code I am using (PytorchLightning)
def main():
seed_everything(42) # for reproducibility
model = Model(loss_function=nn.CrossEntropyLoss(), num_classes=10)
trainer = Trainer(
max_epochs=args.epochs,
strategy=strategies[args.strategy],
accelerator="gpu",
devices=4,
logger=logger,
enable_progress_bar=True,
num_nodes=args.num_nodes,
log_every_n_steps=1,
enable_model_summary=True,
detect_anomaly=False,
enable_checkpointing=False,
plugins=get_plugins(args)
)
trainer.fit(model)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument("--exp-name", type=str, default="test")
parser.add_argument("--strategy", type=str, default="auto")
parser.add_argument("--num-nodes", type=int, default=1)
parser.add_argument("--precision", type=str, default="32")
parser.add_argument("--model", type=str, default="resnet18")
args = parser.parse_args()
main()
And this is the error I am facing
Traceback (most recent call last):
File "/home/3458/pytorch/venv/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 258, in launch_agent
if result.is_failed():
^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'is_failed'
Traceback (most recent call last):
File "/home/3458/pytorch/venv/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/3458/pytorch/venv/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 258, in launch_agent
if result.is_failed():
^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'is_failed'
How can I fix it?