GPU0 on my server is occupied by other people's processes, so I hid GPU0 (via CUDA_VISIBLE_DEVICES) and used mp.spawn
to train my model on the remaining GPUs, but it fails to create the training process.
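For context, this is roughly how I exclude GPU0; the device_count check below is only an illustration of what I expect to be visible afterwards, not part of the training script:

import os
import torch

# Hide physical GPU0 before any CUDA call; only GPUs 1 and 2 should remain
# visible to this process after this point.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

print(torch.cuda.device_count())  # I expect this to print 2
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))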
Here is a reproducible example:
import torch
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
import os
import torch.distributed as dist
def train(rank, gpu_ids):
    device_id = gpu_ids[rank]
    dist.init_process_group(
        backend="nccl", init_method="env://", world_size=len(gpu_ids), rank=device_id)
    torch.cuda.set_device(device_id)
    print(f"Rank {rank} is using device {device_id}")
    model = nn.Linear(10, 1).to(device_id)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    input_data = torch.randn(5, 10).to(device_id)
    target = torch.randn(5, 1).to(device_id)
    for epoch in range(5):
        optimizer.zero_grad()
        output = model(input_data)
        loss = nn.MSELoss()(output, target)
        loss.backward()
        optimizer.step()
        print(f"Rank {rank}, Epoch {epoch}, Loss: {loss.item()}")

def test_main():
    os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'  # hide GPU0 from this process
    os.environ["MASTER_ADDR"] = '127.0.0.1'
    os.environ["MASTER_PORT"] = '12345'
    gpu_ids = [1]
    world_size = len(gpu_ids)
    mp.spawn(train, args=(gpu_ids,), nprocs=world_size, join=True)

if __name__ == "__main__":
    test_main()
In my test, train() gets stuck at this line:
dist.init_process_group(backend="nccl", init_method="env://", world_size=len(gpu_ids), rank=device_id)
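For reference, here is a minimal, unverified sketch of how I could try to make the stall fail fast instead of hanging, by passing a short timeout to that same call inside train() (this reuses the function's gpu_ids / device_id variables; the 30-second value is arbitrary, and I am not certain the timeout also bounds the rendezvous phase):

import datetime
import torch.distributed as dist

# Same call as in train(), but with a short timeout so a stalled setup
# raises an error instead of blocking indefinitely.
dist.init_process_group(
    backend="nccl",
    init_method="env://",
    world_size=len(gpu_ids),
    rank=device_id,
    timeout=datetime.timedelta(seconds=30),
)

Setting NCCL_DEBUG=INFO before spawning might also show more about what the setup is waiting for, but it did not change the behavior described above.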
What is going wrong here, and how can I solve it?