I have two GPU servers, with one Docker container deployed on each, and each container uses a single GPU card to run DDP. The IP addresses of the two containers are 10.10.10.2 and 10.10.10.3, respectively. The script is as follows:
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import datasets, transforms
import os

def setup_distributed(local_rank, world_size, backend='nccl'):
    rank = int(os.environ['RANK'])
    # Default env:// init method: MASTER_ADDR and MASTER_PORT are read from the environment.
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    torch.cuda.set_device(local_rank)

def cleanup():
    dist.destroy_process_group()

class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(28 * 28, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        return self.fc(x)

def train(rank, world_size):
    setup_distributed(rank, world_size)

    # Set up the dataset and the distributed sampler
    transform = transforms.Compose([transforms.ToTensor()])
    dataset = datasets.MNIST(root='/workspace/data', train=True, download=False, transform=transform)
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, batch_size=64, sampler=sampler)

    model = SimpleModel().cuda(rank)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    criterion = nn.CrossEntropyLoss().cuda(rank)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    for epoch in range(1):
        model.train()
        sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.cuda(rank), target.cuda(rank)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

    cleanup()

if __name__ == "__main__":
    local_rank = int(os.environ['LOCAL_RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    train(local_rank, world_size)
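For context, the rendezvous here depends only on the RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT environment variables (the default env:// init method). A minimal sketch along these lines exercises the same code path without any model or data; the file name check_dist.py and the all_reduce are just illustrative assumptions, not part of my training script:

# check_dist.py -- hypothetical minimal rendezvous check (not the training script).
# Run it on both containers with the same RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT
# environment variables as script.py above.
import os
import torch
import torch.distributed as dist

if __name__ == "__main__":
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    # Same default env:// init method as the training script.
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(local_rank)

    # All-reduce one tensor; with WORLD_SIZE=2 every rank should print 0 + 1 = 1.
    t = torch.tensor([rank], device="cuda")
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    print(f"rank {rank}: all_reduce result = {t.item()}")

    dist.destroy_process_group()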
On docker A (10.10.10.2), we run:
RANK=0 WORLD_SIZE=2 MASTER_ADDR=10.10.10.2 MASTER_PORT=12345 LOCAL_RANK=0 python script.py &
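On docker B (10.10.10.3), the counterpart command (my assumption of the intended launch: only RANK changes to 1, while MASTER_ADDR still points at docker A) would be:
RANK=1 WORLD_SIZE=2 MASTER_ADDR=10.10.10.2 MASTER_PORT=12345 LOCAL_RANK=0 python script.py &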