Hello,
There is something I'm struggling to understand about how to use DistributedDataParallel correctly. Below is a reproducible example of my code (I tried to make it as short and general as possible, and I removed the evaluation step from the training loop).
I'm running the code on a machine with two GPUs, and the problem is that it saves two separate torch models, one for each GPU process I've spawned. My assumption was that distributed multiprocessing would eventually reconcile everything into a single model, so that only one model would be saved at the end of training. Could someone please tell me what I'm doing wrong in the code below?
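For what it's worth, the only workaround I've come up with is to guard the save so that only rank 0 writes a checkpoint. A minimal sketch of what I mean, assuming DDP really does keep the two replicas' parameters in sync after every optimizer step:

    # Sketch: write a single checkpoint from rank 0 only, assuming gradient
    # synchronization keeps both replicas identical at the end of training.
    if gpu_number == 0:
        torch.save(model.state_dict(), "model.pt")

Is that the intended pattern, or is there a step that merges the processes back into one model that I'm missing?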
Thanks in advance
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
import torch.multiprocessing as mp
import os
class MyModel(nn.Module):
    def __init__(self, input_dim, inner_layer_1, inner_layer_2, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, inner_layer_1)
        self.fc2 = nn.Linear(inner_layer_1, inner_layer_2)
        self.fc3 = nn.Linear(inner_layer_2, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.softmax(x, dim=1)
        return x
def train(gpu_number, n_epochs, model, train_data, optimizer, loss_fn, log_interval=2):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    torch.distributed.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=2,  # total number of processes, one per GPU
        rank=gpu_number
    )
    sampler = DistributedSampler(train_data, num_replicas=2, rank=gpu_number)
    trainloader = DataLoader(train_data, batch_size=8, sampler=sampler)
    #torch.cuda.set_device(gpu_number)
    model = model.cuda(gpu_number)
    model = DDP(model, device_ids=[gpu_number], output_device=gpu_number)
    for epoch in range(n_epochs):
        sampler.set_epoch(epoch)  # reshuffle each process's data partition every epoch
        for i, batch in enumerate(trainloader):
            # first 8 columns are features, last 2 are targets
            inputs, labels = batch[:, :8].cuda(gpu_number), batch[:, -2:].cuda(gpu_number)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()  # DDP averages the gradients across the two processes here
            optimizer.step()
    # this runs in both processes, so I end up with model_0.pt and model_1.pt
    torch.save(model.state_dict(), f"model_{gpu_number}.pt")
    torch.distributed.destroy_process_group()
if __name__ == "__main__":
    train_data = torch.rand(100, 10)  # 8 feature columns + 2 target columns
    n_epochs = 3
    learning_rate = 0.001
    model = MyModel(8, 800, 300, 2)
    loss_fn = nn.MSELoss()  # nn.CrossEntropyLoss() would instead need raw logits (no softmax) and class-index targets
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    mp.spawn(train, nprocs=2, args=(n_epochs, model, train_data, optimizer, loss_fn, 2))
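One more detail in case it matters for the answer: the files saved by the code above contain keys with the "module." prefix that the DDP wrapper adds, so when I load one back into a plain MyModel I currently strip the prefix by hand. A sketch of what I do, assuming the file model_0.pt produced by the run above:

    # Sketch: load a checkpoint saved from the DDP-wrapped model into a plain
    # MyModel by stripping the "module." prefix that DDP adds to every key.
    state = torch.load("model_0.pt", map_location="cpu")
    state = {k.replace("module.", "", 1): v for k, v in state.items()}
    plain_model = MyModel(8, 800, 300, 2)
    plain_model.load_state_dict(state)

If there is a cleaner way to do this (for example, saving model.module.state_dict() instead), I'd be happy to hear it too.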