I am trying to implement a PyTorch DDP multi-GPU training script using a LeNet-5 CNN and the MNIST dataset, as follows:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank = rank, world_size = world_size)


def cleanup():
    dist.destroy_process_group()
class LeNet5(nn.Module):
    '''
    Implements a variation of the LeNet-5 CNN; with the intermediate
    fully-connected layers dropped, it is effectively a LeNet-4.
    '''
    def __init__(self):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels = 1, out_channels = 6,
            kernel_size = 3, padding = 1,
            stride = 1
        )
        self.conv2 = nn.Conv2d(
            in_channels = 6, out_channels = 16,
            kernel_size = 3, padding = 1,
            stride = 1
        )
        self.conv3 = nn.Conv2d(
            in_channels = 16, out_channels = 120,
            kernel_size = 3, padding = 1,
            stride = 1
        )
        self.pool = nn.MaxPool2d(
            kernel_size = 2, stride = 2
        )
        self.flatten = nn.Flatten()
        # self.fc1 = nn.Linear(in_features = 512, out_features = 256)
        # self.fc2 = nn.Linear(in_features = 120, out_features = 84)
        # self.op = nn.Linear(in_features = 84, out_features = 10)
        # 28x28 MNIST input -> three conv + pool stages -> 120 feature maps of 3x3 = 1080
        self.op = nn.Linear(in_features = 1080, out_features = 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.flatten(x)
        return self.op(x)
def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # Get MNIST data-
    train_loader, test_loader, train_dataset, test_dataset = get_mnist_dataset(
        path_to_files = "/home/majumdar/Downloads/.data/", batch_size = 1024,
        pin_memory = False, num_workers = 4
    )

    # create model and move it to GPU with id rank
    # model = ToyModel().to(rank)
    model = LeNet5().to(rank)
    ddp_model = DDP(model, device_ids = [rank])

    # Define cost function and optimizer-
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(params = ddp_model.parameters(), lr = 0.001)

    num_epochs = 20

    for epoch in range(num_epochs):
        # One epoch of training-
        running_loss = 0.0
        running_corrects = 0.0

        for images, labels in train_loader:
            images = images.to(rank)
            labels = labels.to(rank)

            optimizer.zero_grad()
            # outputs = ddp_model(torch.randn(20, 10))
            outputs = ddp_model(images)
            # labels = torch.randn(20, 5).to(rank)
            # loss_fn(outputs, labels).backward()
            J = loss_fn(outputs, labels)
            J.backward()
            optimizer.step()

            # Compute model's performance statistics-
            running_loss += J.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            running_corrects += torch.sum(predicted == labels.data)

        # Compute training loss and accuracy for one epoch-
        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)

        print(f"epoch: {epoch + 1}; loss = {epoch_loss:.4f} & acc = {epoch_acc * 100:.2f}%")

    cleanup()
def run_demo(demo_fn, world_size):
    mp.spawn(
        demo_fn,
        args = (world_size,),
        nprocs = world_size,
        join = True
    )


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus
    run_demo(demo_basic, world_size)
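The get_mnist_dataset helper is not shown above; it is just a thin wrapper around the standard torchvision MNIST datasets and DataLoaders, roughly along these lines (the exact transforms may differ slightly):

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

def get_mnist_dataset(path_to_files, batch_size, pin_memory = False, num_workers = 4):
    # Standard MNIST normalization-
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    train_dataset = datasets.MNIST(
        root = path_to_files, train = True,
        download = True, transform = transform
    )
    test_dataset = datasets.MNIST(
        root = path_to_files, train = False,
        download = True, transform = transform
    )

    train_loader = DataLoader(
        train_dataset, batch_size = batch_size, shuffle = True,
        pin_memory = pin_memory, num_workers = num_workers
    )
    test_loader = DataLoader(
        test_dataset, batch_size = batch_size, shuffle = False,
        pin_memory = pin_memory, num_workers = num_workers
    )

    return train_loader, test_loader, train_dataset, test_dataset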
But at the end of 20 epochs of training, the model only achieves an accuracy of 1.10%, which is dreadful. If I run the same code without DDP multi-GPU training, the accuracy is close to 95%.
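For reference, the single-GPU version that reaches ~95% is essentially the same loop without the process group and without wrapping the model in DDP, using the same LeNet5 and get_mnist_dataset definitions as above, roughly:

def train_single_gpu(num_epochs = 20):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_loader, test_loader, train_dataset, test_dataset = get_mnist_dataset(
        path_to_files = "/home/majumdar/Downloads/.data/", batch_size = 1024,
        pin_memory = False, num_workers = 4
    )

    model = LeNet5().to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(params = model.parameters(), lr = 0.001)

    for epoch in range(num_epochs):
        running_loss = 0.0
        running_corrects = 0.0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            J = loss_fn(outputs, labels)
            J.backward()
            optimizer.step()

            running_loss += J.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            running_corrects += torch.sum(predicted == labels.data)

        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)
        print(f"epoch: {epoch + 1}; loss = {epoch_loss:.4f} & acc = {epoch_acc * 100:.2f}%")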
What am I missing?