When I launch the following script with the torch.distributed.launch
utility on a machine with 2 GPUs, training is roughly 10x slower than when I launch it on a single GPU.
The slowdown seems to come from the large fully connected layer at the end of the network (roughly 130000x1024), and I suspect it is because the gradients that have to be synchronized at each iteration amount to a lot of data.
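A quick back-of-the-envelope check of that gradient payload (shapes taken from the model below; the byte math is my own estimate):

in_features = 512 * 16 * 16                           # out_channels * patch_size**2 in the model below
n_params = in_features * 1024 + 1024                  # weight + bias of the first classifier Linear
print(f"{n_params:,} params -> {n_params * 4 / 2**20:.0f} MiB of fp32 gradients per iteration")
# ~134M params -> ~512 MiB that NCCL has to all-reduce every step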
I profiled the code with NVIDIA Nsight Systems and saw a call to ncclAllReduceRingLLKernel_sum_f32 that takes approximately 500 ms per iteration.
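Dividing the payload by that kernel time gives a rough effective bandwidth (my own arithmetic, assuming the all-reduce is dominated by this one layer):

payload_gb = 134_218_752 * 4 / 1e9   # ~0.54 GB of fp32 gradients
t = 0.5                              # observed all-reduce time in seconds
print(f"effective bandwidth ~{payload_gb / t:.1f} GB/s")
# ~1.1 GB/s, which seems low next to the ~12 GB/s typically quoted for PCIe 3.0 x16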
Is this expected behaviour with this kind of network, or am I doing something wrong?
import torch
import torch.nn as nn
import argparse
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.nn.functional as F
from tqdm import tqdm
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        in_channels = 3
        out_channels = 64
        depth = 7
        m_features = [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
        ]
        for i in range(depth):
            in_channels = out_channels
            if i % 2 == 1:
                stride = 1
                out_channels *= 2
            else:
                stride = 2
            m_features.append(nn.Conv2d(
                in_channels, out_channels, 3, padding=1, stride=stride,
            ))
        self.features = nn.Sequential(*m_features)

        patch_size = 256 // (2 ** ((depth + 1) // 2))
        m_classifier = [
            # 512 * 16 * 16 = 131072 input features: this is the big layer
            nn.Linear(out_channels * patch_size ** 2, 1024),
            nn.LeakyReLU(negative_slope=0.2, inplace=False),
            nn.Linear(1024, 1)
        ]
        self.classifier = nn.Sequential(*m_classifier)

    def forward(self, f0):
        features = self.features(f0)
        output = self.classifier(features.view(features.size(0), -1))
        return output
torch.backends.cudnn.enabled = True # make sure to use cudnn for computational performance
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
args = parser.parse_args()
def train(rank, world_size):
    if world_size > 1:
        torch.distributed.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)

    discriminator = Discriminator()
    discriminator.to(rank)
    optimizer = torch.optim.Adam(
        discriminator.parameters(),
        lr=1e-5
    )

    # -- Initialize model for distributed training --
    if torch.cuda.device_count() > 1:
        discriminator = DDP(discriminator, device_ids=[rank])

    frame = torch.rand((1, 3, 256, 256), device=f"cuda:{rank}")
    d_01 = discriminator(frame)
    label_01 = torch.zeros_like(d_01)

    for i in tqdm(range(30)):
        # - Compute loss -
        d_01 = discriminator(frame)
        loss = F.binary_cross_entropy_with_logits(d_01, label_01)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def main():
    world_size = torch.cuda.device_count()
    # emit_nvtx adds NVTX ranges so Nsight Systems can attribute kernels to ops
    with torch.autograd.profiler.emit_nvtx():
        train(args.local_rank, world_size)

if __name__ == '__main__':
    main()
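For what it's worth, a bare all-reduce on a tensor with the same shape as the big layer's weight should isolate the NCCL cost from the rest of the training step. A minimal sketch (assuming the same torch.distributed.launch environment; the shape and timing loop are my own choices):

import time
import torch
import torch.distributed as dist

# Assumes torch.distributed.launch has set RANK/WORLD_SIZE/MASTER_* env vars.
dist.init_process_group("nccl", init_method="env://")
rank = dist.get_rank()  # equals the local rank on a single machine
torch.cuda.set_device(rank)

grad = torch.randn(131072, 1024, device=f"cuda:{rank}")  # same shape as the big Linear's weight
torch.cuda.synchronize()
start = time.time()
for _ in range(10):
    dist.all_reduce(grad)
torch.cuda.synchronize()
print(f"rank {rank}: {(time.time() - start) / 10 * 1000:.1f} ms per all-reduce")

If this standalone all-reduce also takes hundreds of milliseconds, the interconnect between the two GPUs would be the bottleneck rather than anything in the model code.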