When I have a multi-branch structure in my network and put the network on multiple GPUs, the network will not train or converge. Below is the full code to reproduce the problem.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import spectral_norm
class Discriminator(nn.Module):
    """Multi-branch convolutional discriminator.

    Four strided, spectrally-normalized conv blocks downsample the input
    (64x64 -> 4x4), then the shared feature map `feat` feeds TWO separate
    heads: `rf` (real/fake score) and `c` (class logits). This two-headed
    structure is what the bug report is about under nn.DataParallel.

    Args:
        ndf: base number of feature channels (doubled at each block).
        c_dim: number of output class channels for the `c` head.
        nc: number of input image channels (3 for RGB).
    """

    def __init__(self, ndf=64, c_dim=20, nc=3):
        super(Discriminator, self).__init__()
        # NOTE(review): combining spectral_norm with BatchNorm is unusual
        # (spectral_norm already constrains the Lipschitz constant); kept
        # as-is since it is part of the repro.
        self.block3 = nn.Sequential(
            spectral_norm(nn.Conv2d(nc, ndf, 4, 2, 1)),
            nn.BatchNorm2d(ndf * 1), nn.ReLU())
        self.block2 = nn.Sequential(
            spectral_norm(nn.Conv2d(ndf, ndf * 2, 4, 2, 1)),
            nn.BatchNorm2d(ndf * 2), nn.ReLU())
        self.block1 = nn.Sequential(
            spectral_norm(nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1)),
            nn.BatchNorm2d(ndf * 4), nn.ReLU())
        self.block0 = nn.Sequential(
            spectral_norm(nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1)),
            nn.BatchNorm2d(ndf * 8), nn.ReLU())
        # Head 1: per-class logits, 4x4 conv collapses spatial dims to 1x1.
        self.c = nn.Sequential(
            spectral_norm(nn.Conv2d(ndf * 8, c_dim, 4, 1, 0)),
        )
        # Head 2: scalar real/fake score.
        self.rf = spectral_norm(nn.Conv2d(ndf * 8, 1, 4, 1, 0))

    def forward(self, x):
        """Run the shared trunk, then both heads.

        Args:
            x: image batch of shape (N, nc, 64, 64).

        Returns:
            rf: flat tensor of shape (N,) — real/fake scores.
            c: tensor of shape (N, c_dim, 1, 1) — class logits.
        """
        feat = self.block3(x)
        feat = self.block2(feat)
        feat = self.block1(feat)
        feat = self.block0(feat)
        # `feat` is consumed by two branches — the multi-branch structure
        # the bug report identifies as the DataParallel trigger.
        rf = self.rf(feat).view(-1)
        c = self.c(feat)
        return rf, c
# --- Repro script: hinge-loss training of the discriminator on fixed
# random tensors; prints the loss each iteration so convergence is visible.
netD = Discriminator().cuda()
# Uncomment to reproduce the reported non-convergence:
#netD = nn.DataParallel(netD)
opt_D = optim.Adam(netD.parameters(), lr=0.0001, betas=(0.5, 0.99))

# Fixed "fake" and "real" batches of 64 RGB 64x64 images.
g_image = torch.randn(64, 3, 64, 64).cuda()
real_image = torch.randn(64, 3, 64, 64).cuda()

for itx in range(100):
    netD.zero_grad()
    # detach() mimics a generator output excluded from D's graph.
    pred_f, _ = netD(g_image.detach())
    pred_r, _ = netD(real_image)
    # Hinge loss: push real scores above +1 and fake scores below -1.
    d_loss = F.relu(1 - pred_r).mean() + F.relu(1 + pred_f).mean()
    d_loss.backward()
    opt_D.step()
    print(d_loss.item())
If you leave the line
netD = nn.DataParallel(netD)
commented out (as in the code above), the model trains without any problem; uncommenting it triggers the issue. For your convenience, this code prints the loss value every iteration, so you can see whether it converges.
As you can see, in my model the shared feature map “feat” goes through two different layers at the end, and this multi-branch structure is the cause of the “cannot converge under nn.DataParallel” issue.