Model parallelization on a multi-GPU setup not working

Hi,

I have a very deep model, and I am trying to implement model sharding because it cannot run on a single GPU. The code is below:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ConvolutionalAutoEncoder(nn.Module):
    def __init__(self):
        super(ConvolutionalAutoEncoder, self).__init__()
        self.encoder_block1 = nn.Sequential(
            nn.Conv2d(1, 64, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.Conv2d(64, 64, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(64) 
        )
        
        self.encoder_block2 = nn.Sequential(
            nn.Conv2d(64, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(128)
        )
        
        self.encoder_block3 = nn.Sequential(
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(128)
        )
        
        self.encoder_block4 = nn.Sequential(
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(128)
        )
        
        self.decoder_block4 = nn.Sequential(            
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(128)
        )
        
        self.decoder_block3 = nn.Sequential(            
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(128)
        )  
        
        self.decoder_block2 = nn.Sequential(            
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.ConvTranspose2d(128, 128, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(128)
        )
        
        self.decoder_block1 = nn.Sequential(   
            nn.ConvTranspose2d(128, 64, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
            nn.Tanh(),
            nn.BatchNorm2d(64)
         )
        
        self.decoder_block0 = nn.Sequential(  
            nn.ConvTranspose2d(64, 1, 3, stride=1, padding=1),
        )
        
        # Shard the model by hand: encoder blocks on cuda:0/1, decoder blocks on cuda:2/3
        self.encoder_block1 = self.encoder_block1.to('cuda:0')
        self.encoder_block2 = self.encoder_block2.to('cuda:0')
        self.encoder_block3 = self.encoder_block3.to('cuda:1')
        self.encoder_block4 = self.encoder_block4.to('cuda:1')
        self.decoder_block4 = self.decoder_block4.to('cuda:2')
        self.decoder_block3 = self.decoder_block3.to('cuda:2')
        self.decoder_block2 = self.decoder_block2.to('cuda:3')
        self.decoder_block1 = self.decoder_block1.to('cuda:3')
        self.decoder_block0 = self.decoder_block0.to('cuda:3')
        
    def forward(self, x):
        # Activations are moved between GPUs by hand as they cross shard boundaries
        x1 = self.encoder_block1(x)
        x2 = self.encoder_block2(x1)
        x2 = x2.to('cuda:1')
        x3 = self.encoder_block3(x2)
        x4 = self.encoder_block4(x3)
        x4 = x4.to('cuda:2')
        x3 = x3.to('cuda:2')  # skip connection must be on the same GPU as y4
        y4 = self.decoder_block4(x4)
        y3 = self.decoder_block3(y4+x3)
        y3 = y3.to('cuda:3')
        x2 = x2.to('cuda:3')
        y2 = self.decoder_block2(y3+x2)
        y1 = self.decoder_block1(y2)
        y0 = self.decoder_block0(y1)
        return y0

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

model = ConvolutionalAutoEncoder()
learning_rate = 0.001
weight_decay = 0.1
momentum = 0.9  # note: unused below; Adam does not take a momentum argument
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

params = list(model.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

num_epochs = 30
for epoch in range(num_epochs):
    for i, data in enumerate(train_loader):
        inp, targ = data
        inp = inp.to(device)
        targ = targ.to(device)

        output = model(inp)
        output = output.to(device)
        optimizer.zero_grad()
        loss = F.binary_cross_entropy_with_logits(output, targ)
        loss.backward()
        optimizer.step()
               
        if i % 10 == 0:
            for param_group in optimizer.param_groups:
                print("Current learning rate is: {}".format(param_group['lr']))
            print("Epoch[{}/{}]({}/{}): Loss: {:.4f}".format(epoch+1,num_epochs, i, len(train_loader), loss.item()))

The code runs, and I think the forward pass works: I can see the load being distributed across the different GPUs. But after one pass it stops and the code just hangs. I can't even kill it, and the only way out is to force a reboot.
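For reference, a small sketch of how the placement can be checked (this assumes the `model` instance above). `torch.cuda.synchronize()` waits for all queued work on a GPU, so a hang will surface at the synchronize call for whichever GPU is stuck:

    # Sketch: print which device each block's parameters live on
    for name, module in model.named_children():
        print(name, {p.device for p in module.parameters()})

    # Force each GPU to finish its queued work; a stuck GPU will hang here
    for d in range(torch.cuda.device_count()):
        torch.cuda.synchronize(d)
        print("cuda:{} ok".format(d))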

A screenshot of nvidia-smi is also attached:

[screenshot: nvidia-smi output]

No matter how I try to kill the process (PID 7711), it does not stop.

Is there something wrong with my code?

Please help.

EDIT:

Ok, so I think I solved the problem. I was computing the loss on cuda:0, but the last layer was on cuda:3, which I think caused a problem during backpropagation (not sure why it didn't throw an error and instead just froze completely).

So I moved the loss computation to cuda:3 as well, and it worked! :slight_smile:
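For reference, a minimal sketch of the change (assuming the model's output lives on cuda:3): the target is moved to the output's device before the loss is computed:

    output = model(inp)              # output is on cuda:3
    targ = targ.to(output.device)    # move target to the same GPU as the output
    optimizer.zero_grad()
    loss = F.binary_cross_entropy_with_logits(output, targ)
    loss.backward()
    optimizer.step()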
But for some reason I am not able to print the loss value, i.e.:

if i % 10 == 0:
    print("Epoch[{}/{}]({}/{}): Loss: {:.4f}".format(epoch+1, num_epochs, i, len(train_loader), loss.item()))

The code is freezing up in this part. Why would just printing the loss cause a problem?

Seems strange. I can't help much, but if I had to guess, I would recommend checking where the model parameters are stored, since computations usually require all tensors involved to be on the same GPU. With DataParallel the model parameters are allocated on a single GPU, so sharding by hand like this may produce an unseen bug. Anyway, you could try a toy model to see if you can reproduce the same behaviour.
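Along those lines, a minimal toy-model sketch (purely illustrative: two linear layers split across two GPUs, not the autoencoder above) that can be used to check whether a hand-written two-GPU pipeline hangs the same way:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ToyPipeline(nn.Module):
        """Tiny two-GPU pipeline: fc0 on cuda:0, fc1 on cuda:1."""
        def __init__(self):
            super().__init__()
            self.fc0 = nn.Linear(16, 16).to('cuda:0')
            self.fc1 = nn.Linear(16, 1).to('cuda:1')

        def forward(self, x):
            x = self.fc0(x.to('cuda:0'))
            return self.fc1(x.to('cuda:1'))  # hop the activation between GPUs

    model = ToyPipeline()
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    inp = torch.randn(8, 16)
    targ = torch.randn(8, 1, device='cuda:1')  # target on the output's GPU

    out = model(inp)
    loss = F.mse_loss(out, targ)
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(loss.item())  # if this also hangs, the issue is not specific to the big model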