Issue with Multiple GPU loss convergence

Hi everyone

I am trying to train a CNN for text classification on multiple GPUs. The model works fine on a single GPU, but when I use multiple GPUs, all of them have some memory allocated while only GPU 0 shows any volatile GPU utilization. Also, with multiple GPUs my training and validation losses aren't going down at all, whereas they converge well on a single GPU.

Can someone point me in the right direction?

System Configurations:
Pytorch 0.4
Python 3
4 GPUs
each has 16 GB memory

My CNN model
class CNN(nn.Module):
    """Multi-width 1-D convolutional text classifier with one sigmoid output.

    The embedded text is passed through several parallel ``Conv1d`` layers
    (one per ``(kernel_size, out_channels)`` pair in ``filters``), each output
    is max-pooled over the sequence, and the pooled features are concatenated,
    optionally batch-normalised, dropped out and fed to a single linear unit.

    Args:
        vocab: object exposing ``vectors``, a 2-D tensor of shape
            ``(vocab_size, embedding_dim)`` with the pretrained embeddings.
        filters: iterable of ``(kernel_size, out_channels)`` pairs.
        dp: dropout probability applied before the final linear layer.
        bn: if true, apply ``BatchNorm1d`` to the concatenated features.
        nl_func: non-linearity applied to every convolution output.
    """

    def __init__(self, vocab, filters=((3, 100), (4, 100), (5, 100), (7, 100)), dp=0.3, bn=True, nl_func=F.relu):
        super(CNN, self).__init__()
        V, D = vocab.vectors.size()
        self.embed = nn.Embedding(V, D)
        # Load the pretrained vectors before freezing the layer; otherwise the
        # frozen embedding would keep its random initialisation forever.
        self.embed.weight.data.copy_(vocab.vectors)
        self.embed.weight.requires_grad = False
        # Materialise once so that one-shot iterables can be traversed twice.
        filters = list(filters)
        self.conv_layers = nn.ModuleList(
            [nn.Conv1d(in_channels=D, out_channels=n, kernel_size=ksz) for ksz, n in filters]
        )
        self.amp = nn.AdaptiveMaxPool1d(1)
        self.dp = nn.Dropout(dp)
        self.nl_func = nl_func
        num_features = sum(n for _, n in filters)
        self.bnz = nn.BatchNorm1d(num_features) if bn else lambda x: x
        self.fc = nn.Linear(num_features, 1)

    def forward(self, text, label):
        """Return per-example probabilities of shape ``(1, batch)``.

        Args:
            text: integer token ids laid out as ``(seq_len, batch)``; the
                batch is the SECOND dimension.
            label: unused by the computation; kept so callers can pass the
                targets alongside the inputs.
        """
        # (seq_len, batch, D) -> (batch, D, seq_len), the layout Conv1d expects.
        text_embed = self.embed(text).permute(1, 2, 0)
        # Each pooled tensor has shape (batch, out_channels, 1).
        conv = [self.amp(self.nl_func(conv_layer(text_embed))) for conv_layer in self.conv_layers]
        concat_conv = torch.cat(conv, 2)
        concat_conv = concat_conv.view(concat_conv.size(0), -1)
        bn_concat_conv = self.bnz(concat_conv)
        dp_bn_concat_conv = self.dp(bn_concat_conv)
        out = self.fc(dp_bn_concat_conv)
        # (batch, 1) -> (1, batch) so the output matches the label layout.
        out = torch.sigmoid(out).transpose(0, 1)
        return out

# Maps the activation name stored in the config to the callable used by the model.
function_dict = {'relu':F.relu}
def get_model(vocab,config,model_type='CNN'):
    """Build the model described by ``config`` and wrap it for multi-GPU use.

    Args:
        vocab: vocabulary object forwarded to the model constructor.
        config: mapping with the keys ``"lr"``, ``"dp"``, ``"filter_szs"``,
            ``"filter_mp"``, ``"nl_func"`` and ``"embed_name"``.
        model_type: architecture name; only ``'CNN'`` is supported.

    Returns:
        ``(model, model_name)`` where ``model`` is a CUDA ``DataParallel``
        wrapper around the network and ``model_name`` is the checkpoint
        file name derived from the configuration.

    Raises:
        ValueError: if ``model_type`` is not ``'CNN'``.
    """
    if model_type != 'CNN':
        # Previously this fell through and returned None, which only surfaced
        # later as an opaque unpacking error at the call site.
        raise ValueError(f"Unsupported model_type: {model_type!r}")
    lr = config["lr"]
    dp = config["dp"]
    filter_szs = config["filter_szs"]
    filter_mp = config["filter_mp"]
    nl = function_dict.get(config["nl_func"],None)
    # Every kernel size gets the same number of output channels.
    filters = [(f,filter_mp) for f in filter_szs]
    model_name = f"emb_name_{config['embed_name']}_lr_{lr}_dp_{dp}_filters_{filter_szs}_filter_mps_{filter_mp}.pth"
    print(f"Configuration: Embedding_name-{config['embed_name']} lr-{lr}, dropout-{dp}, filter_szs-{filter_szs}, no of filters-{filter_mp}")
    model = CNN(vocab, filters = filters, dp = dp, bn=True, nl_func=nl).cuda()
    # The inputs and outputs carry the batch on dimension 1, not 0, so
    # DataParallel must scatter and gather along dim=1. With the default
    # dim=0 the batch is not what gets split across the GPUs.
    model = torch.nn.DataParallel(model, dim=1)
    return model, model_name

This is my training class
class TrainClassifier:
    """Train and evaluate a binary classifier with binary cross-entropy loss.

    Batches yielded by the loaders are ``(x_text, label)`` pairs in which
    ``label`` has the batch on dimension 1 (``label.size(1)`` is used as the
    batch size and ``squeeze(0)`` removes the leading dimension).
    """

    def __init__(self,trn_dl=None,val_dl=None):
        self.train_loader = trn_dl
        self.val_loader = val_dl
        # Created by train(); _train_iteration needs it to update the weights.
        self.optim = None

    def _train_iteration(self,model):
        """Run one optimisation pass over the training loader.

        Returns:
            The average per-example BCE loss over the epoch.

        Raises:
            RuntimeError: if no optimizer has been set yet (see ``train``).
        """
        if self.optim is None:
            raise RuntimeError("No optimizer set; call train() before _train_iteration().")
        # Setting model in train mode
        model.train()
        total,sum_loss = 0,0
        loss_function = nn.BCELoss(reduction='none').cuda()
        for x_text,label in self.train_loader:
            bsz = label.size(1)
            x_text = x_text.cuda()
            y = label.cuda()
            self.optim.zero_grad()
            y_hat = model(x_text,y)
            # Mean loss per example of this batch.
            loss = loss_function(y_hat,y).sum()/bsz
            # Without these two calls the weights never change and the
            # reported losses stay flat.
            loss.backward()
            self.optim.step()
            total += bsz
            sum_loss += bsz*(loss.item())
        return sum_loss/total

    def test(self,model,test_dl=None):
        """Evaluate ``model`` on ``test_dl`` (default: the validation loader).

        Returns:
            ``(avg_loss, truth, pred)`` where ``truth`` and ``pred`` are the
            concatenated labels and predicted probabilities after conversion
            by ``t2n``.
        """
        test_dl = test_dl if test_dl else self.val_loader
        # Evaluation mode: dropout off, batch-norm uses running statistics.
        model.eval()
        total, sum_loss = 0,0
        # Seeded with empty tensors so the final concatenation also works
        # when the loader yields no batches at all.
        truths,preds = [torch.tensor([])],[torch.tensor([])]
        loss_function = nn.BCELoss(reduction='none').cuda()
        with torch.no_grad():
            for x_text,label in test_dl:
                bsz = label.size(1)
                x_text = x_text.cuda()
                y = label.cuda()
                y_hat = model(x_text,y)
                loss = loss_function(y_hat,y).sum()/bsz
                sum_loss += bsz*(loss.item())
                total += bsz
                truths.append(y.squeeze(0).cpu())
                preds.append(y_hat.squeeze(0).cpu())
        # Concatenate once instead of re-allocating the result every batch.
        yT = torch.cat(truths,0)
        ypT = torch.cat(preds,0)
        return sum_loss/total,t2n(yT),t2n(ypT)

    def train(self,model,config,model_name='',verbose=True,final=True):
        """Train ``model`` for the epochs remaining according to ``config``.

        Args:
            model: network to optimise.
            config: mapping providing at least ``'lr'`` and ``'epochs'``.
            model_name: checkpoint name handed to ``load_checkpoint``.
            verbose: if true, report losses and precision/recall every epoch.
            final: flag guarding a final step after the last epoch.

        Returns:
            The model as returned by ``load_checkpoint``, trained if any
            epochs were left.
        """
        if self.train_loader is None or self.val_loader is None:
            print("Check whether you have passed train and test loader")
        self.optim = get_optimizer(model,config['lr'],wd=0)
        # loading a model if present
        model, self.optim, start_epoch = load_checkpoint(model, self.optim, model_name)
        if start_epoch < config['epochs']-1:
            for i in tqdm(range(start_epoch+1,config['epochs']),desc='Epochs:',dynamic_ncols=True):
                train_loss = self._train_iteration(model)
                vld_loss, truth, pred = self.test(model)
                precision, recall, f1 = classification_report(truth, pred,threshold=0.5)
                if verbose:
                    tqdm.write("Training BCE : %f 'Validation BCE: %f"%(train_loss,vld_loss))
                    tqdm.write(f'test precision @ 0.5:{precision}, test recall @ 0.5:{recall}')
            if final:
                # NOTE(review): the body of this branch is missing from the
                # source (presumably it saved the final model); restore the
                # original statement here. TODO confirm.
                pass
        return model

I solved my issue. Since the batch wasn't my first dimension, I had to pass dim=1 to DataParallel, i.e. `torch.nn.DataParallel(model, dim=1)`; that is the dimension along which my inputs need to be scattered.