Shallow vs. Deep Network

I have read in many documents and discussion forums that deep networks are more prone to vanishing gradients than shallow networks. But when I compare the loss of a deep network and a shallow network on the same data, both have more or less the same loss for the first couple of epochs. Why is that? Why is gradient vanishing not happening in the deep network and causing its loss to increase?

Deep Network

import torch
import torch.nn as nn
import torch.nn.functional as F

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(118,100)
        self.lin2 = nn.Linear(100,90)
        self.lin3 = nn.Linear(90,85)                
        self.lin4 = nn.Linear(85,70)
        self.lin5 = nn.Linear(70,65)
        self.lin6 = nn.Linear(65,60)
        self.lin7 = nn.Linear(60,55)
        self.lin8 = nn.Linear(55,50)
        self.lin9 = nn.Linear(50,45)
        self.lin10 = nn.Linear(45,40)
        self.lin11 = nn.Linear(40,35)
        self.lin12 = nn.Linear(35,30)
        self.lin13 = nn.Linear(30,25)
        self.lin14 = nn.Linear(25,20)
        self.lin15 = nn.Linear(20,18)
        self.lin16 = nn.Linear(18,16)
        self.lin17 = nn.Linear(16,12)
        self.lin18 = nn.Linear(12,10)
        self.lin19 = nn.Linear(10,8)
        self.lin20 = nn.Linear(8,6)
        self.lin21 = nn.Linear(6,4)
        self.lin22 = nn.Linear(4,2)

        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(90)
        self.bn3 = nn.BatchNorm1d(85)
        self.bn4 = nn.BatchNorm1d(70)
        self.bn5 = nn.BatchNorm1d(65)
        self.bn6 = nn.BatchNorm1d(60)
        self.bn7 = nn.BatchNorm1d(55)
        self.bn8 = nn.BatchNorm1d(50)
        self.bn9 = nn.BatchNorm1d(45)
        self.bn10 = nn.BatchNorm1d(40)
        self.bn11 = nn.BatchNorm1d(35)
        self.bn12 = nn.BatchNorm1d(30)
        self.bn13 = nn.BatchNorm1d(25)
        self.bn14 = nn.BatchNorm1d(20)
        self.bn15 = nn.BatchNorm1d(18)
        self.bn16 = nn.BatchNorm1d(16)
        self.bn17 = nn.BatchNorm1d(12)
        self.bn18 = nn.BatchNorm1d(10)
        self.bn19 = nn.BatchNorm1d(8)
        self.bn20 = nn.BatchNorm1d(6)
        self.bn21 = nn.BatchNorm1d(4)
        
    def forward(self, x):
        x = self.lin1(x) 
        x = self.bn1(x) 
        x = F.sigmoid(x)
        #x = F.relu(torch.cat([x,res],1))
        x = self.lin2(x)
        x = self.bn2(x) 
        x = F.sigmoid(x)

        x = self.lin3(x)
        x = self.bn3(x) 
        x = F.sigmoid(x)
        
        x = self.lin4(x)
        x = self.bn4(x) 
        x = F.sigmoid(x)

        x = self.lin5(x)
        x = self.bn5(x) 
        x = F.sigmoid(x)
        
        x = self.lin6(x)
        x = self.bn6(x) 
        x = F.sigmoid(x)        
        
        x = self.lin7(x)
        x = self.bn7(x) 
        x = F.sigmoid(x)
        
        x = self.lin8(x)
        x = self.bn8(x) 
        x = F.sigmoid(x)
        
        x = self.lin9(x)
        x = self.bn9(x) 
        x = F.relu(x)
        
        x = self.lin10(x)
        x = self.bn10(x) 
        x = F.sigmoid(x)        
        
        x = self.lin11(x)
        x = self.bn11(x) 
        x = F.sigmoid(x)        
        
        x = self.lin12(x)
        x = self.bn12(x) 
        x = F.sigmoid(x)        
        
        x = self.lin13(x)
        x = self.bn13(x) 
        x = F.sigmoid(x)
        
        x = self.lin14(x)
        x = self.bn14(x) 
        x = F.sigmoid(x)
        
        x = self.lin15(x)
        x = self.bn15(x) 
        x = F.sigmoid(x)     
        
        x = self.lin16(x)
        x = self.bn16(x) 
        x = F.sigmoid(x)     
        
        x = self.lin17(x)
        x = self.bn17(x) 
        x = F.sigmoid(x)     
        
        x = self.lin18(x)
        x = self.bn18(x) 
        x = F.sigmoid(x)     
        
        x = self.lin19(x)
        x = self.bn19(x) 
        x = F.sigmoid(x)     
        
        x = self.lin20(x)
        x = self.bn20(x) 
        x = F.sigmoid(x)     
        
        x = self.lin21(x)
        x = self.bn21(x) 
        x = F.sigmoid(x)     
        
        x = self.lin22(x)        
        return x
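
A quick sanity check of the forward pass with a dummy batch (a sketch using the imports above; deep_model is just an illustrative name, and the 118 input features / 2 output logits come from the definition):

deep_model = model()
dummy = torch.randn(16, 118)        # batch of 16 samples, 118 features
print(deep_model(dummy).shape)      # torch.Size([16, 2])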

Optimizer

def get_optimizer(model, lr):

    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01) # 1e-2 , 0.01,0.02,0.03,0.04
    my_lr_scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=1000, gamma=0.1)
  
    return optim,my_lr_scheduler

Training Model

def train_loop(model, epochs, lr):
    sum_loss = 0
    output   = 0
            
    criterion = nn.CrossEntropyLoss()
    optim,my_lr_scheduler = get_optimizer(model, lr = lr)
    
    for epoch in range(epochs):
        preds = []
        
        for i, (inputs,labels) in enumerate(dataloader):
            
            # forward, backward, update weights
            model.train()
            batch = labels.shape[0]            
            output = model(inputs)
            _,pred = torch.max(output,1)
            loss = criterion(output, labels)
            optim.zero_grad()
            loss.backward()
            optim.step()
            sum_loss += batch*(loss.item())
            
            if (i+1) % 10 ==0:
                print(f'epoch : {epoch+1},training loss : {loss}')
                
batch_size = 9000
dataset= Dataset(X,y)
dataloader= DataLoader(dataset=dataset, batch_size=batch_size,shuffle=True)

total_samples = len(dataset)
n_iterations  = math.ceil(total_samples/batch_size)
train_loop(model, epochs=5, lr=0.01)
epoch : 1,training loss : 0.6930996775627136
epoch : 1,training loss : 0.6933792233467102
epoch : 1,training loss : 0.6929088830947876
epoch : 1,training loss : 0.6930387616157532
epoch : 1,training loss : 0.6932080984115601
epoch : 1,training loss : 0.6932775378227234
epoch : 1,training loss : 0.6931776404380798
epoch : 1,training loss : 0.6929720640182495
epoch : 1,training loss : 0.6922922134399414
epoch : 1,training loss : 0.6927770972251892
epoch : 2,training loss : 0.6929906606674194
epoch : 2,training loss : 0.6931278705596924
epoch : 2,training loss : 0.6935908794403076
epoch : 2,training loss : 0.6931191682815552
epoch : 2,training loss : 0.6931272745132446
epoch : 2,training loss : 0.6933028697967529
epoch : 2,training loss : 0.6931471228599548
epoch : 2,training loss : 0.6932592988014221
epoch : 2,training loss : 0.6932491064071655
epoch : 2,training loss : 0.6931505799293518
epoch : 3,training loss : 0.6932841539382935
epoch : 3,training loss : 0.6931620240211487
epoch : 3,training loss : 0.693145215511322
epoch : 3,training loss : 0.6931430101394653
epoch : 3,training loss : 0.693149745464325
epoch : 3,training loss : 0.6933196187019348
epoch : 3,training loss : 0.6931200623512268
epoch : 3,training loss : 0.69309401512146
epoch : 3,training loss : 0.693195104598999
epoch : 3,training loss : 0.6931386590003967
epoch : 4,training loss : 0.6932438611984253
epoch : 4,training loss : 0.6931557655334473
epoch : 4,training loss : 0.6932560205459595
epoch : 4,training loss : 0.6931642293930054
epoch : 4,training loss : 0.6935083270072937
epoch : 4,training loss : 0.6931833028793335
epoch : 4,training loss : 0.6933455467224121
epoch : 4,training loss : 0.6932011246681213
epoch : 4,training loss : 0.6931713819503784
epoch : 4,training loss : 0.6936143636703491
epoch : 5,training loss : 0.6932041645050049
epoch : 5,training loss : 0.6931434273719788
epoch : 5,training loss : 0.6931551098823547
epoch : 5,training loss : 0.6929137110710144
epoch : 5,training loss : 0.6931952834129333
epoch : 5,training loss : 0.6932055950164795
epoch : 5,training loss : 0.6933173537254333
epoch : 5,training loss : 0.6933751702308655
epoch : 5,training loss : 0.6931929588317871
epoch : 5,training loss : 0.6933628916740417

Shallow Network

class model(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(118,100)
        self.lin2 = nn.Linear(100,40)
        self.lin3 = nn.Linear(40,2)                
        
        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(40)

    def forward(self, x):

        x = self.lin1(x) 
        x = self.bn1(x) 
        x = F.sigmoid(x)
        
        x = self.lin2(x)
        x = self.bn2(x) 
        x = F.sigmoid(x)
        
        x = self.lin3(x)        
        return x
model = model()
to_device(model, device)

Training

batch_size = 9000
dataset= Dataset(X,y)
dataloader= DataLoader(dataset=dataset, batch_size=batch_size,shuffle=True)

total_samples = len(dataset)
n_iterations  = math.ceil(total_samples/batch_size)
train_loop(model, epochs=5, lr=0.01)
epoch : 1,training loss : 0.7008593678474426
epoch : 1,training loss : 0.6940442323684692
epoch : 1,training loss : 0.6933980584144592
epoch : 1,training loss : 0.6913265585899353
epoch : 1,training loss : 0.6920694708824158
epoch : 1,training loss : 0.6917922496795654
epoch : 1,training loss : 0.6916401386260986
epoch : 1,training loss : 0.6912254691123962
epoch : 1,training loss : 0.6912662982940674
epoch : 1,training loss : 0.6917388439178467
epoch : 2,training loss : 0.6924116015434265
epoch : 2,training loss : 0.6922740340232849
epoch : 2,training loss : 0.6921844482421875
epoch : 2,training loss : 0.6925062537193298
epoch : 2,training loss : 0.6929883360862732
epoch : 2,training loss : 0.6928806900978088
epoch : 2,training loss : 0.6929706335067749
epoch : 2,training loss : 0.69304358959198
epoch : 2,training loss : 0.6931059956550598
epoch : 2,training loss : 0.693105936050415
epoch : 3,training loss : 0.6940449476242065
epoch : 3,training loss : 0.6944928169250488
epoch : 3,training loss : 0.6931028366088867
epoch : 3,training loss : 0.693550169467926
epoch : 3,training loss : 0.6934279203414917
epoch : 3,training loss : 0.693369448184967
epoch : 3,training loss : 0.6930176019668579
epoch : 3,training loss : 0.6930411458015442
epoch : 3,training loss : 0.6931909918785095
epoch : 3,training loss : 0.6931660771369934
epoch : 4,training loss : 0.6931740045547485
epoch : 4,training loss : 0.6933149099349976
epoch : 4,training loss : 0.6931800842285156
epoch : 4,training loss : 0.6932072639465332
epoch : 4,training loss : 0.6931265592575073
epoch : 4,training loss : 0.6930846571922302
epoch : 4,training loss : 0.6930670142173767
epoch : 4,training loss : 0.6931456923484802
epoch : 4,training loss : 0.6934738159179688
epoch : 4,training loss : 0.693654477596283
epoch : 5,training loss : 0.6934962272644043
epoch : 5,training loss : 0.6931811571121216
epoch : 5,training loss : 0.693188488483429
epoch : 5,training loss : 0.6933542490005493
epoch : 5,training loss : 0.6932279467582703
epoch : 5,training loss : 0.6931856274604797
epoch : 5,training loss : 0.6932218670845032
epoch : 5,training loss : 0.6937268972396851
epoch : 5,training loss : 0.6930850148200989
epoch : 5,training loss : 0.6932364106178284

Both networks (deep and shallow) use the same batch size, number of epochs, and optimizer with the same parameters. Why does the deep network not show the vanishing gradient phenomenon, which I expected to see as an increasing training loss? What am I missing here? Please give me some idea.

A deeper network does not always imply that you will face vanishing gradients. It also depends on your data and other factors.
Note that you are also using batch normalization, which helps prevent vanishing and exploding gradients.
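
If you want to check this directly instead of inferring it from the loss curve, you can print per-layer gradient norms right after loss.backward() in train_loop. A minimal sketch (the helper name log_grad_norms is just illustrative):

def log_grad_norms(model):
    # Print the L2 norm of each weight gradient; call this right after loss.backward().
    for name, param in model.named_parameters():
        if param.grad is not None and name.endswith("weight"):
            print(f"{name:<15} grad norm: {param.grad.norm().item():.3e}")

If the early layers (lin1, lin2, ...) show norms several orders of magnitude smaller than the last layers, gradients are vanishing; with batch norm after every linear layer you will usually not see that pattern.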


It also seems that neither model is training properly (both are stuck at the initial loss value), so I wouldn't read too much into the loss values.
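
For context: with two balanced classes, a classifier that effectively predicts 50/50 has a cross-entropy loss of -ln(0.5) = ln(2) ≈ 0.6931, which is exactly where both models are sitting. A quick sanity check (a standalone sketch, independent of the models above):

import math
import torch
import torch.nn as nn

print(math.log(2))  # 0.6931... = chance-level loss for a balanced 2-class problem

logits = torch.zeros(4, 2)           # equal logits -> softmax gives [0.5, 0.5]
labels = torch.tensor([0, 1, 0, 1])  # labels don't matter here; the loss is ln(2) either way
print(nn.CrossEntropyLoss()(logits, labels).item())  # ~0.6931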


@ptrblck yes, the features overlap heavily for this binary classification problem. There is no specific region where the classes overlap; class 1 and class 0 are mixed at roughly a 50/50 ratio in every region for every feature. I created density plots of all features for class 1 vs. class 0, and they overlap so much that you cannot tell which KDE belongs to class 1 and which to class 0. Any idea how to handle this situation?
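
For reference, a minimal sketch of how such per-feature density comparisons can be produced (seaborn/matplotlib, the helper name, and the assumption that X is a pandas DataFrame with y as a 0/1 label array are mine, not from the original post):

import matplotlib.pyplot as plt
import seaborn as sns

def plot_class_kdes(X, y, max_features=6):
    # Overlay class-0 vs. class-1 density estimates for the first few features.
    for col in list(X.columns)[:max_features]:
        plt.figure(figsize=(4, 3))
        sns.kdeplot(X.loc[y == 0, col], label="class 0")
        sns.kdeplot(X.loc[y == 1, col], label="class 1")
        plt.title(col)
        plt.legend()
        plt.show()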