I have read in many documents and discussion forums that deep networks are more prone to vanishing gradients than shallow networks. But when I compare the training loss of a deep network and a shallow network on the same data, both show more or less the same loss over the first couple of epochs. Why is that? Why doesn't gradient vanishing show up in the deep network and cause its loss to stall or even increase?
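For context, this is the back-of-envelope reasoning I have in mind (my own rough sketch, taking the worst-case sigmoid derivative): sigmoid'(x) = sigmoid(x)(1 - sigmoid(x)) <= 0.25, so a gradient backpropagated through n sigmoid activations is scaled by a product of factors each at most 0.25:

# Back-of-envelope sketch: worst-case gradient scaling through n
# sigmoid activations (sigmoid'(x) <= 0.25 everywhere).
# 2 and 21 are roughly the activation counts of the shallow and deep
# networks below (the deep one has a single ReLU among its 21).
for n in (2, 21):
    print(f'{n} sigmoid layers: worst-case gradient scale <= {0.25 ** n:.2e}')
# 2 sigmoid layers: worst-case gradient scale <= 6.25e-02
# 21 sigmoid layers: worst-case gradient scale <= 2.27e-13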
Deep Network
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(118, 100)
        self.lin2 = nn.Linear(100, 90)
        self.lin3 = nn.Linear(90, 85)
        self.lin4 = nn.Linear(85, 70)
        self.lin5 = nn.Linear(70, 65)
        self.lin6 = nn.Linear(65, 60)
        self.lin7 = nn.Linear(60, 55)
        self.lin8 = nn.Linear(55, 50)
        self.lin9 = nn.Linear(50, 45)
        self.lin10 = nn.Linear(45, 40)
        self.lin11 = nn.Linear(40, 35)
        self.lin12 = nn.Linear(35, 30)
        self.lin13 = nn.Linear(30, 25)
        self.lin14 = nn.Linear(25, 20)
        self.lin15 = nn.Linear(20, 18)
        self.lin16 = nn.Linear(18, 16)
        self.lin17 = nn.Linear(16, 12)
        self.lin18 = nn.Linear(12, 10)
        self.lin19 = nn.Linear(10, 8)
        self.lin20 = nn.Linear(8, 6)
        self.lin21 = nn.Linear(6, 4)
        self.lin22 = nn.Linear(4, 2)
        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(90)
        self.bn3 = nn.BatchNorm1d(85)
        self.bn4 = nn.BatchNorm1d(70)
        self.bn5 = nn.BatchNorm1d(65)
        self.bn6 = nn.BatchNorm1d(60)
        self.bn7 = nn.BatchNorm1d(55)
        self.bn8 = nn.BatchNorm1d(50)
        self.bn9 = nn.BatchNorm1d(45)
        self.bn10 = nn.BatchNorm1d(40)
        self.bn11 = nn.BatchNorm1d(35)
        self.bn12 = nn.BatchNorm1d(30)
        self.bn13 = nn.BatchNorm1d(25)
        self.bn14 = nn.BatchNorm1d(20)
        self.bn15 = nn.BatchNorm1d(18)
        self.bn16 = nn.BatchNorm1d(16)
        self.bn17 = nn.BatchNorm1d(12)
        self.bn18 = nn.BatchNorm1d(10)
        self.bn19 = nn.BatchNorm1d(8)
        self.bn20 = nn.BatchNorm1d(6)
        self.bn21 = nn.BatchNorm1d(4)

    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = F.sigmoid(x)
        x = self.lin2(x)
        x = self.bn2(x)
        x = F.sigmoid(x)
        x = self.lin3(x)
        x = self.bn3(x)
        x = F.sigmoid(x)
        x = self.lin4(x)
        x = self.bn4(x)
        x = F.sigmoid(x)
        x = self.lin5(x)
        x = self.bn5(x)
        x = F.sigmoid(x)
        x = self.lin6(x)
        x = self.bn6(x)
        x = F.sigmoid(x)
        x = self.lin7(x)
        x = self.bn7(x)
        x = F.sigmoid(x)
        x = self.lin8(x)
        x = self.bn8(x)
        x = F.sigmoid(x)
        x = self.lin9(x)
        x = self.bn9(x)
        x = F.relu(x)  # note: ReLU here, unlike the sigmoid after every other layer
        x = self.lin10(x)
        x = self.bn10(x)
        x = F.sigmoid(x)
        x = self.lin11(x)
        x = self.bn11(x)
        x = F.sigmoid(x)
        x = self.lin12(x)
        x = self.bn12(x)
        x = F.sigmoid(x)
        x = self.lin13(x)
        x = self.bn13(x)
        x = F.sigmoid(x)
        x = self.lin14(x)
        x = self.bn14(x)
        x = F.sigmoid(x)
        x = self.lin15(x)
        x = self.bn15(x)
        x = F.sigmoid(x)
        x = self.lin16(x)
        x = self.bn16(x)
        x = F.sigmoid(x)
        x = self.lin17(x)
        x = self.bn17(x)
        x = F.sigmoid(x)
        x = self.lin18(x)
        x = self.bn18(x)
        x = F.sigmoid(x)
        x = self.lin19(x)
        x = self.bn19(x)
        x = F.sigmoid(x)
        x = self.lin20(x)
        x = self.bn20(x)
        x = F.sigmoid(x)
        x = self.lin21(x)
        x = self.bn21(x)
        x = F.sigmoid(x)
        x = self.lin22(x)
        return x
Optimizer
def get_optimizer(model, lr):
    # weight_decay values tried: 0.01, 0.02, 0.03, 0.04
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
    my_lr_scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=1000, gamma=0.1)
    return optim, my_lr_scheduler
Training Model
def train_loop(model, epochs, lr):
    sum_loss = 0  # accumulated below but never reported
    criterion = nn.CrossEntropyLoss()
    optim, my_lr_scheduler = get_optimizer(model, lr=lr)
    # note: my_lr_scheduler is never stepped, so the learning rate stays at lr
    model.train()
    for epoch in range(epochs):
        for i, (inputs, labels) in enumerate(dataloader):
            # forward, backward, update weights
            batch = labels.shape[0]
            output = model(inputs)
            _, pred = torch.max(output, 1)  # predicted classes (unused here)
            loss = criterion(output, labels)
            optim.zero_grad()
            loss.backward()
            optim.step()
            sum_loss += batch * loss.item()
            if (i + 1) % 10 == 0:
                print(f'epoch : {epoch+1},training loss : {loss}')
model = Model()
to_device(model, device)  # to_device, device, Dataset, X, y are defined elsewhere

batch_size = 9000
dataset = Dataset(X, y)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
total_samples = len(dataset)
n_iterations = math.ceil(total_samples / batch_size)
train_loop(model, epochs=5, lr=0.01)
epoch : 1,training loss : 0.6930996775627136
epoch : 1,training loss : 0.6933792233467102
epoch : 1,training loss : 0.6929088830947876
epoch : 1,training loss : 0.6930387616157532
epoch : 1,training loss : 0.6932080984115601
epoch : 1,training loss : 0.6932775378227234
epoch : 1,training loss : 0.6931776404380798
epoch : 1,training loss : 0.6929720640182495
epoch : 1,training loss : 0.6922922134399414
epoch : 1,training loss : 0.6927770972251892
epoch : 2,training loss : 0.6929906606674194
epoch : 2,training loss : 0.6931278705596924
epoch : 2,training loss : 0.6935908794403076
epoch : 2,training loss : 0.6931191682815552
epoch : 2,training loss : 0.6931272745132446
epoch : 2,training loss : 0.6933028697967529
epoch : 2,training loss : 0.6931471228599548
epoch : 2,training loss : 0.6932592988014221
epoch : 2,training loss : 0.6932491064071655
epoch : 2,training loss : 0.6931505799293518
epoch : 3,training loss : 0.6932841539382935
epoch : 3,training loss : 0.6931620240211487
epoch : 3,training loss : 0.693145215511322
epoch : 3,training loss : 0.6931430101394653
epoch : 3,training loss : 0.693149745464325
epoch : 3,training loss : 0.6933196187019348
epoch : 3,training loss : 0.6931200623512268
epoch : 3,training loss : 0.69309401512146
epoch : 3,training loss : 0.693195104598999
epoch : 3,training loss : 0.6931386590003967
epoch : 4,training loss : 0.6932438611984253
epoch : 4,training loss : 0.6931557655334473
epoch : 4,training loss : 0.6932560205459595
epoch : 4,training loss : 0.6931642293930054
epoch : 4,training loss : 0.6935083270072937
epoch : 4,training loss : 0.6931833028793335
epoch : 4,training loss : 0.6933455467224121
epoch : 4,training loss : 0.6932011246681213
epoch : 4,training loss : 0.6931713819503784
epoch : 4,training loss : 0.6936143636703491
epoch : 5,training loss : 0.6932041645050049
epoch : 5,training loss : 0.6931434273719788
epoch : 5,training loss : 0.6931551098823547
epoch : 5,training loss : 0.6929137110710144
epoch : 5,training loss : 0.6931952834129333
epoch : 5,training loss : 0.6932055950164795
epoch : 5,training loss : 0.6933173537254333
epoch : 5,training loss : 0.6933751702308655
epoch : 5,training loss : 0.6931929588317871
epoch : 5,training loss : 0.6933628916740417
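Note that every one of these losses hovers around 0.6931, which (if I read the logs right) is exactly ln 2, the cross-entropy of predicting a 50/50 split between the two classes:

import math
print(math.log(2))  # 0.6931471805599453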
Shallow Network
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(118, 100)
        self.lin2 = nn.Linear(100, 40)
        self.lin3 = nn.Linear(40, 2)
        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(40)

    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = F.sigmoid(x)
        x = self.lin2(x)
        x = self.bn2(x)
        x = F.sigmoid(x)
        x = self.lin3(x)
        return x
model = Model()
to_device(model, device)
Training
batch_size = 9000
dataset = Dataset(X, y)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
total_samples = len(dataset)
n_iterations = math.ceil(total_samples / batch_size)
train_loop(model, epochs=5, lr=0.01)
epoch : 1,training loss : 0.7008593678474426
epoch : 1,training loss : 0.6940442323684692
epoch : 1,training loss : 0.6933980584144592
epoch : 1,training loss : 0.6913265585899353
epoch : 1,training loss : 0.6920694708824158
epoch : 1,training loss : 0.6917922496795654
epoch : 1,training loss : 0.6916401386260986
epoch : 1,training loss : 0.6912254691123962
epoch : 1,training loss : 0.6912662982940674
epoch : 1,training loss : 0.6917388439178467
epoch : 2,training loss : 0.6924116015434265
epoch : 2,training loss : 0.6922740340232849
epoch : 2,training loss : 0.6921844482421875
epoch : 2,training loss : 0.6925062537193298
epoch : 2,training loss : 0.6929883360862732
epoch : 2,training loss : 0.6928806900978088
epoch : 2,training loss : 0.6929706335067749
epoch : 2,training loss : 0.69304358959198
epoch : 2,training loss : 0.6931059956550598
epoch : 2,training loss : 0.693105936050415
epoch : 3,training loss : 0.6940449476242065
epoch : 3,training loss : 0.6944928169250488
epoch : 3,training loss : 0.6931028366088867
epoch : 3,training loss : 0.693550169467926
epoch : 3,training loss : 0.6934279203414917
epoch : 3,training loss : 0.693369448184967
epoch : 3,training loss : 0.6930176019668579
epoch : 3,training loss : 0.6930411458015442
epoch : 3,training loss : 0.6931909918785095
epoch : 3,training loss : 0.6931660771369934
epoch : 4,training loss : 0.6931740045547485
epoch : 4,training loss : 0.6933149099349976
epoch : 4,training loss : 0.6931800842285156
epoch : 4,training loss : 0.6932072639465332
epoch : 4,training loss : 0.6931265592575073
epoch : 4,training loss : 0.6930846571922302
epoch : 4,training loss : 0.6930670142173767
epoch : 4,training loss : 0.6931456923484802
epoch : 4,training loss : 0.6934738159179688
epoch : 4,training loss : 0.693654477596283
epoch : 5,training loss : 0.6934962272644043
epoch : 5,training loss : 0.6931811571121216
epoch : 5,training loss : 0.693188488483429
epoch : 5,training loss : 0.6933542490005493
epoch : 5,training loss : 0.6932279467582703
epoch : 5,training loss : 0.6931856274604797
epoch : 5,training loss : 0.6932218670845032
epoch : 5,training loss : 0.6937268972396851
epoch : 5,training loss : 0.6930850148200989
epoch : 5,training loss : 0.6932364106178284
Both networks (deep and shallow) were trained with the same batch size, the same number of epochs, and the same optimizer with the same parameters. Why does the deep network not show the vanishing-gradient phenomenon, which I expected to appear as a training loss that stalls or even increases relative to the shallow network? Instead, both networks settle at roughly 0.6931 ≈ ln 2, the chance-level cross-entropy for two classes. What am I missing here? Please give me some idea.
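For reference, here is a minimal diagnostic sketch (my own idea, not part of the training code above) of how one could check directly whether the gradients in the deep network actually vanish, by printing each linear layer's weight-gradient norm after a single backward pass:

# Diagnostic sketch: run one forward/backward pass and print the gradient
# norm of every linear layer's weight matrix. If gradients vanish, lin1's
# norm should be orders of magnitude smaller than lin22's.
inputs, labels = next(iter(dataloader))
model.train()
loss = nn.CrossEntropyLoss()(model(inputs), labels)
model.zero_grad()
loss.backward()
for name, p in model.named_parameters():
    if name.startswith('lin') and name.endswith('weight'):
        print(f'{name}: grad norm = {p.grad.norm().item():.3e}')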