The loss never decreases: it stays essentially constant at every epoch, stuck at the cross-entropy of random guessing over the 10 CIFAR-10 classes, i.e. -log(1/10) ≈ 2.30.
I suspect backpropagation is not working, but I am not sure how to debug it.
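A minimal sanity check I could add right after lossi.backward() in the training loop below (just a sketch, not part of the run shown here) is to print the gradient norms of a few of the logit parameters; al1, bl1, a1, b1 and lossi are the names defined in the code that follows:

    # Sketch: place right after lossi.backward() inside the training loop.
    # If the norms print as None or stay at 0, gradients are not reaching the logits.
    for name, p in [('al1', al1), ('bl1', bl1), ('a1', a1), ('b1', b1)]:
        print(name, 'grad norm =', None if p.grad is None else p.grad.norm().item())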
import torch
import torch.nn.functional as F

def initialize(wfp):
    # Map the full-precision weights wfp to the logits (a, b) of two sigmoids.
    wtilde = wfp / torch.std(wfp)
    sigma_a = 0.95 - ((0.95 - 0.05) * torch.abs(wtilde))
    sigma_b = 0.5 * (1 + (wfp / (1 - sigma_a)))
    sigma_a = torch.clamp(sigma_a, 0.05, 0.95)
    sigma_b = torch.clamp(sigma_b, 0.05, 0.95)
    a = torch.log(sigma_a / (1 - sigma_a)).requires_grad_().cuda()
    b = torch.log(sigma_b / (1 - sigma_b)).requires_grad_().cuda()
    return a, b
# Full-precision weights: six conv layers and two fully connected layers
w1fpconv = convlayer1param()
w2fpconv = convlayer2param()
w3fpconv = convlayer3param()
w4fpconv = convlayer4param()
w5fpconv = convlayer5param()
w6fpconv = convlayer6param()
wfp1 = model['layer4.1.weight']
wfp2 = model['layer4.4.weight']

# Logits (a, b) for every layer
al1, bl1 = initialize(w1fpconv)
al2, bl2 = initialize(w2fpconv)
al3, bl3 = initialize(w3fpconv)
al4, bl4 = initialize(w4fpconv)
al5, bl5 = initialize(w5fpconv)
al6, bl6 = initialize(w6fpconv)
a1, b1 = initialize(wfp1)
a2, b2 = initialize(wfp2)

# Wrap the logits as trainable parameters
al1 = torch.nn.Parameter(al1)
bl1 = torch.nn.Parameter(bl1)
al2 = torch.nn.Parameter(al2)
bl2 = torch.nn.Parameter(bl2)
al3 = torch.nn.Parameter(al3)
bl3 = torch.nn.Parameter(bl3)
al4 = torch.nn.Parameter(al4)
bl4 = torch.nn.Parameter(bl4)
al5 = torch.nn.Parameter(al5)
bl5 = torch.nn.Parameter(bl5)
al6 = torch.nn.Parameter(al6)
bl6 = torch.nn.Parameter(bl6)
a1 = torch.nn.Parameter(a1)
b1 = torch.nn.Parameter(b1)
a2 = torch.nn.Parameter(a2)
b2 = torch.nn.Parameter(b2)

betaparam = 1e-11
lossfunc = torch.nn.CrossEntropyLoss().to(device)
lr = 0.01
optimizer = torch.optim.Adam([al1, bl1, al2, bl2, al3, bl3, al4, bl4,
                              al5, bl5, al6, bl6, a1, b1, a2, b2],
                             lr, weight_decay=5e-4)
num_epochs = 10
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass through the six reparameterized conv layers
        y1 = reparamcnn1(al1, bl1, images)
        y2 = reparamcnn2(al2, bl2, y1)
        y3 = reparamcnn3(al3, bl3, y2)
        y4 = reparamcnn4(al4, bl4, y3)
        y5 = reparamcnn5(al5, bl5, y4)
        y6 = reparamcnn6(al6, bl6, y5)
        y6 = y6.reshape(y6.size(0), -1)
        y6 = torch.t(y6)

        # Fully connected head
        y7 = F.dropout(y6)
        y8 = reparamfc(a1, b1, y7)
        y9 = F.relu(y8)
        y10 = F.dropout(y9)
        yout = reparamfc(a2, b2, y10)
        yout = torch.t(yout)
        # yout = F.softmax(yout, dim=1)   # not needed: CrossEntropyLoss expects logits

        # L2 penalty on all logit parameters
        l2 = (al1.norm(2) + bl1.norm(2) + al2.norm(2) + bl2.norm(2)
              + al3.norm(2) + bl3.norm(2) + al4.norm(2) + bl4.norm(2)
              + al5.norm(2) + bl5.norm(2) + al6.norm(2) + bl6.norm(2)
              + a1.norm(2) + b1.norm(2) + a2.norm(2) + b2.norm(2))
        lossi = lossfunc(yout, labels) + (betaparam * l2)

        # Learning-rate drop (never triggers with num_epochs = 10)
        if epoch == 170:
            lr = 0.001
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        lossi.backward()
        optimizer.step()
        optimizer.zero_grad()

    print('epoch {}'.format(epoch), 'loss = {}'.format(lossi.item()))
The output looks like this:
epoch 0 loss = 2.305433988571167
epoch 1 loss = 2.3047266006469727
epoch 2 loss = 2.2993619441986084
epoch 3 loss = 2.305569887161255
epoch 4 loss = 2.303546667098999
epoch 5 loss = 2.2977681159973145
epoch 6 loss = 2.2988994121551514
epoch 7 loss = 2.305543899536133
epoch 8 loss = 2.304884672164917
epoch 9 loss = 2.3079733848571777
epoch 10 loss = 2.2997756004333496
epoch 11 loss = 2.2982029914855957
epoch 12 loss = 2.3063526153564453
epoch 13 loss = 2.3051438331604004
epoch 14 loss = 2.299895763397217
epoch 15 loss = 2.2976086139678955
epoch 16 loss = 2.303872585296631
epoch 17 loss = 2.304962635040283
epoch 18 loss = 2.292499303817749
epoch 19 loss = 2.3069281578063965
epoch 20 loss = 2.3034133911132812
epoch 21 loss = 2.3061203956604004
epoch 22 loss = 2.3057847023010254
epoch 23 loss = 2.3092713356018066
epoch 24 loss = 2.3067853450775146
epoch 25 loss = 2.3024075031280518
epoch 26 loss = 2.306104898452759
epoch 27 loss = 2.3030776977539062
epoch 28 loss = 2.302023410797119
epoch 29 loss = 2.304934024810791
epoch 30 loss = 2.3043360710144043
epoch 31 loss = 2.303095579147339
epoch 32 loss = 2.304739475250244
epoch 33 loss = 2.305116891860962
epoch 34 loss = 2.305945873260498
The reparamcnn* and reparamfc functions (not shown here) perform the forward propagation through the corresponding layers of the network, using the logits a and b.
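For illustration only, here is a sketch of the kind of forward step involved. The weight below is just the mean implied by initialize() (inverting it gives wfp = (1 - sigma_a) * (2 * sigma_b - 1)); my actual reparamcnn* functions are more involved, and the padding and activation here are placeholders:

    # Hypothetical sketch, not the actual reparamcnn code: builds the mean weight
    # implied by initialize() and applies a plain convolution with it.
    def reparamconv_sketch(a, b, x):
        w_mean = (1 - torch.sigmoid(a)) * (2 * torch.sigmoid(b) - 1)  # inverse of initialize()
        return F.relu(F.conv2d(x, w_mean, padding=1))                 # padding/activation are placeholders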