Optimizer doesn't update weights

Can someone help me understand why the weights are not updating?

import torch

unet = Unet()
optimizer = torch.optim.Adam(unet.parameters(), lr=0.1)
loss_fn = torch.nn.BCELoss()

input = torch.randn(32, 1, 64, 64, 64, requires_grad=True)
target = torch.randn(32, 1, 64, 64, 64, requires_grad=False)

optimizer.zero_grad()
y_pred = unet(input)
y = target[:, :, 20:44, 20:44, 20:44]
loss = loss_fn(y_pred, y)

print(unet.conv1.weight.data)
loss.backward()
optimizer.step()
print(unet.conv1.weight.data)



The init of the model is defined like this:

import torch.nn as nn
import torch.nn.functional as F


class Unet(nn.Module):

    def __init__(self):
      super(Unet, self).__init__()

      # Down hill1
      self.conv1 = nn.Conv3d(1, 2, kernel_size=3,  stride=1)
      self.conv2 = nn.Conv3d(2, 2, kernel_size=3,  stride=1)

      # Down hill2
      self.conv3 = nn.Conv3d(2, 4, kernel_size=3,  stride=1)
      self.conv4 = nn.Conv3d(4, 4, kernel_size=3,  stride=1)

      #bottom
      self.convbottom1 = nn.Conv3d(4, 8, kernel_size=3,  stride=1)
      self.convbottom2 = nn.Conv3d(8, 8, kernel_size=3,  stride=1)

      #up hill1
      self.upConv0 = nn.Conv3d(8, 4, kernel_size=3,  stride=1)
      self.upConv1 = nn.Conv3d(4, 4, kernel_size=3,  stride=1)
      self.upConv2 = nn.Conv3d(4, 2, kernel_size=3,  stride=1)

      #up hill2
      self.upConv3 = nn.Conv3d(2, 2, kernel_size=3, stride=1)
      self.upConv4 = nn.Conv3d(2, 1, kernel_size=1, stride=1)

      self.mp = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)

      self.output1 = 0
      self.output2 = 0
      self.output3 = 0
      self.output4 = 0
      self.output5 = 0
      self.output6 = 0
      self.output7 = 0
      self.output8 = 0
      self.output9 = 0
      self.output10 = 0
      self.output11 = 0
      self.output12 = 0
      self.output13 = 0
      self.output14 = 0

The forward pass follows this approach:

    def forward(self, input):
       # Use U-net Theory to Update the filters.
       # Example Approach...
       self.output1 = F.relu(self.conv1(input))
       self.output2 = F.relu(self.conv2(self.output1))

       self.output3 = self.mp(self.output2)

       self.output4 = F.relu(self.conv3(self.output3))
       self.output5 = F.relu(self.conv4(self.output4))

       self.output6 = self.mp(self.output5)

       self.output7 = F.relu(self.convbottom1(self.output6))
       self.output8 = F.relu(self.convbottom2(self.output7))

       self.output9 = F.interpolate(self.output8, scale_factor=2, mode='trilinear')

       self.output10 = F.relu(self.upConv0(self.output9))
       self.output11 = F.relu(self.upConv1(self.output10))

       self.output12 = F.interpolate(self.output11, scale_factor=2, mode='trilinear')


       self.output13 = F.relu(self.upConv2(self.output12))
       self.output14 = F.relu(self.upConv3(self.output13))

       return F.relu(self.upConv4(self.output14))

I have found out that y_pred.grad is None. Then I can understand why the optimizer is not moving the weights in any direction. What I can't grasp is why the gradients are None, and why the optimizer doesn't produce an error.
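One way to sanity-check this (a minimal sketch, using the names from the snippet above): y_pred is a non-leaf tensor, so its .grad is None by default; the gradients the optimizer actually uses live on the leaf parameters.

# After loss.backward(): inspect the gradients of the leaf parameters.
# y_pred.grad stays None unless y_pred.retain_grad() is called before backward().
for name, param in unet.named_parameters():
    print(name, None if param.grad is None else param.grad.abs().sum().item())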

Besides some minor issues, your code looks fine.
Could you post the complete model definition so that I could take a look at it?

The minor problems are:

  • Call the model directly for your forward pass, i.e. use model(input) instead of model.forward(input) as the former call will properly register all hooks etc.
  • Variables are deprecated since PyTorch 0.4.0. Just remove the Variable wrapping in your code.
  • Your comparison will always return True, since before holds a reference to the weight's data. Use unet.conv4.weight.data.clone() to deep copy the data (see the sketch below).
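A minimal sketch of that check, using the names from the question (the .clone() call is the important part):

# Deep copy the weights before the update; without .clone(), "before" would just
# alias the same storage that the optimizer modifies in place.
before = unet.conv1.weight.data.clone()

optimizer.zero_grad()
loss = loss_fn(unet(input), target[:, :, 20:44, 20:44, 20:44])
loss.backward()
optimizer.step()

# Prints False if the optimizer actually changed the weights.
print(torch.equal(before, unet.conv1.weight.data))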

I have edited my post and supplied you with all the code you need to run it yourself. At the moment the weights change maybe 2 out of 10 times you run the code :open_mouth:
So by far the majority of the time, the weights never change on my end.

Btw, this turned out to be a problem of dead ReLUs. Switching to LeakyReLU or something else got the weights to behave a little more like expected, but the network would eventually die. Upon switching from the Adam to the SGD optimizer, the network never dies anymore and learns. Not sure why that did the trick (a sketch of the swap is below).
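For reference, a minimal sketch of the kind of swap described above (the negative slope and the SGD hyperparameters are assumptions, not values from the thread):

import torch
import torch.nn.functional as F

def act(x):
    # The leaky variant keeps a small gradient for negative pre-activations,
    # so units are less likely to die than with plain ReLU.
    return F.leaky_relu(x, negative_slope=0.01)

# In forward(), use e.g. self.output1 = act(self.conv1(input)) instead of F.relu(...),
# and swap the optimizer:
optimizer = torch.optim.SGD(unet.parameters(), lr=0.01, momentum=0.9)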


Awesome to hear you’ve debugged this issue! :slight_smile:

Hi Ptrblck,

I add the second loss to the first loss and expect the gradients, weights, and results to change, but there is no difference compared to when I use just one loss function. The first one is BCELoss and the second one is L1.
I checked the gradients in both cases, with loss1 and with loss1+loss2, and they were exactly the same. Adding the second loss has no effect on the gradients, even when I use loss1+10*loss2 (see the sketch after the code below).

netG = Generator994(ngpu,nz,ngf).to(device)

optimizerG = optim.Adam(netG.parameters(), lr=lr2, betas=(beta1, 0.999))

netG.zero_grad()

label.fill_(real_label)  
label=label.to(device)
output = netD(fake).view(-1)
# Calculate G's loss based on this output
loss1 = criterion(output, label)


xxx=torch.histc(Gaussy.squeeze(1).view(-1).cpu(),100, min=0, max=1, out=None)
ddGaussy=xxx/xxx.sum()

xxx1=torch.histc(fake.squeeze(1).view(-1).cpu(),100, min=0, max=1, out=None)
ddFake=xxx1/xxx1.sum()

loss2=abs(ddGaussy-ddFake).sum()

# Calculate gradients for G with 2 loss

errG=loss1+loss2
errG.backward()

for param in netG.parameters():
    print(param.grad.data.sum())
# Update G
optimizerG.step()
 
## ------------------
class Generator994(nn.Module):
    def __init__(self,ngpu,nz,ngf):
        super(Generator994, self).__init__()
        self.ngpu=ngpu
        self.nz=nz
        self.ngf=ngf
        self.l1= nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(self.nz, self.ngf * 8, 3, 1, 0, bias=False),
            nn.BatchNorm2d(self.ngf * 8),
            nn.ReLU(True),)
            # state size. (ngf*8) x 4 x 4
        self.l2=nn.Sequential(nn.ConvTranspose2d(self.ngf * 8, self.ngf * 4, 3, 1, 0, bias=False),
            nn.BatchNorm2d(self.ngf * 4),
            nn.ReLU(True),)
            # state size. (ngf*4) x 8 x 8
        self.l3=nn.Sequential(nn.ConvTranspose2d( self.ngf * 4, self.ngf * 2, 3, 1, 0, bias=False),
            nn.BatchNorm2d(self.ngf * 2),
            nn.ReLU(True),)
            # state size. (ngf*2) x 16 x 16
        self.l4=nn.Sequential(nn.ConvTranspose2d( self.ngf*2, 1, 3, 1, 0, bias=False),nn.Sigmoid()
#            nn.Tanh()
            # state size. (nc) x 64 x 64
        )

    def forward(self, input):
        out=self.l1(input)
        out=self.l2(out)
        out=self.l3(out)
        out=self.l4(out)
        print(out.shape)
        return out
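A minimal sketch of one way to compare the two cases directly (the helper below is an assumption; names follow the snippet above, and loss1/loss2 are assumed to have been computed already):

import torch

def grad_snapshot(model):
    # Clone each gradient so a later backward() call doesn't overwrite the copy.
    return [p.grad.detach().clone() for p in model.parameters()]

# Case 1: loss1 only.
netG.zero_grad()
loss1.backward(retain_graph=True)
grads_loss1 = grad_snapshot(netG)

# Case 2: loss1 + loss2, through the same retained graph.
netG.zero_grad()
(loss1 + loss2).backward()
grads_both = grad_snapshot(netG)

# Prints True if loss2 contributes nothing to netG's gradients.
print(all(torch.equal(a, b) for a, b in zip(grads_loss1, grads_both)))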

Double post with answer from here.