Model weights not updating at all

I am trying to run a depth map super resolution model on Google Colab.
However, when I run the model, none of my weights update at all, despite the loss being computed properly and all of the layers getting gradients.

I have compared each of the model’s named parameters before and after the optimizer.step() function, and all of them return True.

What is happening? What should I try first to fix this?

Part of my code looks like this:

def train(train_loader, val_loader, optimizer, scheduler, model, criterion, last_step, max_steps, model_out_path):
for param_group in optimizer.param_groups:
    param_group["lr"] = learning_rate
model.train()
current_step = last_step + 1

train_interval = 50
val_interval = 250
save_interval = 2000

# optimization run
while True:
    for batch in train_loader:
        #optimizer.zero_grad()
        if current_step > max_steps:
            break

        if current_step % train_interval == 0:
            print("step ", current_step, " of ", int(max_steps))

        hYh, hDl, hDh = batch[0], batch[1], batch[2]
        hYh = hYh.float()
        hDl = hDl.float()
        hDh = hDh.float()

        if cuda:
            hYh = hYh.cuda()
            hDl = hDl.cuda()
            hDh = hDh.cuda()

        loss = criterion(model(hYh, hDl), hDh)    # now has multi scale # and forgot about indexing from the hapazard old implementation
        #optimizer.zero_grad()
        a = list(model.parameters())[0].clone()
        #optimizer.zero_grad()
        loss.backward()
        if current_step % train_interval == 0:
            print('loss: ', loss)
        '''
        if current_step % train_interval == 0:
            for param in model.parameters():
                print(param.grad.data.sum())
        '''
        optimizer.step()
        scheduler.step()    # learning rate decay set
        optimizer.zero_grad()

        b = list(model.parameters())[0].clone()
        '''
        if current_step % train_interval == 0:
            for i in range(0, len(a)):
                print(torch.equal(a[i].data, b[i].data))
        '''

        if current_step % val_interval == 0:
            print("===> Step[{}]: Loss: {}".format(current_step, loss.data))
            writer.add_scalar("Training Loss", loss.data, current_step)
            validate(val_loader, optimizer, model, criterion, current_step, save_interval)

        if current_step % save_interval == 0:
            save_checkpoint(model, optimizer, scheduler, current_step, model_out_path)

        current_step += 1

This shouldn’t happen if you compare the right values and not only their references.
Could you post an executable code snippet using random tensors, which would reproduce this issue so that we could have a loop at it, please?

How can I feed random tensors into this?

You could check the shapes of the input tensors and create new random tensors using this shape via x = torch.randn(shape).

So does this mean I should feed a random tensor and then use it to run one update of step() to see what happens?

Yes, you could do this and post the executable code snippet here in case you get stuck.
Your current code is not execuable, i.e. I cannot just copy-paste it, run it, and reproduce the issue, which would be possible if you use random tensors and post the missing parts of the code.

Feeding a random tensor and performing one step worked completely fine.

What did not work fine is when I iterated over that random tensor multiple times, at which point the parameters eventually stopped updating altogether after a few dozen steps despite the gradient not being zero.

Here is my network definition:

class FeatureExtractD(nn.Module):

def __init__(self):
    super(FeatureExtractD, self).__init__()
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1, bias=True)
    self.conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1, bias=True)
    self.conv3 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding=1, bias=True)
    self.prelu1 = nn.PReLU(num_parameters=64, init=0.25)
    self.prelu2 = nn.PReLU(num_parameters=64, init=0.25)
    self.prelu3 = nn.PReLU(num_parameters=32, init=0.25)

def forward(self, x):
    out = self.conv1(x)
    out = self.prelu1(out)
    #print(out.shape)
    out = self.conv2(out)
    out = self.prelu2(out)
    #print(out.shape)
    out = self.conv3(out)
    out = self.prelu3(out)
    #print(out.shape)
    return out

class UpBlock(nn.Module):
def init(self):
super(UpBlock, self).init()
self.conv1 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1, bias=True)
self.conv2 = nn.Conv2d(in_channels=32, out_channels=128, kernel_size=3, stride=1, padding=1, bias=True)
self.ps = nn.PixelShuffle(upscale_factor=2)
self.relu = nn.ReLU()

def forward(self, x):
    up = self.conv1(x)
    up = self.relu(up)
    up = self.conv2(up)
    up = self.ps(up)
    return up

class MSNet4(nn.Module):
def init(self):
super(MSNet4, self).init()
self.feat = FeatureExtractD()
self.up1 = UpBlock()
self.up2 = UpBlock()
self.recon = nn.Conv2d(in_channels=32, out_channels=1, kernel_size=3, stride=1, padding=1, bias=True)

def forward(self, d):
    f = self.feat(d)
    f = self.up1(f)
    f = self.up2(f)
    f = self.recon(f)
    return f

And here is the code used for random training:

device = 'cuda' if torch.cuda.is_available() else 'cpu' print('Using {} device'.format(device))

cuda = True
learning_rate = 1e-4

if cuda:
print("=> use gpu id: ‘{}’".format(gpus))
os.environ[“CUDA_VISIBLE_DEVICES”] = gpus
if not torch.cuda.is_available():
raise Exception(“No GPU found or Wrong GPU ID, please run without --cuda”)

seed = random.randint(1, 10000)
print(“Random Seed: {}”.format(seed))
torch.manual_seed(seed)
if cuda:
torch.cuda.manual_seed(seed)

torch.autograd.set_detect_anomaly(True)
cudnn.benchmark = True

net = MSNet4() # 4x version
print(’# network parameters: ', sum(param.numel() for param in net.parameters()))

#criterion = nn.L1Loss()
criterion = nn.MSELoss()

if torch.cuda.is_available():
net.cuda()

print(‘setting optimizer…’)
optimizer = optim.Adam(net.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08)
#optimizer = optim.SGD(params=net.parameters(), lr=learning_rate, momentum=0.9)
print(‘setting scheduler…’)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.8)
print_params(net)

print(‘Training Start!’)
for param_group in optimizer.param_groups:
param_group[“lr”] = learning_rate

net.train()

Dh_rand = torch.rand([128, 1, 64, 64])
Dl_rand = torch.rand([128, 1, 16, 16])

if cuda:
Dl_rand = Dl_rand.cuda()
Dh_rand = Dh_rand.cuda()

current_step = 0
limit = 1000
train_interval = 50

while True:
if current_step > limit:
break

loss = criterion(net(Dl_rand), Dh_rand)    # random tensor training
a = list(net.parameters())[0].clone()
loss.backward()

optimizer.step()
scheduler.step()    # learning rate decay set

if current_step % train_interval == 0:
    print('Step [{}] loss: {}'.format(current_step, loss))
    print('Printing gradient sums for params...')
    for param in net.parameters():
        print(param.grad.data.sum().data)

optimizer.zero_grad()

b = list(net.parameters())[0].clone()

if current_step % train_interval == 0:
    print('Printing comparisons of weights before and after update; True means weight is not updating')
    for i in range(0, len(a)):
        print(torch.equal(a[i].data, b[i].data), end=', ')
    print('')
    print('comparison done.')

current_step += 1

Never mind, I changed the code to compare the named parameter values properly and they all update perfectly fine. So I guess I misidentified the actual problem all this time.

Guess this is another learning experience for me…