Using SGD with weight_norm: weight magnitudes only update in last layer

I’m reimplementing a small convolutional network that is trained with the SGD optimiser, and I need to make use of the weight_norm functionality. However, when I set up weight_norm and watch how the network weights change during training, something looks odd: the weight_v values change in all of the layers, but the weight_g values only change in the last layer of the network. Is this to be expected?
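
My understanding is that weight_norm reparameterises each weight as w = g * v / ||v||, so weight_g carries the per-output-channel magnitude and weight_v the direction. A quick sketch of that decomposition on a standalone layer (with the default dim=0):

import torch
import torch.nn as nn

layer = nn.utils.weight_norm(nn.Linear(4, 3))
# with dim=0 there is one magnitude per output row of the weight
v, g = layer.weight_v, layer.weight_g
w = g * v / v.norm(dim=1, keepdim=True)
print(torch.allclose(w, layer.weight))  # True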

When I change the optimiser to Adam, both the weight_v and weight_g values change in all of the layers during training, as I had expected.
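
For the Adam comparison I only swap the optimiser line; something along these lines, with the learning rate kept the same (I don't think the exact Adam hyperparameters matter for what I'm seeing):

optimiser = optim.Adam(net.parameters(), lr=0.002)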

Here is a small example to demonstrate what I’m seeing, in case my code has an error:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class RandomDataset(Dataset):

    def __init__(self, length):
        self.len = length
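        # random 0/1 inputs of shape (length, 4, 600) and multi-label 0/1 targets of shape (length, 100)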
        self.data = torch.randint(0, 2, (length, 4, 600))
        self.labels = torch.randint(0, 2, (length, 100))

    def __getitem__(self, index):
        return self.data[index], self.labels[index]

    def __len__(self):
        return self.len

class View(nn.Module):
    # flattens everything after the batch dimension
    def forward(self, x):
        return x.view(x.size(0), -1)

def init_weights(m):
    if isinstance(m, (nn.Linear, nn.Conv1d)):
        # weight_norm wraps the module in place: 'weight' is replaced by
        # 'weight_g' (magnitude) and 'weight_v' (direction)
        nn.utils.weight_norm(m)

def get_model():

    model = nn.Sequential(
        nn.Conv1d(4, 300, 19),
        nn.BatchNorm1d(300),
        nn.ReLU(),
        nn.MaxPool1d(3),
        nn.Conv1d(300, 200, 11),
        nn.BatchNorm1d(200),
        nn.ReLU(),
        nn.MaxPool1d(4),
        View(),
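        # 600 -> conv(19) -> 582 -> pool(3) -> 194 -> conv(11) -> 184 -> pool(4) -> 46; 200 * 46 = 9200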
        nn.Linear(9200, 100),
        nn.Sigmoid(),
    )
    return model

def train_model():

    net = get_model()
    net.apply(init_weights)

    for m in net.modules():
        if isinstance(m, (nn.Conv1d, nn.Linear)):
            print("Magnitude: Mean: {:.4f} Max: {:.4f} Min: {:.4f}".format(m.weight_g.mean(), m.weight_g.max(), m.weight_g.min()))
            print("Direction: Mean: {:.4f} Max: {:.4f} Min: {:.4f}".format(m.weight_v.mean(), m.weight_v.max(), m.weight_v.min()))

    if torch.cuda.device_count() > 1:
        print("Using {} GPUs".format(torch.cuda.device_count()))
        net = nn.DataParallel(net)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    net.to(device)

    criterion = nn.BCELoss() # binary cross-entropy loss
    optimiser = optim.SGD(net.parameters(), lr=0.002, momentum=0.98)

    trainloader = DataLoader(dataset=RandomDataset(1000),
                             batch_size=64, shuffle=True, num_workers=10)

    for epoch in range(5):  # loop over the dataset multiple times

        print("Epoch {}".format(epoch+1))
        train_loop(trainloader, device, optimiser, criterion, net)

        layer = 0
        for m in net.modules():
            layer += 1
            if isinstance(m, (nn.Conv1d, nn.Linear)):
                print("Layer: {}".format(layer))
                print("Magnitude: Mean: {:.4f} Max: {:.4f} Min: {:.4f}".format(m.weight_g.mean(), m.weight_g.max(), m.weight_g.min()))
                print("Direction: Mean: {:.4f} Max: {:.4f} Min: {:.4f}".format(m.weight_v.mean(), m.weight_v.max(), m.weight_v.min()))
          
    print('Finished Training')

def train_loop(trainloader, device, optimiser, criterion, net):

    net.train()

    for inputs, labels in trainloader:
        inputs = inputs.float().to(device)
        labels = labels.float().to(device)

        # zero the parameter gradients
        optimiser.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimiser.step()

        # drop references so empty_cache() below can actually release the memory
        del loss, inputs, labels, outputs

    torch.cuda.empty_cache()

# Running the model
if __name__ == '__main__':
    train_model()

The output looks like this:

Magnitude: Mean: 0.5769 Max: 0.6556 Min: 0.4752
Direction: Mean: -0.0003 Max: 0.1147 Min: -0.1147
Magnitude: Mean: 0.5774 Max: 0.5873 Min: 0.5672
Direction: Mean: -0.0000 Max: 0.0174 Min: -0.0174
Magnitude: Mean: 0.5770 Max: 0.5835 Min: 0.5685
Direction: Mean: -0.0000 Max: 0.0104 Min: -0.0104
cpu
Epoch 1
Layer: 2
Magnitude: Mean: 0.5769 Max: 0.6556 Min: 0.4752
Direction: Mean: -0.0003 Max: 0.1147 Min: -0.1147
Layer: 6
Magnitude: Mean: 0.5774 Max: 0.5873 Min: 0.5672
Direction: Mean: -0.0000 Max: 0.0175 Min: -0.0175
Layer: 11
Magnitude: Mean: 0.5767 Max: 0.5834 Min: 0.5677
Direction: Mean: 0.0000 Max: 0.0108 Min: -0.0107

<snip>

Epoch 5
Layer: 2
Magnitude: Mean: 0.5769 Max: 0.6556 Min: 0.4752
Direction: Mean: -0.0003 Max: 0.1158 Min: -0.1156
Layer: 6
Magnitude: Mean: 0.5774 Max: 0.5873 Min: 0.5672
Direction: Mean: -0.0000 Max: 0.0190 Min: -0.0188
Layer: 11
Magnitude: Mean: 0.5737 Max: 0.5822 Min: 0.5608
Direction: Mean: -0.0000 Max: 0.0122 Min: -0.0120
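
In case weight_g is actually changing by less than the four decimal places printed above, I can also snapshot the tensors before training and compare them directly afterwards; a sketch of that check (not part of the run above):

# copy every weight_g before training
g_before = {name: p.detach().clone()
            for name, p in net.named_parameters()
            if name.endswith("weight_g")}

train_loop(trainloader, device, optimiser, criterion, net)

# report the largest absolute change per layer
for name, p in net.named_parameters():
    if name.endswith("weight_g"):
        delta = (p.detach() - g_before[name]).abs().max().item()
        print("{}: max |delta weight_g| = {:.3e}".format(name, delta))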