I’m reimplementing a small convolutional network that is trained with the SGD optimiser and needs to use the weight_norm functionality. However, when I set up weight_norm and looked at how the network weights change, the behaviour looked a bit odd: if I look at the weight_v and weight_g values for each layer while training, the weight_v values change in all the layers, but the weight_g values only change in the last layer in the network. Is this to be expected?
When I change the optimiser to the Adam optimiser, the weight_v and weight_g values change in all the layers while training, as I had expected.
Here is a small example to demonstrate what I’m seeing, in case my code has an error:
import torch.nn as nn
import torch.optim as optim
import torch
from torch.utils.data import Dataset, DataLoader
class RandomDataset(Dataset):
    """Synthetic dataset of random 0/1 inputs paired with random 0/1 labels.

    Every sample is a ``(4, 600)`` integer tensor with a ``(100,)`` integer
    label tensor. Both tensors are drawn once, at construction time, with
    ``torch.randint(0, 2, ...)``.
    """

    def __init__(self, length):
        """Draw ``length`` random samples and their labels."""
        sample_shape = (length, 4, 600)
        label_shape = (length, 100)
        self.len = length
        # Inputs are drawn before labels so the RNG consumption order is fixed.
        self.data = torch.randint(0, 2, sample_shape)
        self.labels = torch.randint(0, 2, label_shape)

    def __len__(self):
        """Return the number of samples requested at construction."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(input, label)`` pair stored at ``index``."""
        sample = self.data[index]
        target = self.labels[index]
        return sample, target
class View(nn.Module):
    """Stateless layer that collapses every non-batch dimension into one."""

    def forward(self, x):
        """Return ``x`` viewed as ``(batch, -1)``, keeping dimension 0 intact."""
        batch_size = x.shape[0]
        return x.view(batch_size, -1)
def init_weights(m):
    """Apply weight normalisation to ``m`` if it is a Linear or Conv1d layer.

    Written to be passed to ``Module.apply``. The module is modified in
    place (it gains ``weight_g`` and ``weight_v``); any other module type is
    left untouched. Returns ``None``.
    """
    if not isinstance(m, (nn.Linear, nn.Conv1d)):
        return
    # weight_norm hooks the module in place, so its return value is not needed.
    nn.utils.weight_norm(m)
def get_model():
    """Build the two-stage 1-D convolutional classifier as an ``nn.Sequential``.

    The network takes ``(batch, 4, 600)`` inputs and produces ``(batch, 100)``
    sigmoid outputs. Sequence length through the stack:
    600 -> 582 (conv, k=19) -> 194 (pool 3) -> 184 (conv, k=11) -> 46 (pool 4),
    so the flattened feature size is 200 * 46 = 9200.

    NOTE(review): both conv layers feed straight into BatchNorm1d, which
    rescales each output channel. That likely makes the loss (almost)
    insensitive to a per-channel weight-norm magnitude on those layers --
    confirm before relying on ``weight_g`` being trained there.
    """
    first_stage = [
        nn.Conv1d(4, 300, 19),
        nn.BatchNorm1d(300),
        nn.ReLU(),
        nn.MaxPool1d(3),
    ]
    second_stage = [
        nn.Conv1d(300, 200, 11),
        nn.BatchNorm1d(200),
        nn.ReLU(),
        nn.MaxPool1d(4),
    ]
    classifier = [
        View(),  # flatten (batch, 200, 46) to (batch, 9200)
        nn.Linear(9200, 100),
        nn.Sigmoid(),
    ]
    return nn.Sequential(*first_stage, *second_stage, *classifier)
def train_model():
    """Train the weight-normalised network and report its weight statistics.

    Builds the model, applies weight normalisation to every Conv1d/Linear
    layer, prints mean/max/min of ``weight_g`` and ``weight_v`` for those
    layers, then trains for five epochs with SGD on random data, printing
    the same statistics (tagged with the module's position in
    ``net.modules()``) after each epoch.
    """
    normed_types = (nn.Conv1d, nn.Linear)

    def report_stats(module):
        # Summarise the weight-norm magnitude (weight_g) and direction (weight_v).
        g = module.weight_g
        v = module.weight_v
        print("Magnitude: Mean: {:.4f} Max: {:.4f} Min: {:.4f}".format(g.mean(), g.max(), g.min()))
        print("Direction: Mean: {:.4f} Max: {:.4f} Min: {:.4f}".format(v.mean(), v.max(), v.min()))

    net = get_model()
    net.apply(init_weights)

    # Baseline statistics before any optimisation step.
    for module in net.modules():
        if isinstance(module, normed_types):
            report_stats(module)

    if torch.cuda.device_count() > 1:
        print("Using {} GPUs".format(torch.cuda.device_count()))
        net = nn.DataParallel(net)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    net.to(device)

    criterion = nn.BCELoss()  # binary cross-entropy loss
    optimiser = optim.SGD(net.parameters(), lr=0.002, momentum=0.98)
    trainloader = DataLoader(
        dataset=RandomDataset(1000),
        batch_size=64,
        shuffle=True,
        num_workers=10,
    )

    for epoch in range(5):  # loop over the dataset multiple times
        print("Epoch {}".format(epoch+1))
        train_loop(trainloader, device, optimiser, criterion, net)
        # Positions are 1-based and count every module, including containers.
        for layer, module in enumerate(net.modules(), start=1):
            if isinstance(module, normed_types):
                print("Layer: {}".format(layer))
                report_stats(module)

    print('Finished Training')
def train_loop(trainloader, device, optimiser, criterion, net):
    """Run one pass over ``trainloader``, taking an optimiser step per batch.

    Args:
        trainloader: iterable yielding ``(inputs, labels)`` pairs.
        device: device the batch tensors are moved to.
        optimiser: optimiser whose gradients are zeroed and stepped.
        criterion: callable computing the loss from ``(outputs, labels)``.
        net: the model; it is switched to training mode first.
    """
    net.train()
    for raw_inputs, raw_labels in trainloader:
        # Each batch is cast to float and moved to the target device.
        batch_x = raw_inputs.float().to(device)
        batch_y = raw_labels.float().to(device)

        # Clear gradients left over from the previous step.
        optimiser.zero_grad()

        # Forward pass, backward pass, parameter update.
        predictions = net(batch_x)
        batch_loss = criterion(predictions, batch_y)
        batch_loss.backward()
        optimiser.step()

        # Drop references to this batch's tensors before releasing cached memory.
        del batch_loss, batch_x, batch_y, predictions
        torch.cuda.empty_cache()
# Script entry point: train the model only when this file is executed directly.
if __name__ == "__main__":
    train_model()
The output looks like this:
Magnitude: Mean: 0.5769 Max: 0.6556 Min: 0.4752
Direction: Mean: -0.0003 Max: 0.1147 Min: -0.1147
Magnitude: Mean: 0.5774 Max: 0.5873 Min: 0.5672
Direction: Mean: -0.0000 Max: 0.0174 Min: -0.0174
Magnitude: Mean: 0.5770 Max: 0.5835 Min: 0.5685
Direction: Mean: -0.0000 Max: 0.0104 Min: -0.0104
cpu
Epoch 1
Layer: 2
Magnitude: Mean: 0.5769 Max: 0.6556 Min: 0.4752
Direction: Mean: -0.0003 Max: 0.1147 Min: -0.1147
Layer: 6
Magnitude: Mean: 0.5774 Max: 0.5873 Min: 0.5672
Direction: Mean: -0.0000 Max: 0.0175 Min: -0.0175
Layer: 11
Magnitude: Mean: 0.5767 Max: 0.5834 Min: 0.5677
Direction: Mean: 0.0000 Max: 0.0108 Min: -0.0107
<snip>
Epoch 5
Layer: 2
Magnitude: Mean: 0.5769 Max: 0.6556 Min: 0.4752
Direction: Mean: -0.0003 Max: 0.1158 Min: -0.1156
Layer: 6
Magnitude: Mean: 0.5774 Max: 0.5873 Min: 0.5672
Direction: Mean: -0.0000 Max: 0.0190 Min: -0.0188
Layer: 11
Magnitude: Mean: 0.5737 Max: 0.5822 Min: 0.5608
Direction: Mean: -0.0000 Max: 0.0122 Min: -0.0120