Simply adding an nn.Linear to a module changes SGD behavior

I have observed what I regard as strange/undesirable behavior. This was observed with Python 3.7.4; I have not verified other versions.

In essence, simply adding an nn.Linear module which is not used anywhere in the network changes the behavior of SGD and Adam.

Here’s a simple example showing the behavior:

import torch
from torch import nn, optim
import numpy as np
import torch.utils.data as data_utils
from torch.nn.parameter import Parameter


class test(nn.Module):
    def __init__(self):
        super(test, self).__init__()
        self.a = Parameter(torch.zeros(1,1),requires_grad=True)
#        self.nuisance = torch.nn.Linear(10,10,bias=True)

    def forward(self, x):
        return self.a*x

    def loss_function(self, x):
        return torch.sum(self.forward(x))

def get_loader():
    x_train = torch.rand(1024,1)
    y_train = torch.rand(1024,1)

    train = data_utils.TensorDataset(x_train, y_train)
    train_loader = data_utils.DataLoader(train, batch_size=128, shuffle=True)
    return train_loader

device = "cpu"
model = test().to(device)
train_loader = get_loader()
optimizer = optim.SGD(model.parameters(),lr=0.0001)

def train():

    # set the model to train mode
    model.train()

    train_loss = 0

    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.to(device)
        optimizer.zero_grad()
        loss_total = model.loss_function(data)
        loss_total.backward()
        optimizer.step()
        train_loss += loss_total.item()
        print(loss_total.item() / len(data))

    train_loss /= len(train_loader.dataset)

    return train_loss

if __name__ == "__main__":
    train()

Commenting/uncommenting the “nuisance” Linear module results in a different printout. With “nuisance” commented out:


while with “nuisance” added to the network:


In my opinion, this is highly undesirable. In my original (much more complex) example, this behavior resulted in markedly different values for the optimized loss function.

What is happening here?


This happens because constructing the module initialises its weights and bias, which consumes values from the global random number generator. Any later use of the generator (for example, the `torch.rand` calls in `get_loader`) then produces different results, because the RNG is in a different state than it would have been without the extra module.
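A minimal way to verify this (my own snippet, not from the original post): seed the generator, optionally construct an `nn.Linear`, and compare the next random draw.

```python
import torch
from torch import nn


def next_rand(with_nuisance: bool) -> float:
    # Reset the RNG so both runs start from the same state
    torch.manual_seed(0)
    if with_nuisance:
        # Constructing the layer initialises its weights and bias,
        # consuming random numbers from the global generator
        _ = nn.Linear(10, 10, bias=True)
    # This draw depends on how many numbers were already consumed
    return torch.rand(1).item()


a = next_rand(False)
b = next_rand(True)
print(a, b)  # the two draws differ: nn.Linear advanced the RNG state
```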

Thanks!! Your diagnosis does make a lot of sense. I was working under the assumption that there was some kind of static graph analysis even before things like initialization ever took place. Learn something new every day!
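A follow-up workaround (my suggestion, not stated in the thread): re-seed the generator after model construction, so data generation no longer depends on how many parameters the model happened to initialise.

```python
import torch
from torch import nn

torch.manual_seed(0)
model_a = nn.Linear(1, 1)                   # consumes some RNG draws

torch.manual_seed(0)
model_b = nn.Sequential(nn.Linear(1, 1),
                        nn.Linear(10, 10))  # consumes more RNG draws

# Re-seeding here decouples data generation from model initialisation
torch.manual_seed(42)
x_a = torch.rand(4, 1)
torch.manual_seed(42)
x_b = torch.rand(4, 1)
print(torch.equal(x_a, x_b))  # identical data despite different models
```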