I have a deep autoencoder network, and I notice that when I use nn.Sequential, the generalization performance is better than when I don't use it (i.e. when I explicitly pass the inputs through the layers). Has anyone else noticed this behavior, or can anyone explain why this is? Does PyTorch handle regularization differently inside a sequential block?
There shouldn't be a difference if you just reimplemented the sequential model in your custom one.
Did you run the code a few times and check for small variations in the loss/accuracy?
Would it be possible to post the code for both models so that we can have a look?
Thanks for the response. Here is a code snippet; I've attached the full code at the end for reference. I toggle between using nn.Sequential and not using it with the variable use_sequential. I find that when I don't use the sequential module, my test accuracy is always worse than when I do use it.
class Net(nn.Module):
    def __init__(self, hidden_dim, in_dim, use_sequential):
        super(Net, self).__init__()
        self.use_sequential = use_sequential
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.sig = nn.Sigmoid()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, in_dim)
        )

    def encode(self, x):
        if self.use_sequential:
            x = self.encoder(x)
        else:
            x = self.lin1(x)
            x = self.batchnorm(x)
            x = self.lin2(x)
        return x

    def decode(self, x):
        if self.use_sequential:
            x = self.decoder(x)
        else:
            x = self.lin3(x)
            x = self.batchnorm(x)
            x = self.lin4(x)
        return x

    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        x = self.sig(x)  # Sigmoid for BCELoss
        return x
Here's the output of the script (pasted in full at the very end for reference). As you can see, the non-sequential model's test loss/accuracy get worse as the models train, even though the train loss/accuracy of both models remain similar (I initialize all weights myself; see the full code at the end):
SEQUENTIAL TRAIN, Epoch 0: loss=0.7185, acc=0.51
NONSEQUENTIAL TRAIN, Epoch 0: loss=0.7185, acc=0.51
---> SEQUENTIAL TEST: Epoch 0: loss=0.7240, acc=0.50080
---> NONSEQUENTIAL TEST: Epoch 0: loss=0.7240, acc=0.50080
SEQUENTIAL TRAIN, Epoch 1: loss=0.7192, acc=0.49
NONSEQUENTIAL TRAIN, Epoch 1: loss=0.7192, acc=0.49
---> SEQUENTIAL TEST: Epoch 1: loss=0.7226, acc=0.49920
---> NONSEQUENTIAL TEST: Epoch 1: loss=0.7221, acc=0.49920
SEQUENTIAL TRAIN, Epoch 2: loss=0.7207, acc=0.50
NONSEQUENTIAL TRAIN, Epoch 2: loss=0.7208, acc=0.50
---> SEQUENTIAL TEST: Epoch 2: loss=0.7183, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 2: loss=0.7186, acc=0.49920
SEQUENTIAL TRAIN, Epoch 3: loss=0.7032, acc=0.54
NONSEQUENTIAL TRAIN, Epoch 3: loss=0.7033, acc=0.54
---> SEQUENTIAL TEST: Epoch 3: loss=0.7104, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 3: loss=0.7153, acc=0.49920
SEQUENTIAL TRAIN, Epoch 4: loss=0.7002, acc=0.56
NONSEQUENTIAL TRAIN, Epoch 4: loss=0.7002, acc=0.56
---> SEQUENTIAL TEST: Epoch 4: loss=0.7006, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 4: loss=0.7119, acc=0.49920
SEQUENTIAL TRAIN, Epoch 5: loss=0.6906, acc=0.55
NONSEQUENTIAL TRAIN, Epoch 5: loss=0.6907, acc=0.55
---> SEQUENTIAL TEST: Epoch 5: loss=0.6903, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 5: loss=0.7088, acc=0.49920
SEQUENTIAL TRAIN, Epoch 6: loss=0.6807, acc=0.54
NONSEQUENTIAL TRAIN, Epoch 6: loss=0.6811, acc=0.54
---> SEQUENTIAL TEST: Epoch 6: loss=0.6815, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 6: loss=0.7058, acc=0.49920
SEQUENTIAL TRAIN, Epoch 7: loss=0.6698, acc=0.52
NONSEQUENTIAL TRAIN, Epoch 7: loss=0.6702, acc=0.52
---> SEQUENTIAL TEST: Epoch 7: loss=0.6729, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 7: loss=0.7033, acc=0.49920
SEQUENTIAL TRAIN, Epoch 8: loss=0.6710, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 8: loss=0.6722, acc=0.60
---> SEQUENTIAL TEST: Epoch 8: loss=0.6643, acc=0.56120
---> NONSEQUENTIAL TEST: Epoch 8: loss=0.7014, acc=0.49920
SEQUENTIAL TRAIN, Epoch 9: loss=0.6642, acc=0.71
NONSEQUENTIAL TRAIN, Epoch 9: loss=0.6659, acc=0.65
---> SEQUENTIAL TEST: Epoch 9: loss=0.6612, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 9: loss=0.6999, acc=0.49920
SEQUENTIAL TRAIN, Epoch 10: loss=0.6593, acc=0.68
NONSEQUENTIAL TRAIN, Epoch 10: loss=0.6613, acc=0.68
---> SEQUENTIAL TEST: Epoch 10: loss=0.6570, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 10: loss=0.6988, acc=0.49920
SEQUENTIAL TRAIN, Epoch 11: loss=0.6522, acc=0.68
NONSEQUENTIAL TRAIN, Epoch 11: loss=0.6541, acc=0.68
---> SEQUENTIAL TEST: Epoch 11: loss=0.6540, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 11: loss=0.6978, acc=0.49920
SEQUENTIAL TRAIN, Epoch 12: loss=0.6651, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 12: loss=0.6679, acc=0.67
---> SEQUENTIAL TEST: Epoch 12: loss=0.6511, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 12: loss=0.6971, acc=0.49920
SEQUENTIAL TRAIN, Epoch 13: loss=0.6617, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 13: loss=0.6640, acc=0.67
---> SEQUENTIAL TEST: Epoch 13: loss=0.6494, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 13: loss=0.6964, acc=0.49920
SEQUENTIAL TRAIN, Epoch 14: loss=0.6506, acc=0.67
NONSEQUENTIAL TRAIN, Epoch 14: loss=0.6527, acc=0.67
---> SEQUENTIAL TEST: Epoch 14: loss=0.6470, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 14: loss=0.6961, acc=0.49920
SEQUENTIAL TRAIN, Epoch 15: loss=0.6479, acc=0.69
NONSEQUENTIAL TRAIN, Epoch 15: loss=0.6500, acc=0.69
---> SEQUENTIAL TEST: Epoch 15: loss=0.6453, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 15: loss=0.6954, acc=0.49920
SEQUENTIAL TRAIN, Epoch 16: loss=0.6441, acc=0.70
NONSEQUENTIAL TRAIN, Epoch 16: loss=0.6461, acc=0.70
---> SEQUENTIAL TEST: Epoch 16: loss=0.6445, acc=0.68860
---> NONSEQUENTIAL TEST: Epoch 16: loss=0.6950, acc=0.49920
...
I've found that the issue lies in the BatchNorm1d layer, because when I take it out of the model the issue disappears. Does BatchNorm1d behave differently inside a sequential block? Or have I made a mistake I'm overlooking? Thank you in advance for any help!
Here’s the full code:
import torch
import torch.nn as nn
from torch.utils import data
from torch.optim import Adam
from tqdm import tqdm
class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, data, labels):
        self.labels = labels
        self.data = data

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = self.data[index]
        y = self.labels[index]
        return X, y
class Net(nn.Module):
    def __init__(self, hidden_dim, in_dim, use_sequential):
        super(Net, self).__init__()
        self.use_sequential = use_sequential
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.sig = nn.Sigmoid()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, in_dim),
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, in_dim)
        )
        self.lin1 = nn.Linear(in_dim, in_dim)
        self.lin1.weight.data.fill_(0.01)
        self.lin1.bias.data.fill_(0.01)
        self.batchnorm = nn.BatchNorm1d(in_dim)
        self.batchnorm.weight.data.fill_(0.01)
        self.batchnorm.bias.data.fill_(0.01)
        self.lin2 = nn.Linear(in_dim, hidden_dim)
        self.lin2.weight.data.fill_(0.01)
        self.lin2.bias.data.fill_(0.01)
        self.lin3 = nn.Linear(hidden_dim, in_dim)
        self.lin3.weight.data.fill_(0.01)
        self.lin3.bias.data.fill_(0.01)
        self.lin4 = nn.Linear(in_dim, in_dim)
        self.lin4.weight.data.fill_(0.01)
        self.lin4.bias.data.fill_(0.01)

    def encode(self, x):
        if self.use_sequential:
            x = self.encoder(x)
        else:
            x = self.lin1(x)
            x = self.batchnorm(x)
            x = self.lin2(x)
        return x

    def decode(self, x):
        if self.use_sequential:
            x = self.decoder(x)
        else:
            x = self.lin3(x)
            x = self.batchnorm(x)
            x = self.lin4(x)
        return x

    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        x = self.sig(x)  # Sigmoid for BCELoss
        return x
def accuracy(preds, labels):
    acc2 = 1 - torch.sum(torch.abs(preds - labels)).item() / (list(preds.size())[0] * list(preds.size())[1])
    return acc2

def generate_data(block_size):
    train_data = torch.randint(2, (10000, block_size)).float()
    test_data = torch.randint(2, (2500, block_size)).float()
    train_labels = train_data
    test_labels = test_data
    return train_data, train_labels, test_data, test_labels

def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.BatchNorm1d:
        m.weight.data.fill_(0.01)
        m.bias.data.fill_(0.01)
    if type(m) == nn.PReLU:
        m.weight.data.fill_(0.01)
########################## Train code ####################
IN_DIM = 4
HIDDEN_DIM = 32
EPOCHS = 200
BATCH_SIZE = 256
# Generate data
train_data, train_labels, test_data, test_labels = generate_data(IN_DIM)
# Data loading
params = {'batch_size': BATCH_SIZE,
          'shuffle': True,
          'num_workers': 8}
training_set = Dataset(train_data, train_labels)
training_loader = torch.utils.data.DataLoader(training_set, **params)
# Sequential and non-sequential models
model_seq = Net(hidden_dim=HIDDEN_DIM, in_dim=IN_DIM, use_sequential=True)
model_non = Net(hidden_dim=HIDDEN_DIM, in_dim=IN_DIM, use_sequential=False)
model_seq.apply(init_weights)
model_non.apply(init_weights)
loss_fn = nn.BCEWithLogitsLoss()
optimizer_seq = Adam(model_seq.parameters(), lr=0.001)
optimizer_non = Adam(model_non.parameters(), lr=0.001)
# Training
for epoch in range(EPOCHS):
    model_seq.train()
    model_non.train()
    for batch_idx, (batch, labels) in enumerate(training_loader):
        # Testing sequential model
        output_seq = model_seq(batch)
        loss_seq = loss_fn(output_seq, labels)
        optimizer_seq.zero_grad()
        loss_seq.backward()
        optimizer_seq.step()

        # Testing non-sequential model
        output_non = model_non(batch)
        loss_non = loss_fn(output_non, labels)
        optimizer_non.zero_grad()
        loss_non.backward()
        optimizer_non.step()

        if batch_idx % (BATCH_SIZE - 1) == 0:
            pred_seq = torch.round(output_seq)
            acc_seq = accuracy(pred_seq, labels)
            print('SEQUENTIAL TRAIN, Epoch %2d: loss=%.4f, acc=%.2f' % (epoch, loss_seq.item(), acc_seq))
            pred_non = torch.round(output_non)
            acc_non = accuracy(pred_non, labels)
            print('NONSEQUENTIAL TRAIN, Epoch %2d: loss=%.4f, acc=%.2f' % (epoch, loss_non.item(), acc_non))

    # Sequential Validation
    model_seq.eval()
    val_output_seq = model_seq(test_data)
    val_loss_seq = loss_fn(val_output_seq, test_labels)
    val_pred_seq = torch.round(val_output_seq)
    val_acc_seq = accuracy(val_pred_seq, test_labels)
    print('---> SEQUENTIAL TEST: Epoch %2d: loss=%.4f, acc=%.5f' % (epoch, val_loss_seq.item(), val_acc_seq))
    model_seq.train()

    # Nonsequential Validation
    model_non.eval()
    val_output_non = model_non(test_data)
    val_loss_non = loss_fn(val_output_non, test_labels)
    val_pred_non = torch.round(val_output_non)
    val_acc_non = accuracy(val_pred_non, test_labels)
    print('---> NONSEQUENTIAL TEST: Epoch %2d: loss=%.4f, acc=%.5f' % (epoch, val_loss_non.item(), val_acc_non))
    model_non.train()

    print('\n')
Thanks for the code.
The difference is indeed in the usage of the batch norm layers.
While you initialize two different nn.BatchNorm1d layers in your sequential approach, you reuse the same one in your manual approach.
Could you create two separate batch norm layers and run the test again?
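This would also explain why only the test metrics diverge: in training mode batch norm normalizes with the current batch statistics, so both models train similarly, but the one shared nn.BatchNorm1d accumulates its running mean/variance from both the encoder and the decoder activations (and shares a single set of affine parameters between the two call sites). In eval mode those blended running estimates fit neither location well. Here is a minimal sketch of the manual approach with two separate layers (illustrative only, not your exact code; the names bn_enc and bn_dec are made up here):
import torch.nn as nn

class NetSeparateBN(nn.Module):
    def __init__(self, hidden_dim, in_dim):
        super(NetSeparateBN, self).__init__()
        self.lin1 = nn.Linear(in_dim, in_dim)
        self.bn_enc = nn.BatchNorm1d(in_dim)   # used only in encode()
        self.lin2 = nn.Linear(in_dim, hidden_dim)
        self.lin3 = nn.Linear(hidden_dim, in_dim)
        self.bn_dec = nn.BatchNorm1d(in_dim)   # used only in decode()
        self.lin4 = nn.Linear(in_dim, in_dim)
        self.sig = nn.Sigmoid()

    def encode(self, x):
        # encoder path keeps its own running statistics and affine params
        return self.lin2(self.bn_enc(self.lin1(x)))

    def decode(self, x):
        # decoder path keeps its own running statistics and affine params
        return self.lin4(self.bn_dec(self.lin3(x)))

    def forward(self, x):
        return self.sig(self.decode(self.encode(x)))
With separate layers the manual path matches the sequential one, which already constructs its own nn.BatchNorm1d inside each nn.Sequential.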
Ah thank you! That fixed it for me, thanks again for the help!