I have created a very simple transformer model using PyTorch, but the loss does not decrease during training as I would expect. To narrow down the cause, I fed a single example to the transformer over and over again, expecting it to overfit quickly. Instead, the loss drops only slightly and then stops decreasing altogether.
Here are my model and training code. As you can see, it is basically a raw transformer with linear layers acting as embedding layers.
import math

import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Precompute sinusoidal encodings for up to max_len positions
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x has shape (S, N, d_model); add the encoding for the first S positions
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
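(For reference, this should be the standard sinusoidal encoding from "Attention Is All You Need", i.e. PE(pos, 2i) = sin(pos / 10000^(2i / d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model)); after the unsqueeze/transpose the buffer has shape (max_len, 1, d_model), matching the sequence-first (S, N, E) layout, so I don't believe the encoding itself is the problem.)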
class TransformerModel(nn.Module):
    def __init__(self, nhead=8, dim_feedforward=1024, num_layers=6, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.d_model = 512
        self.input_embedding1 = nn.Linear(objprocessor.MAX_INSTR_SIZE, self.d_model)
        self.output_embedding1 = nn.Linear(sourcenode.NODE_ID_END, self.d_model)
        self.pos_encoder = PositionalEncoding(self.d_model, dropout)
        self.transformer = nn.Transformer(d_model=self.d_model, nhead=nhead,
                                          dim_feedforward=dim_feedforward,
                                          num_encoder_layers=num_layers,
                                          num_decoder_layers=num_layers,
                                          dropout=dropout)
        self.output_linear = nn.Linear(self.d_model, sourcenode.NODE_ID_END)
        self.soft_max = nn.Softmax(dim=2)

    def forward(self, src, tgt):
        # src is a tensor of shape (S, N, objprocessor.MAX_INSTR_SIZE), where N = batch size,
        # S = sequence length of input, objprocessor.MAX_INSTR_SIZE = channel size
        # tgt is a tensor of shape (T, N, sourcenode.NODE_ID_END), where N = batch size,
        # T = sequence length of output, sourcenode.NODE_ID_END = channel size
        tgt_len = tgt.shape[0]
        #tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_len).cuda()
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_len)
        src = self.input_embedding1(src)
        src = self.pos_encoder(src)
        tgt = self.output_embedding1(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        output = self.output_linear(output)
        output = self.soft_max(output)
        return output
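In case the wiring itself is the issue, here is a quick shape sanity check I can run with random inputs (S, T, and N below are arbitrary dummy sizes; the channel sizes come from my own objprocessor and sourcenode modules):

# Feed random tensors through the model and confirm the output has shape
# (T, N, sourcenode.NODE_ID_END) with each position's distribution summing
# to ~1 because of the final softmax.
S, T, N = 10, 12, 4
dummy_src = torch.rand(S, N, objprocessor.MAX_INSTR_SIZE)
dummy_tgt = torch.rand(T, N, sourcenode.NODE_ID_END)
dummy_out = TransformerModel(dropout=0.0)(dummy_src, dummy_tgt)
print(dummy_out.shape)             # torch.Size([12, 4, sourcenode.NODE_ID_END])
print(dummy_out.sum(dim=2)[0, 0])  # ~1.0, from the softmax over dim=2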
model = TransformerModel(dropout=0.0)
#model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
model.train()

for (src, tgt) in batch_iterator(training_files):
    # Feed the same example to the transformer 1000 times to test whether it learns anything
    for _ in range(1000):
        #src = src.cuda()
        #tgt = tgt.cuda()
        optimizer.zero_grad()
        output = model(src, tgt)
        # tgt channels are one-hot, so argmax recovers the class indices
        # that CrossEntropyLoss expects as targets
        loss = criterion(output.view(-1, sourcenode.NODE_ID_END),
                         torch.argmax(tgt.view(-1, sourcenode.NODE_ID_END), dim=1))
        print("Loss", loss)
        loss.backward()
        optimizer.step()
Output:
Loss tensor(6.2542, grad_fn=<NllLossBackward>)
Loss tensor(6.1925, grad_fn=<NllLossBackward>)
Loss tensor(6.1591, grad_fn=<NllLossBackward>)
Loss tensor(6.1591, grad_fn=<NllLossBackward>)
Loss tensor(6.1591, grad_fn=<NllLossBackward>)
Loss tensor(6.1590, grad_fn=<NllLossBackward>)
Loss tensor(6.1583, grad_fn=<NllLossBackward>)
Loss tensor(6.1583, grad_fn=<NllLossBackward>)
Loss tensor(6.1589, grad_fn=<NllLossBackward>)
Loss tensor(6.1583, grad_fn=<NllLossBackward>)
Loss tensor(6.1583, grad_fn=<NllLossBackward>)
Loss tensor(6.1584, grad_fn=<NllLossBackward>)
Loss tensor(6.1583, grad_fn=<NllLossBackward>)
(every remaining line repeats 6.1583)
When training on actual data the model shows the same behavior: the loss does not decrease, even when the transformer is presented with many different examples.
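One additional check I plan to try is printing the total gradient norm right after loss.backward(), to see whether the plateau is caused by vanishing gradients (a minimal sketch; model and the loop are the ones from the training code above):

# Total L2 norm of all parameter gradients; a value near zero once the loss
# plateaus would point to vanishing gradients rather than a data problem.
total_norm = 0.0
for p in model.parameters():
    if p.grad is not None:
        total_norm += p.grad.detach().norm(2).item() ** 2
total_norm = total_norm ** 0.5
print("Gradient norm:", total_norm)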