Hi! I want to experiment with a simple transformer model and, as a test, I am trying to make it overfit a single sample. Although I followed the tutorial and most of my code is copy-pasted, the model can't even learn one sample. I looked at similar questions on the forum, but none of them solved the issue. Here is my code:
import math

import torch
import torch.nn as nn


def _create_padding_mask(inp: torch.Tensor, padding_idx: int):
    # True at padded positions, as expected by the *_key_padding_mask arguments.
    return (inp == padding_idx).bool()
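For clarity, the convention here is that True marks a padded position, which is what nn.Transformer expects for its *_key_padding_mask arguments. A quick toy check (padding_idx = 0 is just an example value):

# Toy check: True marks the positions attention should ignore.
batch = torch.tensor([[5, 6, 7, 0, 0],
                      [3, 4, 0, 0, 0]])
print(_create_padding_mask(batch, padding_idx=0))
# tensor([[False, False, False,  True,  True],
#         [False, False,  True,  True,  True]])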
class simpleTransformer(nn.Module):
    def __init__(self, emb_dim, vocab_size_src, vocab_size_tgt, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, pos_dropout, trans_dropout,
                 tgt_padding_idx, streamFormat, max_seq_length=1000):
        super().__init__()
        self.embed_src = nn.Embedding(vocab_size_src, emb_dim)
        self.emb_dim = emb_dim
        self.tgt_mask = None
        self.tgt_padding_idx = tgt_padding_idx
        self.embed_tgt = nn.Embedding(vocab_size_tgt, emb_dim)
        self.pos_enc = PositionalEncoding(emb_dim, pos_dropout, max_seq_length)
        self.transformer = nn.Transformer(emb_dim, nhead, num_encoder_layers,
                                          num_decoder_layers, dim_feedforward, trans_dropout)
        self.fc = nn.Linear(emb_dim, vocab_size_tgt)
    def forward(self, src, tgt):  # src, tgt: (batch, seq) token indices
        # Rebuild the causal mask whenever the target length changes.
        if self.tgt_mask is None or self.tgt_mask.size(0) != tgt.size(1):
            self.tgt_mask = self._create_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        tgt_padding_mask = _create_padding_mask(tgt, self.tgt_padding_idx).to(tgt.device)
        # nn.Transformer expects (seq, batch, emb), hence the transpose after embedding.
        src = self.pos_enc(self.embed_src(src).transpose(0, 1) * math.sqrt(self.emb_dim))
        tgt = self.pos_enc(self.embed_tgt(tgt).transpose(0, 1) * math.sqrt(self.emb_dim))
        # TODO: also pass src_key_padding_mask and memory_key_padding_mask.
        output = self.transformer(src, tgt, tgt_mask=self.tgt_mask,
                                  tgt_key_padding_mask=tgt_padding_mask)
        return self.fc(output)  # (tgt_seq, batch, vocab_size_tgt)
    def _create_square_subsequent_mask(self, dim):
        """Causal target mask: prevents the decoder from attending to future positions."""
        mask = (torch.triu(torch.ones(dim, dim)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
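For reference, for a length-4 target the helper above should produce 0.0 where attention is allowed and -inf where the decoder must not look (self is unused, so it can be called unbound for a quick check):

# Expected causal mask for a length-4 target:
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])
print(simpleTransformer._create_square_subsequent_mask(None, 4))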
print("List of improvements: \n padding mask \n beamsearch")
class PositionalEncoding(nn.Module):
    """Absolute positional encoding. Slightly modified version of a PyTorch example."""
    def __init__(self, emb_dim, dropout=0.1, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, emb_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        # Slice so this also works when emb_dim is odd.
        pe[:, 1::2] = torch.cos(position * div_term)[:, :(pe[:, 1::2]).size(1)]
        # Register as a buffer so the table moves with the module in .to(device).
        self.register_buffer('pe', pe.unsqueeze(0).transpose(0, 1))
    def forward(self, x):
        # x: (seq, batch, emb); add the first x.size(0) rows of the table.
        x = x + self.pe[:x.size(0), :, :]
        return self.dropout(x)
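For completeness, here is a minimal, self-contained sketch of how I drive the model when trying to overfit one pair. The vocabulary sizes, dimensions, special-token indices, and the toy src/tgt sequences below are placeholders, not my real data; the decoder-input / loss-target shift follows the tutorial:

# Minimal single-sample overfitting sketch; all concrete values are placeholders.
PAD, BOS, EOS = 0, 1, 2
model = simpleTransformer(emb_dim=64, vocab_size_src=20, vocab_size_tgt=20,
                          nhead=4, num_encoder_layers=2, num_decoder_layers=2,
                          dim_feedforward=128, pos_dropout=0.0, trans_dropout=0.0,
                          tgt_padding_idx=PAD, streamFormat=None)  # dropout off for the test

src = torch.tensor([[5, 6, 7, 8]])           # (batch=1, src_seq)
tgt = torch.tensor([[BOS, 9, 10, 11, EOS]])  # (batch=1, tgt_seq)

criterion = nn.CrossEntropyLoss(ignore_index=PAD)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for step in range(300):
    optimizer.zero_grad()
    # Teacher forcing: the decoder sees the target without its last token,
    # and the loss is computed against the target shifted left by one.
    logits = model(src, tgt[:, :-1])          # (tgt_seq - 1, batch, vocab)
    loss = criterion(logits.reshape(-1, logits.size(-1)),
                     tgt[:, 1:].transpose(0, 1).reshape(-1))
    loss.backward()
    optimizer.step()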
The model systematically outputs the same token at every position. I have already checked the attention mask, but maybe I wired it up incorrectly? Do you see anything wrong with the model itself?