Whatever input I give, it predicts the same value. Example:
Input:
[90, 91, 26, 62, 92, 93, 26, 94, 95, 96]
incumbering soil and washed into immediate and glittering popularity possibly
Masked Input:
[90, 91, 26, 62, 92, 93, 26, 1, 95, 96]
incumbering soil and washed into immediate and unnk popularity possibly
Output:
[90, 91, 26, 62, 92, 93, 26, 33, 95, 96]
incumbering soil and washed into immediate and the popularity possibly
As you can see, it always predicts the "the" token.
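For reference, this is roughly how I read off the prediction at the masked position (a minimal sketch: `model` is a trained instance of the Kemal class below, `1` is my mask token id, and the decoded strings above come from my own id-to-token lookup):

import torch

tokens = [90, 91, 26, 62, 92, 93, 26, 94, 95, 96]
masked = list(tokens)
masked[7] = 1  # mask out "glittering" (id 94) with the mask token id

src = torch.tensor(masked).unsqueeze(1)  # (src_len, N=1)
model.eval()
with torch.no_grad():
    logits = model(src)  # (src_len, 1, vocab_size)
pred_id = logits[7, 0].argmax().item()
print(pred_id)  # comes out as 33 ("the") no matter what the input is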
Model:
import torch
import torch.nn as nn


class Kemal(nn.Module):
    def __init__(self, src_vocab_size, embedding_size, num_heads, dim_forward,
                 num_encoder_layers, max_len, src_pad_idx, dropout, device):
        super(Kemal, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.encoder_norm = nn.LayerNorm(embedding_size)
        self.encoder_layer = nn.TransformerEncoderLayer(
            embedding_size, num_heads, dim_feedforward=dim_forward,
            dropout=dropout, activation='gelu')
        self.encoder = nn.TransformerEncoder(
            self.encoder_layer, num_encoder_layers, self.encoder_norm)
        self.fc = nn.Linear(embedding_size, src_vocab_size)
        self.src_pad_idx = src_pad_idx

    def make_src_pad_mask(self, src):
        # src: (src_len, N) -> mask: (N, src_len), True at padding positions
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask

    def forward(self, src):
        src_seq_length, N = src.shape
        # Causal (look-ahead) mask; a static method in recent PyTorch versions
        src_mask = nn.Transformer.generate_square_subsequent_mask(
            src_seq_length).to(self.device)
        # Position ids of shape (src_len, 1); broadcasts over the batch dim
        src_positions = torch.arange(0, src_seq_length).unsqueeze(1).to(self.device)
        embed_src = (self.src_word_embedding(src)
                     + self.src_position_embedding(src_positions))
        src_padding_mask = self.make_src_pad_mask(src)
        out = self.encoder(embed_src, mask=src_mask,
                           src_key_padding_mask=src_padding_mask)
        out = self.fc(out)  # (src_len, N, src_vocab_size)
        return out
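For completeness, this is how I run a forward pass (the hyperparameters here are placeholders rather than my real config, just to show the shapes):

device = torch.device('cpu')
model = Kemal(src_vocab_size=10000, embedding_size=256, num_heads=8,
              dim_forward=1024, num_encoder_layers=3, max_len=512,
              src_pad_idx=0, dropout=0.1, device=device).to(device)
src = torch.randint(2, 10000, (10, 4))  # (src_len=10, N=4); avoids pad/mask ids
out = model(src)
print(out.shape)  # torch.Size([10, 4, 10000])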
Thanks in advance!