Here is the skip-gram source code I have been referring to:
import torch
import torch.nn as nn
import torch.nn.functional as F


class skipgram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(skipgram, self).__init__()
        # "center" (u) and "context" (v) embedding tables
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.embedding_dim = embedding_dim
        self.init_emb()

    def init_emb(self):
        initrange = 0.5 / self.embedding_dim
        # center embeddings start small and uniform, context embeddings start at zero
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        self.v_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, u_pos, v_pos, v_neg, batch_size):
        # dot product between each center word and its true context word
        embed_u = self.u_embeddings(u_pos)      # (batch, dim)
        embed_v = self.v_embeddings(v_pos)      # (batch, dim)
        score = torch.mul(embed_u, embed_v)
        score = torch.sum(score, dim=1)         # (batch,)
        log_target = F.logsigmoid(score).squeeze()

        # dot products between each center word and its sampled negative context words
        neg_embed_v = self.v_embeddings(v_neg)  # (batch, num_neg, dim)
        neg_score = torch.bmm(neg_embed_v, embed_u.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim=1)  # negatives summed per center word
        sum_log_sampled = F.logsigmoid(-1 * neg_score).squeeze()

        # per-pair objective: log sigmoid(pos score) + log sigmoid(-neg score)
        loss = log_target + sum_log_sampled
        return -1 * loss.sum() / batch_size
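For context, here is a minimal sketch of how this module might be called; the vocabulary size, dimensions, and number of negatives below are my own illustrative assumptions, not values from the author's repo:

import torch

# hypothetical toy setup: 100-word vocab, 10-dim embeddings, batch of 4, 5 negatives per pair
model = skipgram(vocab_size=100, embedding_dim=10)
u_pos = torch.randint(0, 100, (4,))    # center word indices
v_pos = torch.randint(0, 100, (4,))    # true context word indices
v_neg = torch.randint(0, 100, (4, 5))  # 5 sampled negative context words per center word
loss = model(u_pos, v_pos, v_neg, batch_size=4)
loss.backward()
print(loss.item())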
As we can see, the author has directly taken the log-sigmoid of the positive scores along with the log-sigmoid of the negated negative scores, added them, taken the mean over the batch, and sent that off as the loss…
I could have understood it if there were a cross-entropy or NLLLoss here, but there is nothing of the sort…
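To make the comparison concrete, here is a rough sketch of the kind of explicit loss I would have expected, using binary_cross_entropy_with_logits with label 1 for positive pairs and label 0 for negatives; the toy scores are purely illustrative and the (already summed) negative scores stand in for neg_score from the author's forward():

import torch
import torch.nn.functional as F

score = torch.randn(4)      # positive-pair dot-product scores
neg_score = torch.randn(4)  # summed negative scores, as in the author's forward()

# the author's formulation: -[log sigmoid(score) + log sigmoid(-neg_score)], averaged over the batch
loss_author = -(F.logsigmoid(score) + F.logsigmoid(-neg_score)).sum() / 4

# what I expected: binary cross-entropy with logits, target 1 for positives, 0 for negatives
loss_bce = (F.binary_cross_entropy_with_logits(score, torch.ones(4), reduction='sum')
            + F.binary_cross_entropy_with_logits(neg_score, torch.zeros(4), reduction='sum')) / 4

print(loss_author.item(), loss_bce.item())  # printing both to compare side by side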
Does anyone have an intuitive explanation of why this works (because it does)?
Thank you.