Hi!
I’m trying to implement GloVe in PyTorch; here is the GitHub link. But when I evaluate the trained embeddings, they perform far worse than this GloVe library.
Here are the main components of my code:
The model:
import torch
from torch import nn

class Glove(nn.Module):
    def __init__(self, vocab_size, embedding_size, x_max=100, alpha=0.75):
        super().__init__()
        # Focal (center) word vectors
        self.weight = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
            sparse=True,
        )
        # Context word vectors
        self.weight_tilde = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
            sparse=True,
        )
        # One bias per word for each role (focal / context)
        self.bias = nn.Parameter(torch.randn(vocab_size, dtype=torch.float))
        self.bias_tilde = nn.Parameter(torch.randn(vocab_size, dtype=torch.float))
        # GloVe weighting f(x) = min((x / x_max)^alpha, 1); the clamp implements the min.
        # .pow keeps everything in float32 (.float_power would upcast to float64).
        self.weighting_func = lambda x: (x / x_max).pow(alpha).clamp(0, 1)

    def forward(self, i, j, x):
        # f(x_ij) * (w_i . w~_j + b_i + b~_j - log x_ij)^2, averaged over the batch
        dot = torch.mul(self.weight(i), self.weight_tilde(j)).sum(dim=1)
        loss = (dot + self.bias[i] + self.bias_tilde[j] - x.log()).square()
        return torch.mul(self.weighting_func(x), loss).mean()
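One thing that may matter on the evaluation side: the GloVe paper reports results using the sum of the two embedding tables (W + W̃) as the final word vectors, which typically scores better on similarity benchmarks than either table alone. A minimal sketch of how I extract them (the helper name is my own):

def final_embeddings(model):
    # Sum the focal and context tables, as in the original GloVe paper
    with torch.no_grad():
        return (model.weight.weight + model.weight_tilde.weight).cpu().numpy()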
How the co-occurrence matrix is built:
from scipy.sparse import lil_matrix
from tqdm import tqdm

def generate_cooccurrence_matrix(corpus, word2idx, window_size):
    vocab_size = len(word2idx)
    co_occ_mat = lil_matrix((vocab_size, vocab_size))
    for region in tqdm(corpus):
        tokens = [word2idx[word] for word in region]
        for left_context, word, right_context in _context_windows(tokens, window_size, window_size):
            # Each co-occurring pair contributes 1 / (distance from the focal word)
            for i, context_word in enumerate(left_context[::-1]):
                co_occ_mat[word, context_word] += 1 / (i + 1)
            for i, context_word in enumerate(right_context):
                co_occ_mat[word, context_word] += 1 / (i + 1)
    return co_occ_mat
def _context_windows(region, left_size, right_size):
    """Generate (left_context, word, right_context) tuples for each position in region.
    Args:
        region (list): a sentence as a list of token ids
        left_size (int): left window size
        right_size (int): right window size
    """
    for i, word in enumerate(region):
        start_index = i - left_size
        end_index = i + right_size
        left_context = _window(region, start_index, i - 1)
        right_context = _window(region, i + 1, end_index)
        yield (left_context, word, right_context)

def _window(region, start_index, end_index):
    """Return the tokens of region from `start_index` to `end_index` (inclusive).
    The slice is truncated at the region boundaries; no padding is applied.
    Args:
        region (list): the sentence to extract context tokens from
        start_index (int): index of the first token in the window
        end_index (int): index of the last token in the window
    """
    return region[max(start_index, 0):end_index + 1]
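For completeness, the (i, j, x) triplets fed to the trainer below are extracted from the sparse matrix along these lines (a sketch; the keys of the data dict are my own convention):

def matrix_to_triplets(co_occ_mat, word2idx):
    # COO format exposes the non-zero entries as parallel row/col/value
    # arrays: focal index i, context index j and co-occurrence value x_ij
    coo = co_occ_mat.tocoo()
    return {
        'i': coo.row.astype('int64'),
        'j': coo.col.astype('int64'),
        'w': coo.data.astype('float32'),
        'word2idx': word2idx,
    }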
And the trainer:
import torch
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

trainset = TensorDataset(
    torch.tensor(data['i']),                     # focal word indices
    torch.tensor(data['j']),                     # context word indices
    torch.tensor(data['w'], dtype=torch.float),  # co-occurrence values x_ij
)
word2idx = data['word2idx']

device = torch.device('cuda')
dataloader = DataLoader(trainset, batch_size=1024, shuffle=True)

model = Glove(len(word2idx), 100)
model.to(device)
# Adagrad with lr=0.05 matches the original GloVe paper and supports sparse gradients
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.05)

epochs = 3
losses = []
for _ in range(epochs):
    for batch in tqdm(dataloader):
        i = batch[0].to(device)
        j = batch[1].to(device)
        x = batch[2].to(device)
        optimizer.zero_grad()
        loss = model(i, j, x)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), './model/glove_model.pt')
If anyone notices anything strange or theoretically incorrect, please let me know, because I don’t understand why the performance is so low.
I evaluate the model on the MEN similarity dataset using the word-embeddings-benchmarks library.
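Concretely, the evaluation looks roughly like this (a sketch assuming the word-embeddings-benchmarks API and the final_embeddings helper sketched above):

from web.datasets.similarity import fetch_MEN
from web.embedding import Embedding
from web.evaluate import evaluate_similarity

vectors = final_embeddings(model)  # shape: (vocab_size, embedding_size)
idx2word = {idx: word for word, idx in word2idx.items()}
emb = Embedding.from_dict({idx2word[k]: vectors[k] for k in idx2word})

men = fetch_MEN()
# Spearman correlation between embedding cosine similarities and human judgements
print(evaluate_similarity(emb, men.X, men.y))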