Hi!
I’m trying to implement GloVe in PyTorch; here is the GitHub link. But when I evaluate the trained embeddings, they perform far worse than this GloVe library.
Here are the main components of my code:
The model:
import torch
from torch import nn

class Glove(nn.Module):
    def __init__(self, vocab_size, embedding_size, x_max=100, alpha=0.75):
        super().__init__()
        # Focal (center) word vectors
        self.weight = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
            sparse=True,
        )
        # Context word vectors
        self.weight_tilde = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_size,
            sparse=True,
        )
        # One bias per word for each role (focal / context)
        self.bias = nn.Parameter(torch.randn(vocab_size, dtype=torch.float))
        self.bias_tilde = nn.Parameter(torch.randn(vocab_size, dtype=torch.float))
        # GloVe weighting f(x) = min((x / x_max)^alpha, 1); the clamp implements the min.
        # .pow keeps everything in float32 (.float_power would upcast to float64).
        self.weighting_func = lambda x: (x / x_max).pow(alpha).clamp(0, 1)

    def forward(self, i, j, x):
        # f(x_ij) * (w_i . w~_j + b_i + b~_j - log x_ij)^2, averaged over the batch
        dot = torch.mul(self.weight(i), self.weight_tilde(j)).sum(dim=1)
        loss = (dot + self.bias[i] + self.bias_tilde[j] - x.log()).square()
        return torch.mul(self.weighting_func(x), loss).mean()
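One thing that may matter on the evaluation side: the GloVe paper reports results using the sum of the two embedding tables (W + W̃) as the final word vectors, which typically scores better on similarity benchmarks than either table alone. A minimal sketch of how I extract them (the helper name is my own):

def final_embeddings(model):
    # Sum the focal and context tables, as in the original GloVe paper
    with torch.no_grad():
        return (model.weight.weight + model.weight_tilde.weight).cpu().numpy()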
How the co-occurrence matrix is built:
from scipy.sparse import lil_matrix
from tqdm import tqdm

def generate_cooccurrence_matrix(corpus, word2idx, window_size):
    vocab_size = len(word2idx)
    co_occ_mat = lil_matrix((vocab_size, vocab_size))
    for region in tqdm(corpus):
        tokens = [word2idx[word] for word in region]
        for left_context, word, right_context in _context_windows(tokens, window_size, window_size):
            # Each co-occurring pair contributes 1 / (distance from the focal word)
            for i, context_word in enumerate(left_context[::-1]):
                co_occ_mat[word, context_word] += 1 / (i + 1)
            for i, context_word in enumerate(right_context):
                co_occ_mat[word, context_word] += 1 / (i + 1)
    return co_occ_mat
def _context_windows(region, left_size, right_size):
    """Generate (left_context, word, right_context) tuples for each position in region.
    Args:
        region (list): a sentence as a list of token ids
        left_size (int): left window size
        right_size (int): right window size
    """
    for i, word in enumerate(region):
        start_index = i - left_size
        end_index = i + right_size
        left_context = _window(region, start_index, i - 1)
        right_context = _window(region, i + 1, end_index)
        yield (left_context, word, right_context)

def _window(region, start_index, end_index):
    """Return the tokens of region from `start_index` to `end_index` (inclusive).
    The slice is truncated at the region boundaries; no padding is applied.
    Args:
        region (list): the sentence to extract context tokens from
        start_index (int): index of the first token in the window
        end_index (int): index of the last token in the window
    """
    return region[max(start_index, 0):end_index + 1]
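For completeness, the (i, j, x) triplets fed to the trainer below are extracted from the sparse matrix along these lines (a sketch; the keys of the data dict are my own convention):

def matrix_to_triplets(co_occ_mat, word2idx):
    # COO format exposes the non-zero entries as parallel row/col/value
    # arrays: focal index i, context index j and co-occurrence value x_ij
    coo = co_occ_mat.tocoo()
    return {
        'i': coo.row.astype('int64'),
        'j': coo.col.astype('int64'),
        'w': coo.data.astype('float32'),
        'word2idx': word2idx,
    }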
And the trainer:
import torch
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

trainset = TensorDataset(
    torch.tensor(data['i']),                     # focal word indices
    torch.tensor(data['j']),                     # context word indices
    torch.tensor(data['w'], dtype=torch.float),  # co-occurrence values x_ij
)
word2idx = data['word2idx']

device = torch.device('cuda')
dataloader = DataLoader(trainset, batch_size=1024, shuffle=True)

model = Glove(len(word2idx), 100)
model.to(device)
# Adagrad with lr=0.05 matches the original GloVe paper and supports sparse gradients
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.05)

epochs = 3
losses = []
for _ in range(epochs):
    for batch in tqdm(dataloader):
        i = batch[0].to(device)
        j = batch[1].to(device)
        x = batch[2].to(device)
        optimizer.zero_grad()
        loss = model(i, j, x)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), './model/glove_model.pt')
If anyone notices anything strange or theoretically incorrect, please let me know, because I don’t understand why the performance is so low.
I evaluate the model on the MEN similarity dataset using the word-embeddings-benchmarks library.
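Concretely, the evaluation looks roughly like this (a sketch assuming the word-embeddings-benchmarks API and the final_embeddings helper sketched above):

from web.datasets.similarity import fetch_MEN
from web.embedding import Embedding
from web.evaluate import evaluate_similarity

vectors = final_embeddings(model)  # shape: (vocab_size, embedding_size)
idx2word = {idx: word for word, idx in word2idx.items()}
emb = Embedding.from_dict({idx2word[k]: vectors[k] for k in idx2word})

men = fetch_MEN()
# Spearman correlation between embedding cosine similarities and human judgements
print(evaluate_similarity(emb, men.X, men.y))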