hey guys! if anyone knows a little about word2vec, I've got a few questions. I'm trying to build an NLP classifier from scratch, similar to the word2vec framework, for my custom dataset, which is a text file. I followed the PyTorch N-Gram Language Modeling tutorial (in Google Colab) and just replaced the dataset with my own. My question is: how do I get my model to predict which sentence is the original and which is the corrupted one? The corrupted sentence is identical to the original except for one changed word or a shift in punctuation. A few example lines from the dataset are below (the | characters represent tabs):
"
When the enemity of Ceheemban threatens to create a rift , father and son solve it and the family is happily united .|it the enemity of Ceheemban threatens to create a rift , father and son solve When and the family is happily united .|
|If you take a guide or interpreter with you, it is customary for them to receive a percentage on every purchase you make.|If you take a guide or where with you, it is customary for them to receive a percentage on every purchase you make.|
"
# Create a language model using n-grams with a neural network
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
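
# The training code below assumes CONTEXT_SIZE, EMBEDDING_DIM, vocab, word_to_ix,
# and ngrams are already defined; here is a minimal sketch of that setup from the
# tutorial, assuming the whole text file is split on whitespace
# ("dataset.txt" is a placeholder path)
CONTEXT_SIZE = 2   # number of preceding words used as context
EMBEDDING_DIM = 10
tokens = open("dataset.txt", encoding="utf-8").read().split()
vocab = set(tokens)
word_to_ix = {word: i for i, word in enumerate(vocab)}
# Build (context, target) n-gram pairs, context in reverse order as in the tutorial
ngrams = [
    ([tokens[i - j - 1] for j in range(CONTEXT_SIZE)], tokens[i])
    for i in range(CONTEXT_SIZE, len(tokens))
]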
# Define the model class
class NGramLanguageModeler(nn.Module):
    # Initialize the model
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        # Create an embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # First linear layer for transforming inputs
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        # Second linear layer for predictions
        self.linear2 = nn.Linear(128, vocab_size)

    # Define the forward pass
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
# Set up training parameters
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    total_loss = 0
    for context, target in ngrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
# Print the losses after training
print("Losses:")
print(losses)
# Example: get the embedding of the word "beauty"
# (note: this only works if "beauty" actually appears in the corpus;
# otherwise word_to_ix raises a KeyError)
print("Embedding for 'beauty':")
print(model.embeddings.weight[word_to_ix["beauty"]])
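What I'm currently thinking (and please correct me if this is the wrong approach): since this is a language model, I could score both sentences of a pair by summing the log-probabilities it assigns to each word given its context, and guess that the higher-scoring sentence is the original. A rough sketch, reusing model, word_to_ix, and CONTEXT_SIZE from above and skipping out-of-vocabulary words:

import torch

def sentence_log_prob(words):
    # Sum log P(word | previous CONTEXT_SIZE words) over the sentence,
    # skipping positions where the target or context is out of vocabulary
    total = 0.0
    with torch.no_grad():
        for i in range(CONTEXT_SIZE, len(words)):
            context = [words[i - j - 1] for j in range(CONTEXT_SIZE)]
            if words[i] not in word_to_ix or any(w not in word_to_ix for w in context):
                continue
            context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
            log_probs = model(context_idxs)
            total += log_probs[0, word_to_ix[words[i]]].item()
    return total

# Example usage on one pair: the higher-scoring sentence is the guess for "original"
sent_a = "father and son solve it and the family is happily united .".split()
sent_b = "father and son solve When and the family is happily united .".split()
print("Predicted original:", "A" if sentence_log_prob(sent_a) > sentence_log_prob(sent_b) else "B")

Is that a reasonable way to set it up, or should I be training a pair classifier directly instead?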