Loss and accuracy values stagnating after a few epochs

I am fairly new to NLP. I was trying to learn word embeddings using word2vec on one of my favorite manga series. The problem I am facing is that the loss and accuracy values are stagnating after a few epochs. I need some guidance on how to increase the accuracy values and decrease the loss value.
I also extracted the embeddings and visualized them for the main characters, but the results look random.

Below is a snapshot of the code and the loss and accuracy plots.

class NGramDoc2vec(nn.Module):
    """Feed-forward n-gram model that scores every vocabulary word from a context.

    The context word embeddings are concatenated, passed through one hidden
    layer of 128 ReLU units and projected to ``word_vocab_size``
    log-probabilities.

    NOTE: a document-embedding table (``doc_embedings``) is allocated, but the
    forward pass currently ignores it; only the word embeddings contribute to
    the prediction.
    """

    def __init__(self, word_vocab_size, doc_vocab_size, embedding_dim, context_size):
        """Create the embedding tables and the two linear layers.

        Args:
            word_vocab_size: number of rows in the word embedding table and
                number of output classes.
            doc_vocab_size: number of rows in the document embedding table.
            embedding_dim: size of each embedding vector.
            context_size: number of context words per example; the first
                linear layer expects ``context_size * embedding_dim`` inputs.
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(word_vocab_size, embedding_dim)
        self.doc_embedings = nn.Embedding(doc_vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, word_vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the word vocabulary for each row.

        Args:
            inputs: 2-D index tensor. Every column except the last is looked
                up as a context word; the last column is dropped
                (presumably the document id -- confirm against the dataset).

        Returns:
            Tensor with one row per input row and ``word_vocab_size`` columns,
            normalised with ``log_softmax`` along dim 1.
        """
        context_ids = inputs[:, :-1]
        batch_size = context_ids.shape[0]
        # Concatenate the context embeddings of each example into one vector.
        flat_context = self.word_embeddings(context_ids).view(batch_size, -1)
        hidden = F.relu(self.linear1(flat_context))
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)

    def get_word_embedings(self, inputs):
        """Look up the embeddings for ``inputs`` and flatten them into one row."""
        return self.word_embeddings(inputs).view(1, -1)

# get the current value of learning rate
def get_lr(opt):
    """Return the ``'lr'`` entry of the optimizer's first parameter group.

    Returns ``None`` when ``opt.param_groups`` is empty.
    """
    first_group = next(iter(opt.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

#number of correct predictions per batch
def metrics_batch(output, target):
    """Count the predictions in a batch that match their target class.

    Args:
        output: 2-D tensor of per-class scores, one row per example.
        target: tensor of target class indices, one per example.

    Returns:
        The number of rows whose arg-max class equals the target, as a
        Python ``int``.
    """
    # get output class
    pred = output.argmax(dim=1, keepdim=True)
    # compare output class with target class; view_as aligns the target with
    # the (batch, 1) prediction so the comparison is element-wise, not broadcast
    corrects = pred.eq(target.view_as(pred)).sum().item()
    return corrects

# loss value per batch
def loss_batch(loss_func, output, target, opt=None):
    """Compute the loss and metric for one batch, optionally updating weights.

    Args:
        loss_func: callable taking ``(output, target)`` and returning a
            scalar loss tensor.
        output: model output for the batch.
        target: ground-truth targets for the batch.
        opt: optimizer. When given, one optimisation step is performed on
            this batch's loss; when ``None`` the batch is only evaluated.

    Returns:
        Tuple ``(loss_value, metric_b)``: the loss as a Python float and the
        value returned by ``metrics_batch`` for this batch.
    """
    loss = loss_func(output, target)
    # The metric is bookkeeping only; keep it out of the autograd graph.
    with torch.no_grad():
        metric_b = metrics_batch(output, target)
    if opt is not None:
        # Clear stale gradients, back-propagate this batch, apply the update.
        opt.zero_grad()
        loss.backward()
        opt.step()
    return loss.item(), metric_b

# loss value and performance metric for the entire dataset
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    """Run one pass over a data loader and return per-sample loss and metric.

    Args:
        model: callable mapping an input batch to an output batch.
        loss_func: loss passed through to ``loss_batch``.
        dataset_dl: iterable of ``(xb, yb)`` batches exposing a ``dataset``
            attribute whose length is the number of samples.
        sanity_check: when true, stop after the first batch.
        opt: optimizer forwarded to ``loss_batch``; ``None`` means
            evaluation only.

    Returns:
        Tuple ``(loss, metric)``: the accumulated batch losses and batch
        metrics, each divided by ``len(dataset_dl.dataset)``.
    """
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)
    for xb, yb in dataset_dl:
        # move batch to device
        xb = xb.to(device)
        yb = yb.to(device)
        # get model output
        output = model(xb)
        # get loss per batch
        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
        # update running loss
        running_loss += loss_b
        # update running metric
        if metric_b is not None:
            running_metric += metric_b
        # break the loop in case sanity check
        if sanity_check:
            break
    # average loss value
    # NOTE: the totals are divided by the full dataset size, so this is a true
    # per-sample average only if loss_func sums over the batch and the whole
    # loader was consumed (i.e. not in a sanity-check run).
    loss = running_loss / float(len_data)
    # average metric value
    metric = running_metric / float(len_data)
    return loss, metric

def train_val(model, params):
    """Train ``model`` and validate it after every epoch.

    Args:
        model: the network to train.
        params: dict with the keys ``'num_epochs'``, ``'loss_func'``,
            ``'optimizer'``, ``'train_dl'``, ``'val_dl'``, ``'sanity_check'``,
            ``'lr_scheduler'`` and ``'path2weights'``.

    Returns:
        Tuple ``(model, loss_history, metric_history)``. ``model`` carries the
        weights with the lowest validation loss seen; both histories are dicts
        with ``'train'`` and ``'val'`` lists holding one value per epoch.

    Side effects:
        Prints progress each epoch and writes the best ``state_dict`` to
        ``params['path2weights']`` whenever the validation loss improves.
    """
    # extract model parameters
    num_epochs = params['num_epochs']
    loss_func = params['loss_func']
    opt = params['optimizer']
    train_dl = params['train_dl']
    val_dl = params['val_dl']
    sanity_check = params['sanity_check']
    lr_scheduler = params['lr_scheduler']
    path2weights = params['path2weights']
    # history of loss values per epoch
    loss_history = {
        'train': [],
        'val': [],
    }
    # history of metric values in each epoch
    metric_history = {
        'train': [],
        'val': [],
    }
    # a deep copy of weights for the best performing model
    best_model_wts = copy.deepcopy(model.state_dict())
    # initialize the best loss to a large value
    best_loss = float('inf')
    # main loop
    for epoch in range(num_epochs):
        # get current learning rate
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current_lr {}'.format(epoch, num_epochs-1, current_lr))
        # train model on training dataset
        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        # collect loss and metric for training dataset
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)
        # evaluate model on the validation dataset
        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        # collect loss and metric for validation dataset
        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)
        # store the best model
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            # store weights into a local file
            torch.save(model.state_dict(), path2weights)
            print('copied best model weights!')
        # learning rate schedule: the scheduler watches the validation loss
        lr_scheduler.step(val_loss)
        if current_lr != get_lr(opt):
            # The scheduler just changed the learning rate; restart from the
            # best weights seen so far.
            print('Loading best model weights!')
            model.load_state_dict(best_model_wts)
        print('train losss: %.6f, dev loss: %.6f, accuracy: %.2f' \
              %(train_loss, val_loss, 100*val_metric))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history

# Summed negative log-likelihood; it consumes the log-probabilities that the
# model's forward pass produces with log_softmax.
loss_func = nn.NLLLoss(reduction='sum')
model = NGramDoc2vec(len(word_to_index), len(doc_to_index), EMBEDDING_DIM, CONTEXT_SIZE)
# Place the model on the target device before building the optimizer over its
# parameters.
model = model.to(device)

opt = optim.SGD(model.parameters(), lr=0.001)
# factor=0.5 halves the learning rate once the monitored value has plateaued.
lr_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=20, verbose=1)

params_train = {
    'num_epochs': 100,
    'optimizer': opt,
    'loss_func': loss_func,
    'train_dl': train_loader,
    'val_dl': val_loader,
    'sanity_check': False,
    'lr_scheduler': lr_scheduler,
    'path2weights': './mdl/word2vec/weights.pt',
}

model, loss_hist, metric_hist = train_val(model, params_train)

I think I am missing something here, but I am not able to pinpoint what. It would be great if you could help me understand it.

I haven’t checked your model or anything, this is just a general comment:

The CBOW or Skip-Gram model of Word2Vec is unlikely to yield high accuracies. Although both models are designed to predict a word given a context, or vice versa, that's not really the primary task – that is, you don't use Word2Vec to predict a word or a context in practice.

A high accuracy would imply, say for CBOW, that given a context you can predict the word with high probability. Language is just too flexible for that to happen. Sure, you try to force the network to learn to predict the word, but "only" to get useful word vectors.

In short, for Word2Vec – and for embedding solutions in general, I would assume, the absolute accuracy of the model is not really meaningful.

Thanks Chris,

The task I am training on is: given the context, predict the middle (missing) word. I am doing this on the summaries I scraped for a manga series. The end goal is to visualise the embeddings, look up a character and find similar words used for that character. I was using accuracy as a proxy for how good the model is at generating the embeddings. The other way is to measure the loss.
After training for 100 epochs I find that the loss is stagnating.
I also generated embeddings and tried to visualise the words and find similar words for a few characters in the series. The results I found were suboptimal, so I think I am missing something in my model.