Number of classes in an LSTM network when trying to predict word-embedding vectors

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader
import os
import argparse

# Hyperparameters
input_size = 1
hidden_size = 64
num_layers = 2
num_epochs = 300
batch_size = 1
model_dir = 'model'
log = 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs)

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_keys):
        super(Model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_keys)

    def forward(self, input):
        # Initialise the hidden and cell states with zeros for this batch
        h0 = torch.zeros(self.num_layers, input.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, input.size(0), self.hidden_size)
        out, _ = self.lstm(input, (h0, c0))
        # Only the output of the last time step is passed to the classifier
        out = self.fc(out[:, -1, :])
        return out


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_classes', type=int, default=2148)
    parser.add_argument('-num_layers', default=2, type=int)
    parser.add_argument('-hidden_size', default=64, type=int)
    parser.add_argument('-window_size', default=10, type=int)
    args = parser.parse_args()
    num_classes = args.num_classes
    num_layers = args.num_layers
    hidden_size = args.hidden_size
    window_size = args.window_size

    model = Model(input_size, hidden_size, num_layers, num_classes)
    dataset = torch.load('words.pt')
    #dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    writer = SummaryWriter(logdir='log/' + log)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Train the model
    for epoch in range(num_epochs):  # Loop over the dataset multiple times
        train_loss = 0
        # Stop one step early so that dataset[step + 1] is always a valid index
        for step, vec in enumerate(dataset[:-1]):
            # Forward pass: reshape the word vector into (batch, seq_len, input_size)
            vec = vec.clone().detach().view(-1, 1, input_size)
            output = model(vec)
            # The next word's embedding, cast to long, is used as the target;
            # this is the line that raises the RuntimeError below
            loss = criterion(output, dataset[step + 1].long())
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        print('Epoch [{}/{}], Train_loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss / (len(dataset) - 1)))
        writer.add_scalar('train_loss', train_loss / (len(dataset) - 1), epoch + 1)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    torch.save(model.state_dict(), os.path.join(model_dir, log + 'part2.pt'))
    writer.close()
    print('Finished Training')

So what am I trying to accomplish here:
I have a lot of sentences made up of words. I parse these words into word-embedding vectors using Flair, stacking the "GloVe" and "Flair news-forward" embeddings. This produces a vector of size

torch.Size([1, 2148])

for every word.
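
For reference, this is roughly how the vectors are produced in a separate preprocessing script (a minimal sketch; the example sentence and the torch.save call are placeholders for my actual data):

import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings

# GloVe contributes 100 dimensions and Flair 'news-forward' contributes 2048,
# so every word ends up as a 100 + 2048 = 2148-dimensional vector
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
])

sentence = Sentence('this is just an example sentence')
stacked_embeddings.embed(sentence)

# token.embedding is a 1-D tensor of size 2148; unsqueezing gives the
# torch.Size([1, 2148]) shape mentioned above
vectors = [token.embedding.unsqueeze(0) for token in sentence]
torch.save(torch.cat(vectors), 'words.pt')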

What I am trying to do now is feed these word vectors into an LSTM network, so that the LSTM predicts what the next word in the sentence could be.
I have left the batch size at 1 and am not using batching yet, because the other issues need to be fixed first.
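
To make the shapes concrete, here is a small shape trace of what a single step of the training loop sees (reusing model, dataset and input_size from the script above, and assuming words.pt holds one 2148-dimensional row per word):

vec = dataset[0]                                     # torch.Size([2148]): embedding of the first word
inp = vec.clone().detach().view(-1, 1, input_size)   # torch.Size([2148, 1, 1]), since input_size = 1
out = model(inp)                                     # torch.Size([2148, num_classes])
tgt = dataset[1].long()                              # torch.Size([2148]): the next embedding cast to long
print(out.shape, tgt.shape)

So the criterion receives a [2148, num_classes] input and a [2148] target.
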
So currently I am receiving the following error:

Traceback (most recent call last):
  File "/Users/haraldott/Development/thesis/anomaly_detection_main/venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<string>", line 1, in <module>
    runfile('/Users/haraldott/Development/thesis/anomaly_detection_main/loganaliser/model_train.py', wdir='/Users/haraldott/Development/thesis/anomaly_detection_main/loganaliser')
  File "/Applications/PyCharm.app/Contents/helpers/pydev/_pydev_bundle/pydev_umd.py", line 197, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "/Applications/PyCharm.app/Contents/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
    exec(compile(contents + "\n", file, 'exec'), glob, loc)
  File "/Users/haraldott/Development/thesis/anomaly_detection_main/loganaliser/model_train.py", line 118, in <module>
    loss = criterion(output, dataset[step + 1].long())
  File "/Users/haraldott/Development/thesis/anomaly_detection_main/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/Users/haraldott/Development/thesis/anomaly_detection_main/venv/lib/python3.7/site-packages/torch/nn/modules/loss.py", line 916, in forward
    ignore_index=self.ignore_index, reduction=self.reduction)
  File "/Users/haraldott/Development/thesis/anomaly_detection_main/venv/lib/python3.7/site-packages/torch/nn/functional.py", line 1995, in cross_entropy
    return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
  File "/Users/haraldott/Development/thesis/anomaly_detection_main/venv/lib/python3.7/site-packages/torch/nn/functional.py", line 1824, in nll_loss
    ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: Assertion `cur_target >= 0 && cur_target < n_classes' failed. at .../aten/src/THNN/generic/ClassNLLCriterion.c:94

So I'm 100% sure that there's something wrong with the number of classes (num_classes in the code); it's not supposed to be 2148, but I can't figure it out.
I'm also not sure whether there is another error somewhere in between.
If someone could give me some hints, I would be very thankful.