Loss function for batches in Sequence Tagging

I am trying sequence tagging on CoNLL 2003. I followed the sequence tagging tutorial in the PyTorch documentation and arrived at the code below. I am stuck on the loss function.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import numpy as np

#Variable Initialization
x,y = [],[]
data = []

#Hyper parameters
input_dim = 200
hidden_dim = 128
num_layers = 1
batch_size = 5
learning_rate = 0.01
num_epochs = 5

def prepare_data():
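    # Read the CoNLL file, split it on blank lines into sentences, skip -DOCSTART- lines,
    # and collect each sentence's words and NER tags (4th column) as parallel sequences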
    global data
    data = open('conll_2003.txt').read()
    seq_lengths = []
    training_data= []
    x_text = ""
    y_label = ""
    l = 0
    for i,line in enumerate(data.split("\n")):
        if len(line) > 0:
            tokens = line.split(" ")
            if tokens[0] != "-DOCSTART-":
                x_text = x_text + str(tokens[0]) + str(" ")
                y_label = y_label + str(tokens[3]) + str(" ")
            else:
                continue
        else:
            if len(x_text.strip().split()) < 200 and len(x_text.strip().split()) > 0:
                l=l+1
                x.append(np.array(x_text.strip().split()))
                y.append(np.array(y_label.strip().split()))
                seq_lengths.append(len(x_text.strip().split()))
                x_text = ""
                y_label = ""
    return x,y

x,y = prepare_data()
x = np.array(x)
y = np.array(y)
num_batches = math.ceil(len(x)/batch_size)  # number of batches over the list of sentences

word_to_ix_obj = {'<PAD>':0}
tag_to_ix_obj = {'<PAD>':0}

def word_to_ix():
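    # Build the word-to-index vocabulary; index 0 is reserved for '<PAD>'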
    for i,item in enumerate(x):
        for word in item:
            if word not in word_to_ix_obj:
                word_to_ix_obj[str(word)] = len(word_to_ix_obj)
    
def tag_to_ix():
    for i,item in enumerate(y):
        for tag in item:
            if tag not in tag_to_ix_obj:
                tag_to_ix_obj[tag] = len(tag_to_ix_obj)

word_to_ix()
tag_to_ix()

def convert_words_to_ix(inputs,ix_map):
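    # Convert every sequence in inputs to a list of indices using the given mapping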
    text_to_ix = []
    for i,text in enumerate(inputs):
        text_to_ix.append(prepare_sequence(text,ix_map))
    return text_to_ix
    
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return idxs    
    
def prepare_batch(index):
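    # Fetch one batch, convert words and tags to indices, and pad every sequence
    # with the <PAD> index up to the longest sequence in the batch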
    x,y = batch_iterator(index)
    x = convert_words_to_ix(x,word_to_ix_obj)
    y = convert_words_to_ix(y,tag_to_ix_obj)
    x_lengths = [len(sentence) for sentence in x]
    pad_token = word_to_ix_obj['<PAD>']
    longest_sentence = max(x_lengths)
    padded_x = np.ones((batch_size,longest_sentence),dtype=int) * pad_token
    for i,x_length in enumerate(x_lengths):
        temp = x[i]
        padded_x[i,0:x_length] = temp[:x_length]
    y_lengths = [len(sentence) for sentence in y]
    pad_token = tag_to_ix_obj['<PAD>']
    longest_sentence = max(y_lengths)
    padded_y = np.ones((batch_size,longest_sentence),dtype=int) * pad_token
    for i,y_length in enumerate(y_lengths):
        temp = y[i]
        padded_y[i,0:y_length] = temp[:y_length]
    return padded_x,padded_y

def batch_iterator(index):
    # index is 0-based and the slice end is exclusive, so a full batch spans [start, start + batch_size)
    start_index = index * batch_size
    end_index = start_index + batch_size
    return x[start_index:end_index],y[start_index:end_index]

class LSTM(nn.Module):
    
    def __init__(self,input_dim, hidden_dim, output_dim, batch_size, vocab_size, num_layers=1):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.output_dim = output_dim
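        # Embedding -> LSTM (batch_first) -> Linear projection to per-token tag scores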
        self.word_embeddings = nn.Embedding(vocab_size, input_dim)
        self.lstm = nn.LSTM(self.input_dim,self.hidden_dim,self.num_layers,batch_first=True)
        self.linear = nn.Linear(self.hidden_dim,self.output_dim)

    def forward(self, x):
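        # x: padded (batch_size, seq_len) array of word indices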
        input_tensor = torch.LongTensor(x)
        h0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_dim)
        c0 = torch.zeros(self.num_layers, self.batch_size, self.hidden_dim)
        embeds = self.word_embeddings(input_tensor)
        lstm_out, _ = self.lstm(embeds, (h0, c0))
        tag_space = self.linear(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

model = LSTM(input_dim, hidden_dim, len(tag_to_ix_obj), batch_size,len(word_to_ix_obj))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for i in range(num_batches):
        model.zero_grad()
        input_x,input_y = prepare_batch(i)
        tag_scores = model(input_x)
        
        print("tag_scores shape :")
        print(tag_scores.shape)
        print(tag_scores.size())
        print(tag_scores)
        
        print("\n")
        print("====================")
        print("\n")
        
        input_y = torch.from_numpy(input_y)
        
        print("input_y shape :")
        print(input_y.shape)
        print(input_y.size())
        print(input_y)

        input_y = input_y.float()
        print("type of input_y")
        print(input_y.type())
        
        loss = loss_function(tag_scores, input_y)
        loss.backward()
        optimizer.step()

I have set batch_size to 5, and 31 is the maximum sequence length in this batch. When I run the above, it prints the debug output and then fails with:

~/anaconda3/lib/python3.7/site-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   1798         if target.size()[1:] != input.size()[2:]:
   1799             raise ValueError('Expected target size {}, got {}'.format(
-> 1800                 out_size, target.size()))
   1801         input = input.contiguous().view(n, c, 1, -1)
   1802         target = target.contiguous().view(n, 1, -1)

ValueError: Expected target size (5, 10), got torch.Size([5, 31])

The error indicates that, given the shape of tag_scores, the loss expects the target to have size (5, 10), while your input_y has size (5, 31). Since the last dimension of tag_scores comes from this line, self.linear = nn.Linear(self.hidden_dim,self.output_dim), can you confirm what output_dim you have used to construct the model?
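
For reference, here is a small self-contained illustration of where the expected target size (5, 10) comes from; the shapes are taken from your error message, not from your actual model. nn.NLLLoss treats dimension 1 of a 3-D input as the class dimension, so log-probabilities of shape (5, 31, 10) imply 31 classes and a target of shape (5, 10):

import torch
import torch.nn as nn

loss_fn = nn.NLLLoss()

# A 3-D input is interpreted as (N, C, d): dimension 1 is the class dimension.
# Shape (5, 31, 10) therefore means 31 classes and an expected target of shape (5, 10).
log_probs = torch.randn(5, 31, 10).log_softmax(dim=1)
ok_target = torch.randint(0, 31, (5, 10))    # accepted, but the "classes" here are the 31 time steps
bad_target = torch.randint(0, 31, (5, 31))   # same shape as the padded tags in the question

print(loss_fn(log_probs, ok_target).item())  # runs, though it is not what we want semantically
try:
    loss_fn(log_probs, bad_target)
except (ValueError, RuntimeError) as e:      # "Expected target size (5, 10), got ... (5, 31)"
    print(e)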

I set output_dim=10. I chose 10 for the following reason.

Consider the following sentence:

Andrew ng is a data scientist
B-PER I-PER O O O O

Each token can take any of 10 different tags, such as B-PER, I-PER, B-LOC, I-LOC.

I see. Then you should change the shape of input_y so that it is consistent with those 10 classes as well. You also need to handle the sequence length differently; exactly how depends on what kind of model you want, e.g. many-to-many, many-to-one, …
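
For the many-to-many case (one tag per token), here is a minimal, self-contained sketch of one way to make the shapes consistent. The names mirror the question (tag_space, input_y, a <PAD> tag at index 0), the shapes come from the error message, and the random tensors are only stand-ins for the real model output and padded labels:

import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len, num_tags = 5, 31, 10
pad_index = 0                                                 # index of the '<PAD>' tag

tag_space = torch.randn(batch_size, seq_len, num_tags)        # stand-in for self.linear(lstm_out)
input_y = torch.randint(0, num_tags, (batch_size, seq_len))   # stand-in for the padded tag indices

# Normalize over the tag dimension (dim=2), not over the sequence dimension
tag_scores = F.log_softmax(tag_space, dim=2)                  # (batch, seq_len, num_tags)

# NLLLoss wants (N, C) log-probs and (N,) class indices, so flatten batch and time steps;
# ignore_index keeps the padded positions from contributing to the loss
loss_function = nn.NLLLoss(ignore_index=pad_index)
loss = loss_function(
    tag_scores.reshape(-1, num_tags),                         # (batch * seq_len, num_tags)
    input_y.reshape(-1)                                       # (batch * seq_len,)
)
print(loss.item())

Note also that the target of NLLLoss has to stay a LongTensor of class indices, so the input_y = input_y.float() conversion in the training loop above would trigger a type error even once the shapes line up.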

Did you end up finding the solution to this?