Loss function input in multiclass text classification using RNN

I am trying to implement multiclass classification using an RNN. There are 9 categories, e.g. ‘GRASS’, ‘POLISH’, etc.
How should I pass the true labels to the criterion loss?
loss1 = criterion(output,true)

  1. Should the true labels passed to the criterion be one-hot encoded, or just class indices like 0, 1, 2, …?
  2. Should argmax be taken over the output?
num_classes = 9
num_epochs = 2
batch_size = 1
learning_rate = 0.001

input_size = embeded_train.shape[2]
sequence_length = embeded_train.shape[1]
hidden_size = 128
num_layers = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # x: [batch, seq_len, input_size]; h0 is the initial hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]   # keep only the hidden state of the last time step
        out1 = self.fc(out)   # project to num_classes logits
        return out1

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  for i in range(len(embeded_train)):   # iterate over the training samples, not input_size
    input = embeded_train[i].reshape(-1, 84, 300).to(device)
    output = model(input)
    loss1 = criterion(output, true)     # `true` holds the class label(s) for this sample

Do the labels ever have multiple classes in them? For example, is there any sample that is both grass and polish, or are they always only one or the other? If some have multiple classes, then yes, you should use a one-hot (multi-hot) encoding; otherwise you do not have to, since nn.CrossEntropyLoss expects plain class indices. Also, do not apply argmax to the output before the loss; only use it when you want to see the prediction.
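If it helps, here is a minimal sketch (shapes and variable names are made up) of what nn.CrossEntropyLoss expects in the single-label case: raw logits from the model plus a LongTensor of class indices, with no one-hot encoding and no argmax before the loss.

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

logits = torch.randn(4, 9)               # model output: [batch_size, num_classes] raw scores
targets = torch.tensor([2, 7, 0, 3])     # class indices in 0..8, NOT one-hot vectors

loss = criterion(logits, targets)        # log-softmax + NLL are applied internally

preds = logits.argmax(dim=1)             # argmax only when you want the predicted class
accuracy = (preds == targets).float().mean()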


It helped. Thank you.
I am a novice in PyTorch. I have written a simple RNN with GloVe embeddings today, but the loss is not decreasing. It would be helpful if you could find some time to give suggestions.

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np


import pickle

with open("/content/word2index.pickle",'rb') as out:
    word2index = pickle.load(out)


with open("/content/train_data.pickle",'rb') as out:
    train = pickle.load(out)

with open("/content/val_data.pickle",'rb') as out:
    val = pickle.load(out)

with open("/content/test_data.pickle",'rb') as out:
    test = pickle.load(out)

with open("/content/word2index.pickle",'rb') as out:
    word2index = pickle.load(out)

import pickle
with open("/content/dic.pickle",'rb') as out:
    model=pickle.load(out)
embedding_matrix = np.zeros((len(word2index) + 1, 300))
for k,v in model.items():
    embedding_matrix[k]=v

weight = torch.FloatTensor(embedding_matrix)#customized glove embeddings

embedding = nn.Embedding.from_pretrained(weight)

MAXLEN=0
def sequence(word2index,mode):   #returns index for words and its y label
  sequence=[]
  y_seq=[]
  for s in mode:
    text = s.split('__split__')[0]
    y = s.split('__split__')[1]
    one_seq=[]
    words= text.split(' ')
    global MAXLEN
    if(len(words)>MAXLEN):
      MAXLEN= len(words)

    for w in words:
      if (w in word2index.keys()):
        one_seq.append(word2index[w])
      else:
        one_seq.append(1)
    sequence.append(one_seq)
    y_seq.append(y)
  return sequence,y_seq

sequence,y_seq = sequence(word2index,train)

X = pad_sequences(sequence, maxlen=MAXLEN, padding='post', truncating='post')
input = torch.LongTensor(X)
embeded_train = embedding(input)   # pre-computed GloVe embeddings for the training set

batch_size = 32
# `labels` should be a LongTensor of class indices (0-8) built from y_seq; the conversion is not shown here
dataset = TensorDataset(embeded_train, labels)
loader = DataLoader(dataset, batch_size=batch_size)

num_classes = 9
num_epochs = 10

learning_rate = 0.001

input_size = embeded_train.shape[2]
sequence_length = embeded_train.shape[1]
hidden_size = 128
num_layers = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        out, _ = self.rnn(x, h0) 
        out = out[:, -1, :]
        out1 = self.fc(out)
        return out1

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  for x,y in loader:
    output = model(x)
    loss = criterion(output,y)
    acc = binary_accuracy(predictions, batch.Label)
    loss.backward()
    optimizer.zero_grad()
    optimizer.step()
    if (epoch+1) % 5 == 0:
      print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Sorry, I couldn't find it in your code: are your labels one-hot encoded or not?

No, they are in this format:
tensor([2, 2, 2, 7, 2, 3, 1, 6, 3, 7, 2, 3, 2, 3, 3, 3, 8, 4, 8, 3, 3, 3, 3, 7,
4, 3, 4, 8, 3, 8, 3, 6])

There are 9 labels in total, from 0 to 8.
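For what it's worth, that format works directly with nn.CrossEntropyLoss as long as the tensor is a LongTensor and the model output has shape [batch_size, 9]. A quick sanity check with dummy logits (shapes assumed from the code above):

import torch
import torch.nn as nn

y = torch.tensor([2, 2, 2, 7, 2, 3, 1, 6, 3, 7, 2, 3, 2, 3, 3, 3, 8, 4, 8, 3,
                  3, 3, 3, 7, 4, 3, 4, 8, 3, 8, 3, 6])   # the batch of class indices above
logits = torch.randn(32, 9)               # dummy model output: [batch_size, num_classes]
loss = nn.CrossEntropyLoss()(logits, y)   # runs as-is, no one-hot encoding needed
print(loss.item())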

When you compute the loss, try keeping a running average, something like this:

            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

and then in between your two for loops, reset it at the start of each epoch:

for epoch in range(num_epochs):
  train_loss = 0.
  for batch_idx, (x, y) in enumerate(loader):

Then, instead of wrapping the print in an if statement, move it to the outer epoch loop so it prints once per epoch, and change loss.item() to train_loss.

for epoch in range(num_epochs):
  train_loss = 0.
  for batch_idx, (x, y) in enumerate(loader):
    x, y = x.to(device), y.to(device)   # move the batch to the same device as the model
    optimizer.zero_grad()               # clear gradients from the previous step before backward/step
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    # running average of the loss over the batches seen so far in this epoch
    train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

  print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, train_loss))

Something like that. Run it for a few epochs and see whether the loss starts decreasing.