IndexError: Target -1 is out of bounds

JackXu9 · August 15, 2022, 7:49pm

I am trying to create a text classifier as the one found here:
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

When I run the code I encounter the following:

line 3014, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
IndexError: Target -1 is out of bounds.

I have made sure that the number of outputs match across training, valid and test sets. The code is as follows:

import torch
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

df = pd.read_csv("cleaned_train.csv", sep="|")
df_test = pd.read_csv("cleaned_test3.csv", sep="|")

# Splitting training set into train set and test set
target_train, input_valid, y_train, input_valid = train_test_split(df['label'].tolist(),
                                                    df['content'].tolist(),
                                                    test_size=0.2,
                                                    random_state=42)
# making the data fit to the pipeline
train_iter = list(zip(target_train, y_train))
valid_dat = list(zip(input_valid, input_valid))
test_dat = list(zip(df_test['label'].tolist(), df_test['content'].tolist()))

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_iter = list(zip(target_train, y_train))
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

from torch import nn
import torch.nn.functional as F

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()
        self.fc3.weight.data.uniform_(-initrange, initrange)
        self.fc3.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        x = F.relu(self.fc1(embedded))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


train_iter = list(zip(target_train, y_train))
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 128
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)


import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count


from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 15
LR = 0.1  # learning rate
BATCH_SIZE = 16 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_iter2 = train_iter
test_iter = test_dat
valid_iter = valid_dat

train_dataloader = DataLoader(train_iter2, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_iter, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)


for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

ptrblck · August 16, 2022, 4:03am

It seems you might be creating negative target indices while they should contain values in the range [0, nb_classes-1] here:

label_pipeline = lambda x: int(x) - 1
...
label_list.append(label_pipeline(_label))
...
label_list = torch.tensor(label_list, dtype=torch.int64)

JackXu9 · August 16, 2022, 12:17pm

Thanks!

Now the model is working, but I have trouble printing the loss values. I have added the following to the training loop just after the epoch, batches and accuracy print:

print(f'loss:{loss.item():.4f}')

However, I’m getting consistent increase in accuracy over 1 epoch but completely random loss values:

| epoch   1 |   500/ 8541 batches | accuracy    0.392
loss:2.1764
| epoch   1 |  1000/ 8541 batches | accuracy    0.493
loss:1.2350
| epoch   1 |  1500/ 8541 batches | accuracy    0.511
loss:0.6922
| epoch   1 |  2000/ 8541 batches | accuracy    0.588
loss:2.4791
| epoch   1 |  2500/ 8541 batches | accuracy    0.583
loss:1.1125
| epoch   1 |  3000/ 8541 batches | accuracy    0.578
loss:0.1560
| epoch   1 |  3500/ 8541 batches | accuracy    0.583
loss:3.8755
| epoch   1 |  4000/ 8541 batches | accuracy    0.599
loss:0.0014
| epoch   1 |  4500/ 8541 batches | accuracy    0.598
loss:2.1682
| epoch   1 |  5000/ 8541 batches | accuracy    0.611
loss:2.1124
| epoch   1 |  5500/ 8541 batches | accuracy    0.625
loss:0.1283
| epoch   1 |  6000/ 8541 batches | accuracy    0.624
loss:1.3151
| epoch   1 |  6500/ 8541 batches | accuracy    0.639
loss:0.0034
| epoch   1 |  7000/ 8541 batches | accuracy    0.666
loss:0.0003
| epoch   1 |  7500/ 8541 batches | accuracy    0.654
loss:0.5905
| epoch   1 |  8000/ 8541 batches | accuracy    0.657
loss:1.4126
| epoch   1 |  8500/ 8541 batches | accuracy    0.617
loss:0.4063

Have I missed something or is it the .item() that I’m not using correctly?

ptrblck · August 16, 2022, 4:29pm

You could try to print all batch losses and check how noisy they are generally. Maybe the general loss still decreases while printing the batch loss every 500 batches might be quite noisy.