arr = [self.vocab.stoi[x] for x in arr] KeyError: 'hinton city official confirm multiple building fire'

I’m trying to train a model to classify whether tweets are talking about a disaster or not.

Anyway, I ran into this error and I don’t understand what it is telling me to fix.

Traceback (most recent call last):
  File "C:/Users/BHAAK/Desktop/ML_PATH/dirty-hands/dirty-hands file 3/Project.py", line 333, in <module>
    valid_loss, valid_acc = evaluate(model, val_set, criterion)
  File "C:/Users/BHAAK/Desktop/ML_PATH/dirty-hands/dirty-hands file 3/Project.py", line 307, in evaluate
    for batch in iterator:
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\iterator.py", line 156, in __iter__
    yield Batch(minibatch, self.dataset, self.device)
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\batch.py", line 34, in __init__
    setattr(self, name, field.process(batch, device=device))
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\field.py", line 237, in process
    tensor = self.numericalize(padded, device=device)
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\field.py", line 338, in numericalize
    arr = [self.vocab.stoi[x] for x in arr]
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\field.py", line 338, in <listcomp>
    arr = [self.vocab.stoi[x] for x in arr]
KeyError: 'get battlefield num scream mic cunt'

Here is the whole code:

import torchtext.data as data
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


TEXT = data.Field(batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)
fields = [('text', TEXT), ('label', LABEL)]  # field name must match batch.label used in the train/evaluate loops

train, val = data.TabularDataset(path='processed_data.csv', format='csv', fields=fields, skip_header=True).split()


TEXT.build_vocab(train, min_freq=5, vectors='glove.twitter.27B.50d')
LABEL.build_vocab(train)

train_set, val_set = data.BucketIterator.splits((train, val), batch_sizes=(32, 32), sort_within_batch=True,
                                        device='cpu', sort_key=lambda x: len(x.text))



class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.Embedding = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

        self.act = nn.Sigmoid()

    def forward(self, text, text_length):
        x = self.Embedding(text)
        x = pack_padded_sequence(x, text_length, batch_first=True)
        x, (state, cell) = self.LSTM(x)
        x = torch.cat((state[-2, :, :], state[-1, :, :]), dim=1)
        x = self.fc(x)
        return self.act(x)

size_of_vocab = len(TEXT.vocab)
embedding_dim = 50
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = Net(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
            bidirectional=bidirection, dropout=dropout)

# Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.Embedding.weight.data.copy_(pretrained_embeddings)

import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()


def binary_accuracy(preds, y):
    # round predictions to the closest integer
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


def train(model, iterator, optimizer, criterion):
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # set the model in training phase
    model.train()

    for batch in iterator:
        # resets the gradients after every batch
        optimizer.zero_grad()

        # retrieve text and no. of words
        text, text_lengths = batch.text
        # squeeze the (batch, 1) output down to a 1D tensor
        predictions = model(text, text_lengths).squeeze(1)

        # compute the loss
        loss = criterion(predictions, batch.label)

        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        # backpropagate the loss and compute the gradients
        loss.backward()

        # update the weights
        optimizer.step()

        # loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()

    # deactivates autograd
    with torch.no_grad():
        for batch in iterator:
            # retrieve text and no. of words
            text, text_lengths = batch.text

            # squeeze the (batch, 1) output down to a 1d tensor
            predictions = model(text, text_lengths).squeeze(1)

            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


best_valid_loss = float('inf')

for epoch in range(20):

    # train the model
    train_loss, train_acc = train(model, train_set, optimizer, criterion)

    # evaluate the model
    valid_loss, valid_acc = evaluate(model, val_set, criterion)

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

The problem comes from the evaluate() function.

I’d be pleased to hear any opinions.

Thanks

Similar to this issue, it seems your data loading is failing, so I would also recommend narrowing down the index of the failing sample and debugging the Dataset manually.
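
A minimal sketch of what that manual check could look like, assuming the TEXT field and the val split from your code are in scope; it walks the validation examples and prints any tokens that are missing from the vocabulary:

# hypothetical debugging loop: report out-of-vocabulary tokens per example
for i, example in enumerate(val.examples):
    missing = [tok for tok in example.text if tok not in TEXT.vocab.stoi]
    if missing:
        print(i, missing)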

You are right, the function itself has no problem, because it works when I try it with train_set, so the problem comes from val_set.
But I don’t know what is wrong with the data.

Can you explain what is wrong with this code?

TEXT = data.Field(batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)
fields = [('text', TEXT), ('label', LABEL)]

train, val = data.TabularDataset(path='processed_data.csv', format='csv', fields=fields, skip_header=True).split()

TEXT.build_vocab(train, min_freq=5, vectors='glove.twitter.27B.50d')
LABEL.build_vocab(train)

train_set, val_set = data.BucketIterator.splits((train, val), batch_sizes=(32, 32), sort_within_batch=True,
                                                device='cpu', sort_key=lambda x: len(x.text))

But I think the issue comes from the preprocessing phase, because the code itself looks fine to me.
What do you think?

I’m not deeply familiar with torchtext, but it seems the stoi dict is being given a complete sentence, while I assume only single words are expected?

Could you share an executable code snippet so that we could debug?
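
In the meantime, one quick way to test that hypothesis (a sketch, assuming the val dataset from your snippet is in scope) is to print one processed example and check that its text attribute is a list of word tokens rather than one long string:

print(val.examples[0].text)
# expected: a list of tokens, e.g. ['get', 'battlefield', 'num', ...]
# if this prints a single whole sentence, the field is not tokenizing the column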

Here is the whole processing script.

Note that I have processed other datasets with this function and run my network without problems, but with this dataset I get this error.

import pandas as pd

df = pd.read_csv('quora.csv', usecols=['question_text', 'target'])[:10000]

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

def clean_text(tweet):

    # Character entity references
    tweet = re.sub(r"&?gt;", ">", tweet)
    tweet = re.sub(r"&?lt;", "<", tweet)
    tweet = re.sub(r"&?amp;", "&", tweet)

    # Typos, slang and informal abbreviations
    tweet = re.sub(r"\d*/\d*/?\d*", "y-m-d", tweet)
    tweet = re.sub("\d+:\d+", 'hour', tweet)
    tweet = re.sub(r"\d+yr", "old year", tweet)
    tweet = re.sub("#\S*", 'h-tag ', tweet)
    tweet = re.sub(r"http\S*", "URL", tweet)
    tweet = re.sub(r"\S*\@\S*", "email", tweet)
    tweet = re.sub("\s{2,}", " ", tweet)


    # Hashtags and usernames
    tweet = re.sub(r"\S*(M|m)usic\S*", "music", tweet)
    tweet = re.sub(r"(P|p)rophet (M|m)uhammad", "prophet muhammad", tweet)
    tweet = re.sub(r"\S*(L|l)ove\S*", "love", tweet)
    tweet = re.sub(r"\S* (S|s)ummer\S*", "summer", tweet)
    tweet = re.sub(r"\S*NASA\S*", "nasa", tweet)
    tweet = re.sub(r"\S*book\S*", "book", tweet)
    tweet = re.sub(r"\S*(I|i)sland\S*", "Island", tweet)
    tweet = re.sub(r"20\d\d", "year", tweet)
    tweet = re.sub(r"\S*(K|k)ing\S*", "king", tweet)
    tweet = re.sub(r"\S*(C|c)it(ies|y)\S*", "city", tweet)
    tweet = re.sub(r"\S*RT\S*", "RT", tweet)
    tweet = re.sub(r"\S*(H|h)ealth\S*", "health", tweet)
    tweet = re.sub(r"\S*(S|s)ave\S*", "save bees", tweet)
    tweet = re.sub(r"\S*(T|t)raffic\S*", "traffic", tweet)
    tweet = re.sub(r"\S*(K|k)ashmir\S*", "kashmir", tweet)
    tweet = re.sub(r"\S*(C|c)onflict\S*", "conflict", tweet)
    tweet = re.sub(r"\S*(S|s)torm\S*", "storm", tweet)
    tweet = re.sub(r"\S*(O|o)il\S*", "oil", tweet)
    tweet = re.sub(r"\S*(V|v)ideo\S*", "video", tweet)
    tweet = re.sub(r"\S*([Ff])ire\S*", "fire", tweet)
    tweet = re.sub(r"\S*(W|w)eather\S*", "weather", tweet)
    tweet = re.sub(r"\S*(S|s)un\S+", "sun", tweet)
    tweet = re.sub(r"\S*(BBC|bbc)\S*", "bbc news", tweet)
    tweet = re.sub(r"\S*(D|d)ay\S*", "day", tweet)
    tweet = re.sub(r"\S*(E|e)ffect\S*", "effect", tweet)
    tweet = re.sub(r"\S*([Tt])error\S*", "terrorism", tweet)
    tweet = re.sub(r"\S*([Ss])ocial\S*", "social", tweet)
    tweet = re.sub(r"\S*([Ww])ord\S*", "wosrd", tweet)
    tweet = re.sub(r"\S*([Aa])ccident\S*", "accident", tweet)
    tweet = re.sub(r"\S*([Ss])port\S*", "sport", tweet)
    tweet = re.sub(r"\S*([Ii])ndia", "india", tweet)
    tweet = re.sub(r"\S*UK\S*", "UK", tweet)
    tweet = re.sub(r"\S*USA?\S*", "USA", tweet)
    tweet = re.sub(r"\w*(N|n)EWS", "NEWS", tweet)
    tweet = re.sub(r"\w*(D|d)am", "dam", tweet)
    tweet = re.sub(r"\S*(V|v)ideo\S*", "video", tweet)
    tweet = re.sub(r"\w*(G|g)ames?", "Game", tweet)
    tweet = re.sub(r"\S*(Y|y)ou(T|t)ube\S*", "youtube", tweet)
    tweet = re.sub(r"hrs", "hour", tweet)
    tweet = re.sub(r"txt", "text", tweet)
    tweet = re.sub(r"\s+k\s+", "ok", tweet)
    tweet = re.sub(r"\s+b\s+", "be", tweet)
    tweet = re.sub(r"\s+u(r|s)?", "you", tweet)
    tweet = re.sub(r"hom", "home", tweet)
    tweet = re.sub(r"yous", "you", tweet)
    tweet = re.sub(r"\s+n\s+", "in", tweet)
    tweet = re.sub(r"got", "get", tweet)
    tweet = re.sub(r"gave", "give", tweet)
    tweet = re.sub(r"fr", "for", tweet)

    text = tweet.lower()

    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub("\?{2,}", " ?", text)
    text = re.sub("\!{2,}", " !", text)
    text = re.sub("\?", " ?", text)
    text = re.sub("\!", " !", text)

    tweet = text

    # Contractions (deduplicated; repeated substitutions were no-ops)
    tweet = re.sub(r"there's", "there is", tweet)
    tweet = re.sub(r"that's", "that is", tweet)
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"they're", "they are", tweet)
    tweet = re.sub(r"can't", "cannot", tweet)
    tweet = re.sub(r"wasn't", "was not", tweet)
    tweet = re.sub(r"isn't", "is not", tweet)
    tweet = re.sub(r"what's", "what is", tweet)
    tweet = re.sub(r"haven't", "have not", tweet)
    tweet = re.sub(r"hasn't", "has not", tweet)
    tweet = re.sub(r"he's", "he is", tweet)
    tweet = re.sub(r"it's", "it is", tweet)
    tweet = re.sub(r"you're", "you are", tweet)
    tweet = re.sub(r"i'm", "i am", tweet)
    tweet = re.sub(r"shouldn't", "should not", tweet)
    tweet = re.sub(r"wouldn't", "would not", tweet)
    tweet = re.sub(r"in't", "is not", tweet)
    tweet = re.sub(r"here's", "here is", tweet)
    tweet = re.sub(r"you've", "you have", tweet)
    tweet = re.sub(r"we're", "we are", tweet)
    tweet = re.sub(r"couldn't", "could not", tweet)
    tweet = re.sub(r"we've", "we have", tweet)
    tweet = re.sub(r"who's", "who is", tweet)
    tweet = re.sub(r"it'll", "it will", tweet)
    tweet = re.sub(r"we'll", "we will", tweet)
    tweet = re.sub(r"he'll", "he will", tweet)
    tweet = re.sub(r"y'all", "you all", tweet)
    tweet = re.sub(r"weren't", "were not", tweet)
    tweet = re.sub(r"they'll", "they will", tweet)
    tweet = re.sub(r"they'd", "they would", tweet)
    tweet = re.sub(r"they've", "they have", tweet)
    tweet = re.sub(r"should've", "should have", tweet)
    tweet = re.sub(r"where's", "where is", tweet)
    tweet = re.sub(r"we'd", "we would", tweet)
    tweet = re.sub(r"let's", "let us", tweet)
    tweet = re.sub(r"doesn't", "does not", tweet)
    tweet = re.sub(r"didn't", "did not", tweet)
    tweet = re.sub(r"ain't", "am not", tweet)
    tweet = re.sub(r"you'll", "you will", tweet)
    tweet = re.sub(r"i've", "i have", tweet)
    tweet = re.sub(r"don't", "do not", tweet)
    tweet = re.sub(r"i'll", "i will", tweet)
    tweet = re.sub(r"i'd", "i would", tweet)
    tweet = re.sub(r"you'd", "you would", tweet)
    tweet = re.sub(r"could've", "could have", tweet)
    tweet = re.sub("\d+", ' num ', tweet)
    tweet = re.sub(r'[^a-z?!]', ' ', tweet)
    tweet = ' '.join([word for word in tweet.split() if word not in stopwords.words('english')])
    tweet = ' '.join([WordNetLemmatizer().lemmatize(word) for word in tweet.split()])
    tweet = ' '.join([PorterStemmer().stem(word) for word in tweet.split()])
    return tweet

df['question_text'] = df['question_text'].apply(clean_text)

df.sample(frac=1., random_state=42).to_csv('processedr.csv', index=False)

I think the vocab should be built using the train and valid datasets combined.
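
For reference, Field.build_vocab accepts multiple datasets, so against the snippet above the change would look like this (the other arguments stay as they were):

TEXT.build_vocab(train, val, min_freq=5, vectors='glove.twitter.27B.50d')
LABEL.build_vocab(train)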

It worked, thanks for the help!

Actually, that fixed the error, but it didn’t fix the real problem.

The real problem is in the embedding layer: there are words in the valid dataset that have no embedding vector, because I didn’t include the valid dataset’s vocab.

You are right that building the vocab from train and valid together makes it work, but when I predict a piece of text that contains a word that has no embedding vector, I will hit the same problem again. So the complete solution is:

pad_token = TEXT.vocab.stoi[TEXT.pad_token]  # index of '<pad>' (1)
unk_token = TEXT.vocab.stoi[TEXT.unk_token]  # index of '<unk>' (0)
model.Embedding.weight.data[pad_token] = torch.zeros(embedding_dim)  # zero vector used as the embedding of every pad token
model.Embedding.weight.data[unk_token] = torch.zeros(embedding_dim)  # zero vector used as the embedding of every unknown word

This code sets the vectors for pad_token and unk_token to zero weights, so both padding and out-of-vocabulary words map to a zero embedding.
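
At prediction time the same idea carries through. Here is a hypothetical predict helper (a sketch, assuming the TEXT field and the trained model from the posts above; the .get fallback makes the lookup safe even in torchtext versions where stoi raises KeyError for unseen words):

def predict(model, sentence):
    # tokenize the raw text the same way the TEXT field does
    tokens = TEXT.preprocess(sentence)
    # look tokens up with an explicit fallback to the <unk> index,
    # whose embedding vector is now all zeros
    unk = TEXT.vocab.stoi[TEXT.unk_token]
    indices = [TEXT.vocab.stoi.get(t, unk) for t in tokens]
    tensor = torch.LongTensor(indices).unsqueeze(0)  # shape (1, seq_len), since batch_first=True
    length = torch.LongTensor([len(indices)])
    model.eval()
    with torch.no_grad():
        return model(tensor, length).item()  # sigmoid output: probability of the positive class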