I'm trying to train a model that classifies whether a tweet is talking about a disaster or not.
I ran into the following error, and I don't understand what it's telling me to fix:
Traceback (most recent call last):
  File "C:/Users/BHAAK/Desktop/ML_PATH/dirty-hands/dirty-hands file 3/Project.py", line 333, in <module>
    valid_loss, valid_acc = evaluate(model, val_set, criterion)
  File "C:/Users/BHAAK/Desktop/ML_PATH/dirty-hands/dirty-hands file 3/Project.py", line 307, in evaluate
    for batch in iterator:
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\iterator.py", line 156, in __iter__
    yield Batch(minibatch, self.dataset, self.device)
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\batch.py", line 34, in __init__
    setattr(self, name, field.process(batch, device=device))
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\field.py", line 237, in process
    tensor = self.numericalize(padded, device=device)
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\field.py", line 338, in numericalize
    arr = [self.vocab.stoi[x] for x in arr]
  File "C:\Users\BHAAK\AppData\Local\Programs\Python\Python36\lib\site-packages\torchtext\data\field.py", line 338, in <listcomp>
    arr = [self.vocab.stoi[x] for x in arr]
KeyError: 'get battlefield num scream mic cunt'
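What confuses me is that the key is a whole tweet, as if a tweet ended up in the label column. A quick sanity check I came up with is to inspect the CSV itself (this assumes the file has two columns, text then label, which may be wrong for my data):

import pandas as pd

# load the same file the TabularDataset below reads
df = pd.read_csv('processed_data.csv')
print(df.columns.tolist())
# any row whose second column is not 0/1 would mean a tweet leaked into the
# label column, e.g. because of unescaped commas or quotes in the text
bad_rows = df[~df.iloc[:, 1].astype(str).isin(['0', '1'])]
print(len(bad_rows), 'suspicious rows')
print(bad_rows.head())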
Here is the whole code:
import torchtext.data as data
import torch
from sklearn.metrics import accuracy_score
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
TEXT = data.Field(batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)
fields = [('text', TEXT), ('label', LABEL)]
train_data, valid_data = data.TabularDataset(path='processed_data.csv', format='csv', fields=fields, skip_header=True).split()
TEXT.build_vocab(train_data, min_freq=5, vectors='glove.twitter.27B.50d')
LABEL.build_vocab(train_data)
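# note: TEXT's vocab maps unseen words to <unk>, but LabelField defaults to
# unk_token=None, so a label value missing from this vocab raises a KeyError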
train_set, val_set = data.BucketIterator.splits((train_data, valid_data), batch_sizes=(32, 32), sort_within_batch=True,
                                                device='cpu', sort_key=lambda x: len(x.text))
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.Embedding = nn.Embedding(vocab_size, embedding_dim)
        self.LSTM = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                            dropout=dropout, batch_first=True)
        # hidden_dim * 2 because the LSTM is bidirectional
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.act = nn.Sigmoid()

    def forward(self, text, text_length):
        x = self.Embedding(text)
        # pack so the LSTM skips the padding positions
        x = pack_padded_sequence(x, text_length, batch_first=True)
        x, (state, cell) = self.LSTM(x)
        # concatenate the final forward and backward hidden states
        x = torch.cat((state[-2, :, :], state[-1, :, :]), dim=1)
        x = self.fc(x)
        return self.act(x)
size_of_vocab = len(TEXT.vocab)
embedding_dim = 50
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2
#instantiate the model
model = Net(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
            bidirectional=bidirection, dropout=dropout)
# Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.Embedding.weight.data.copy_(pretrained_embeddings)
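# (pretrained_embeddings has shape [len(TEXT.vocab), 50], matching embedding_dim)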
import torch.optim as optim
# define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()
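# note: BCELoss expects probabilities in [0, 1]; the model's final Sigmoid provides them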
def binary_accuracy(preds, y):
    # round predictions to the closest integer
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc
def train(model, iterator, optimizer, criterion):
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    # set the model in training phase
    model.train()
    for batch in iterator:
        # reset the gradients after every batch
        optimizer.zero_grad()
        # retrieve text and no. of words
        text, text_lengths = batch.text
        # convert to 1D tensor
        predictions = model(text, text_lengths).squeeze(1)
        # compute the loss
        loss = criterion(predictions, batch.label)
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)
        # backpropagate the loss and compute the gradients
        loss.backward()
        # update the weights
        optimizer.step()
        # loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0
    # deactivate dropout layers
    model.eval()
    # deactivate autograd
    with torch.no_grad():
        for batch in iterator:
            # retrieve text and no. of words
            text, text_lengths = batch.text
            # convert to 1D tensor
            predictions = model(text, text_lengths).squeeze(1)
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
best_valid_loss = float('inf')
for epoch in range(20):
    # train the model
    train_loss, train_acc = train(model, train_set, optimizer, criterion)
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, val_set, criterion)
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')
The error is raised from the evaluate() function.
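Since TEXT's vocab maps unseen words to <unk> while LabelField is built with unk_token=None, my guess is that stoi raises the KeyError when a value in the validation labels never appeared in the training labels (LABEL.vocab is built from train_data only). A quick check, reusing the variables from the script above:

# print every label value in the validation split that is missing from LABEL's vocab
val_labels = {example.label for example in valid_data}
print(val_labels - set(LABEL.vocab.stoi.keys()))
# a whole tweet showing up here would confirm that text leaked into the label column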
I'd be glad to hear any opinions.
Thanks!