I am working on the Amazon Fine Food Reviews dataset to build a multiclass LSTM, but I am getting this error:
ValueError: Expected input batch_size (10000) to match target batch_size (50).
I notice that 10000 = 50 × 200, i.e. batch_size × seq_length, so I suspect the model output is not being reduced to one prediction per review, but I can't see where to fix it.
Below is my complete code. Sorry for posting such a long snippet, but I need your help. It should be quite simple, yet I'm stuck.
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("D:/Dataset/amazon-fine-food-reviews/Reviews.csv")
def map_score(score):
    if score > 3:
        return "positive"
    elif score == 3:
        return "neutral"
    elif score < 3:
        return "negative"
df['sent_polarity'] = df['Score'].apply(lambda x:map_score(x))
score_map ={"neutral":1,"positive":2,"negative":3}
df['class_label'] = df['sent_polarity'].map(score_map)
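By the way, I am not sure whether the 1..3 labels are a problem: as far as I know nn.CrossEntropyLoss expects class indices in the range 0..C-1. A zero-based mapping would only be a small change (just a sketch, I have not switched to it yet):
# hypothetical zero-based mapping, in case CrossEntropyLoss needs 0..2
score_map_zero = {"negative": 0, "neutral": 1, "positive": 2}
df['class_label'] = df['sent_polarity'].map(score_map_zero)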
# count reviews per user (columns selected as a list to avoid the tuple-indexing deprecation)
tmp_group = df.groupby('UserId')[['ProductId', 'ProfileName', 'Time', 'Score', 'Text']].size().reset_index(name='counts')
print(f'Shape before deduplication: {df.shape}')
no_dupe = df.drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'], keep='first', inplace=False)
print(f'Shape after deduplication:\n{no_dupe.shape}')
data = no_dupe[no_dupe.HelpfulnessNumerator <= no_dupe.HelpfulnessDenominator]
print(f'Shape after removing false condition:\n{data.shape}')
data.sample(3)
import re
# strip HTML tags
def html_removal(sentence):
    pattern = re.compile('<.*?>')
    final_sentence = re.sub(pattern, ' ', sentence)
    return final_sentence
# keep letters only
def punctuation_removal(sentence):
    final_sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    return final_sentence
# drop any token that contains a digit
def number_removal(sentence):
    final_sentence = re.sub(r'\S*\d\S*', " ", sentence).strip()
    return final_sentence
# drop URLs
def url_remover(sentence):
    text = re.sub(r"http\S+", " ", sentence)
    final_sentence = re.sub(r"www\.\S+", " ", text)
    return final_sentence
# drop words with a character repeated 3 or more times (e.g. "soooo")
def patterns_removal(sentence):
    final_sentence = re.sub(r"\s*\b(?=\w*(\w)\1{2,})\w*\b", ' ', sentence)
    return final_sentence
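Just to show what these helpers do, a tiny sanity check on a made-up review (the sample string is only an illustration, not from the dataset):
sample = "I looove this <br/>product!!! 100% worth it, see www.example.com"
print(patterns_removal(url_remover(number_removal(punctuation_removal(html_removal(sample))))))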
# function to expand contractions/abbreviations and convert all words to lower case
def expand(string_of_n_length):
    string_of_n_length = str(string_of_n_length).lower()
    string_of_n_length = string_of_n_length.replace(",000,000", " m").replace(",000", " k").replace("′", "'").replace("’", "'")\
        .replace("won't", " will not").replace("cannot", " can not").replace("can't", " can not")\
        .replace("n't", " not")\
        .replace("what's", " what is")\
        .replace("it's", " it is")\
        .replace("'ve", " have")\
        .replace("'m", " am")\
        .replace("'re", " are")\
        .replace("he's", " he is")\
        .replace("she's", " she is")\
        .replace("'s", " own")\
        .replace("%", " percent ")\
        .replace("₹", " rupee ")\
        .replace("$", " dollar ")\
        .replace("€", " euro ")\
        .replace("'ll", " will")\
        .replace("how's", " how has")\
        .replace("y'all", " you all")\
        .replace("o'clock", " of the clock")\
        .replace("ne'er", " never")\
        .replace("let's", " let us")\
        .replace("finna", " fixing to")\
        .replace("gonna", " going to")\
        .replace("gimme", " give me")\
        .replace("gotta", " got to")\
        .replace("'d", " would")\
        .replace("daresn't", " dare not")\
        .replace("dasn't", " dare not")\
        .replace("e'er", " ever")\
        .replace("everyone's", " everyone is")\
        .replace("'cause", " because")
    string_of_n_length = re.sub(r"([0-9]+)000000", r"\1m", string_of_n_length)
    string_of_n_length = re.sub(r"([0-9]+)000", r"\1k", string_of_n_length)
    return string_of_n_length
from nltk.corpus import stopwords
#set of default_stopwords
default_stopwords = set(stopwords.words('english'))
# keep negation words out of the default stopword set ---> they are useful for sentiment analysis
excluded_stopwords = set(['against','not','don', "don't",'ain','aren',"aren't",'couldn',"couldn't",'didn', "didn't",'doesn',"doesn't",
                          'hadn',"hadn't", 'hasn', "hasn't",'haven',"haven't",'isn', "isn't",'mightn',"mightn't",'mustn',"mustn't",
                          'needn',"needn't",'shouldn', "shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"])
required_stopwords = default_stopwords - excluded_stopwords
#stemming
from nltk.stem import PorterStemmer # can be used
from nltk.stem.snowball import SnowballStemmer
sno_stm = SnowballStemmer(language='english')
# import these modules
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
# lemmatize() defaults to pos='n' (noun) when no pos tag is given
print("Tradition :", lemmatizer.lemmatize("Tradition"))
print("Traditional :", lemmatizer.lemmatize("Traditional"))
def preprocessing(sentence):
    proc_1 = html_removal(sentence)
    proc_2 = punctuation_removal(proc_1)
    proc_3 = number_removal(proc_2)
    proc_4 = url_remover(proc_3)
    proc_5 = patterns_removal(proc_4)
    proc_6 = expand(proc_5)
    proc_7 = ' '.join([lemmatizer.lemmatize(word) for word in proc_6.split() if word not in required_stopwords])
    return proc_7
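(Probably unrelated to the error, but I noticed that punctuation_removal runs before expand(), so the apostrophes are already gone and most of the contraction replacements can never match. If that matters, a reordered version could look like the sketch below; preprocessing_v2 is just my own name for it:)
def preprocessing_v2(sentence):
    # expand contractions while apostrophes are still present, then strip the rest
    proc = expand(html_removal(sentence))
    proc = url_remover(proc)
    proc = number_removal(proc)
    proc = punctuation_removal(proc)
    proc = patterns_removal(proc)
    return ' '.join([lemmatizer.lemmatize(word) for word in proc.split() if word not in required_stopwords])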
tmp_X = data['Text'].apply(preprocessing)
all_text = ' '.join(tmp_X)
all_words = all_text.split()
from collections import Counter
counts = Counter(all_words)
vocab = sorted(counts, key= counts.get,reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
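Quick sanity check on the vocabulary (since enumerate starts at 1, index 0 stays free for the padding value):
print("vocab size:", len(vocab_to_int))
print("most common tokens:", counts.most_common(5))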
def text_to_int(sentence):
    # note: returns a single-element list, hence the review[0] indexing later on
    reviews_ints = []
    reviews_ints.append([vocab_to_int[word] for word in sentence.split()])
    return reviews_ints
X_data = tmp_X.apply(text_to_int)
# outlier review stats
review_lens = Counter([len(x[0]) for x in X_data])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))
print('Number of reviews before removing outliers: ', len(X_data))
## remove any reviews/labels with zero length from the reviews_ints list.
# get indices of any reviews with length 0
non_zero_idx = [ii for ii, review in enumerate(X_data) if len(review[0]) != 0]
# remove 0-length reviews and their labels
reviews_ints = [X_data.iloc[ii] for ii in non_zero_idx]
encoded_labels = np.array([data['class_label'].iloc[ii] for ii in non_zero_idx])
print('Number of reviews after removing outliers: ', len(reviews_ints))
def pad_features(reviews_ints, seq_lengths):
    features = np.zeros((len(reviews_ints), seq_lengths), dtype=int)
    for i, row in enumerate(reviews_ints):
        features[i, -len(row[0]):] = np.array(row[0])[:seq_lengths]
    return features
# Test your implementation!
seq_length = 200
features = pad_features(reviews_ints, seq_lengths=seq_length)
## test statements - do not change - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."
# print the last 30 values of the first 30 reviews
print(features[:30,170:200])
split_frac =0.8
split_idx = int(len(features)*split_frac)
train_x,remaining_x = features[:split_idx],features[split_idx:]
train_y,remaining_y = encoded_labels[:split_idx],encoded_labels[split_idx:]
test_idx = int(len(remaining_x)*0.5)
val_x,test_x = remaining_x[:test_idx],remaining_x[test_idx:]
val_y,test_y = remaining_y[:test_idx],remaining_y[test_idx:]
## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
"\nValidation set: \t{}".format(val_x.shape),
"\nTest set: \t\t{}".format(test_x.shape))
import torch
from torch.utils.data import TensorDataset, DataLoader
train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x),torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x),torch.from_numpy(test_y))
batch_size=50
train_loader= DataLoader(train_data,shuffle=True,batch_size=batch_size)
valid_loader= DataLoader(valid_data,shuffle=True,batch_size=batch_size)
test_loader= DataLoader(test_data,shuffle=True,batch_size=batch_size)
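(Also, since init_hidden() is called with a fixed batch_size below, I assume an incomplete final batch could cause a similar size mismatch, so maybe the loaders should drop it; just a sketch:)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader  = DataLoader(test_data,  shuffle=True, batch_size=batch_size, drop_last=True)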
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')
import torch.nn as nn
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentRNN, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, lstm_hidden = self.lstm(embeds)   # note: the hidden argument is never passed to the LSTM here
        print("lstm_out: ", lstm_out.shape)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # print("lstm_out: ", lstm_out.shape)
        lstm_out = self.dropout(lstm_out)
        print("lstm_out: ", lstm_out.shape)
        lstm_out = self.fc(lstm_out)
        print("lstm_out: ", lstm_out.shape)
        return lstm_out, lstm_hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
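From the shapes printed above, fc is applied to a tensor of shape (batch_size * seq_length, hidden_dim), so the network returns 50 * 200 = 10000 rows while there are only 50 labels, which matches the error message. My guess at a fix (just a sketch replacing the forward method above, keeping the rest of the class unchanged; not sure it is the right approach) is to keep only the last time step so there is one prediction per review:
    def forward(self, x, hidden):
        embeds = self.embedding(x)
        lstm_out, lstm_hidden = self.lstm(embeds, hidden)   # actually use the passed-in hidden state
        lstm_out = lstm_out[:, -1, :]                       # last time step -> (batch_size, hidden_dim)
        out = self.fc(self.dropout(lstm_out))               # -> (batch_size, output_size)
        return out, lstm_hidden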
vocab_size = len(vocab)+1
output_size = 3
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentRNN(vocab_size,output_size,embedding_dim,hidden_dim,n_layers)
print(net)
lr = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=lr)
epochs = 4
counter =0
print_every=100
clip =5
if train_on_gpu:
    net.cuda()

net.train()
for e in range(epochs):
    h = net.init_hidden(batch_size)
    for inputs, labels in train_loader:
        counter += 1
        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()
        h = tuple([each.data for each in h])
        net.zero_grad()
        output, h = net(inputs, h)
        print("inputs: ", inputs.shape)
        print("output: ", output.shape)
        print("labels: ", labels.shape)
        loss = criterion(output, labels.float())   # the ValueError seems to be raised here
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        if counter % print_every == 0:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:
                val_h = tuple([each.data for each in val_h])
                if train_on_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())
                val_losses.append(val_loss.item())
            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))