I am trying this on the IMDB dataset; here are my results as a Kaggle notebook.
I am building a many-to-one LSTM model and using the bert-base-uncased tokenizer. The weird thing is that I can overfit a single batch, yet the model does not learn on the full dataset. Can anyone hint at what is wrong here?
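For context, the single-batch sanity check I ran looks roughly like this (a minimal sketch using model, criterion, optimizer, train_dataloader, and device as defined below; the step count is arbitrary):

inputs, labels = next(iter(train_dataloader))  # one fixed batch
inputs, labels = inputs.to(device), labels.to(device)
for step in range(200):
    optimizer.zero_grad()
    output = model(inputs)
    loss = criterion(output, labels.float())
    loss.backward()
    optimizer.step()
    if step % 50 == 0:
        # the loss drives toward ~0, so the model can memorize one batch
        print(f"step {step}: loss {loss.item():.4f}")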
Here is the dataset:
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')
class ImdbDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        review = self.df.iloc[idx]["review"]
        label = self.df.iloc[idx]["sentiment"]
        tokens = self.tokenizer(review, padding="max_length", add_special_tokens=False, truncation=True, max_length=256, return_tensors="pt")
        label = torch.tensor(1 if label == "positive" else 0)
        return tokens["input_ids"][0], label
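The train_dataloader and test_dataloader used in the training loop come from this dataset; a minimal sketch, assuming a pandas train/test split and a batch size of 32 (neither is shown in the notebook):

train_dataset = ImdbDataset(train_df, tokenizer)  # train_df/test_df: assumed DataFrame splits
test_dataset = ImdbDataset(test_df, tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32)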
Here is the model:
class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=0.2,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.embedding(x)   # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        x, _ = self.rnn(x)      # (batch, seq_len, hidden_dim)
        x = x[:, -1, :]         # last timestep (many-to-one)
        x = self.dropout(x)
        x = self.fc(x)
        x = x.squeeze(1)        # (batch, 1) -> (batch,); squeeze(1) avoids collapsing a batch of size 1
        x = torch.sigmoid(x)
        return x

model = Model(vocab_size=len(tokenizer), embedding_dim=400, hidden_dim=128, output_dim=1, n_layers=2)
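As a quick shape check I move the model to the device and run a dummy batch through it (a sketch; the batch size of 4 is arbitrary):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
dummy = torch.randint(0, len(tokenizer), (4, 256), device=device)  # (batch, seq_len) of token ids
print(model(dummy).shape)  # torch.Size([4]): one probability per review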
And here is the training loop, if that helps:
epochs = 8
clip = 5  # max gradient norm for clip_grad_norm_ below; caps exploding RNN gradients
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': [],
}
for epoch in range(epochs):
    train_loss = 0
    train_acc = 0
    model.train()
    for i, (inputs, labels) in enumerate(train_dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels.float())
        train_loss += loss.item()
        train_acc += torch.sum(torch.round(output) == labels).item()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        if i % 100 == 0:
            print(f"Train Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}")
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader.dataset)

    val_loss = 0
    val_acc = 0
    model.eval()
    with torch.no_grad():  # no gradients needed during evaluation
        for i, (inputs, labels) in enumerate(test_dataloader):
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)
            loss = criterion(output, labels.float())
            val_loss += loss.item()
            val_acc += torch.sum(torch.round(output) == labels).item()
            if i % 100 == 0:
                print(f"Valid Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}")
    val_loss /= len(test_dataloader)
    val_acc /= len(test_dataloader.dataset)

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
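Afterwards I plot the curves from history to compare train and validation behavior (a sketch; matplotlib assumed):

import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(history['train_loss'], label='train')
ax1.plot(history['val_loss'], label='val')
ax1.set_title('loss')
ax1.legend()
ax2.plot(history['train_acc'], label='train')
ax2.plot(history['val_acc'], label='val')
ax2.set_title('accuracy')
ax2.legend()
plt.show()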
Thanks for reading.