I’m running a very simple feed-forward neural network on a dataset of 128-dimensional features across 50 classes. The features are sentence embeddings extracted from a BERT model with which I’m already reaching 0.73 accuracy.
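For context, the embeddings come from something like the following (a simplified sketch assuming the Hugging Face transformers API; mean pooling is shown just as an example, and the reduction from BERT's 768 dimensions down to my 128 is omitted):

```python
# Simplified sketch of the embedding extraction (mean pooling shown as an
# example; the projection from 768 down to 128 dimensions is omitted).
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased").eval()

def embed(sentences):
    enc = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        out = bert(**enc)
    # Mean-pool the token states, ignoring padding positions.
    mask = enc["attention_mask"].unsqueeze(-1)
    return (out.last_hidden_state * mask).sum(1) / mask.sum(1)
```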
During training, the loss decreases consistently and training-set accuracy usually reaches at least 0.40, depending on the hyperparameters. But accuracy on the dev set is always extremely low, in most cases below 0.02, which is worse than random guessing (chance level for 50 classes is 1/50 = 0.02).
Things I’ve tried:

- Learning rates of 0.1, 0.01, 0.001, 0.0005, 0.0001, and 0.00001 (swept with a loop like the sketch after this list)
- Adding extra layers to the network
- Using sigmoid and tanh instead of ReLU
- Removing the dropout layer
- Adding a batch-norm layer
- SGD instead of Adam
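The learning-rate sweep was just a loop over the values above (a sketch; `train_and_eval` is a hypothetical helper wrapping the training and evaluation code below):

```python
# train_and_eval is a hypothetical wrapper around the training and
# evaluation loops shown below; it returns the final dev-set accuracy.
for lr in [0.1, 0.01, 0.001, 0.0005, 0.0001, 0.00001]:
    dev_acc = train_and_eval(lr=lr)
    print(f"lr={lr}: dev acc = {dev_acc:.3f}")
```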
Code (model and optimizer setup):

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class NeuralNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(128, 80)
        self.output = nn.Linear(80, 50)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.hidden1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output(x)  # raw logits; CrossEntropyLoss applies log-softmax itself
        return x

nnet = NeuralNet()
nnet.double().to(device)  # my features are float64, so the model is cast to double

loss_function = nn.CrossEntropyLoss()
# The optimizer is created after the model is cast and moved to the device.
optimizer = optim.Adam(nnet.parameters(), lr=0.001)
```
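The feature and label tensors come out of the BERT pipeline above; for a self-contained run you can substitute synthetic stand-ins like these (sizes are illustrative):

```python
# Synthetic stand-ins for the real tensors (sizes are illustrative):
# double-precision 128-dim features, integer class labels in [0, 50).
train_features = torch.randn(5000, 128, dtype=torch.float64)
train_labels = torch.randint(0, 50, (5000,))
test_features = torch.randn(1000, 128, dtype=torch.float64)
test_labels = torch.randint(0, 50, (1000,))
```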
Data loaders:

```python
# Create the DataLoaders for the training and validation sets.
train_data = TensorDataset(train_features, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(test_features, test_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)
```
Training loop and evaluation:

```python
epochs = 100
for epoch in range(1, epochs + 1):
    nnet.train()
    epoch_loss = 0.0
    epoch_acc = 0.0
    for batch in train_dataloader:
        features = batch[0].to(device)
        labels = batch[1].to(device)

        optimizer.zero_grad()
        outputs = nnet(features)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so the autograd graph isn't kept
        # alive across batches.
        epoch_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        epoch_acc += torch.eq(preds, labels).sum().item() / labels.shape[0]

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    avg_epoch_acc = epoch_acc / len(train_dataloader)
    print(f"Epoch: [{epoch}/{epochs}], loss: {avg_epoch_loss:.4f}, acc: {avg_epoch_acc:.4f}")

# Evaluate on the dev set once training is done.
nnet.eval()
total_loss = 0.0
total_correct = 0
total_examples = 0
with torch.no_grad():
    for batch in validation_dataloader:
        features = batch[0].to(device)
        labels = batch[1].to(device)
        outputs = nnet(features)
        total_loss += loss_function(outputs, labels).item()
        _, preds = torch.max(outputs, dim=1)
        # Count correct predictions instead of averaging per-batch accuracies,
        # so a smaller final batch doesn't skew the average.
        total_correct += torch.eq(preds, labels).sum().item()
        total_examples += labels.shape[0]

avg_loss = total_loss / len(validation_dataloader)
avg_acc = total_correct / total_examples
print(f"Loss: {avg_loss:.4f}")
print(f"Acc: {avg_acc:.4f}")
```