Why is my eval accuracy so low?

I’m running a simple feed-forward neural network on a dataset of 128-dimensional features across 50 classes. The features are sentence embeddings I’ve extracted from a BERT model, which on its own already reaches 0.73 accuracy on this task.
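
For context, the extraction step is along these lines. This is only a minimal sketch assuming the Hugging Face transformers API, bert-base-uncased, and mean pooling over non-padding tokens; the reduction from 768 to 128 dimensions isn't shown, and the exact pooling may differ from what I actually used:

import torch
from transformers import AutoTokenizer, AutoModel

# Minimal sketch of the embedding extraction (assumed: bert-base-uncased and
# mean pooling over non-padding tokens; the 768 -> 128 reduction is not shown).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased").eval()

def embed(sentences):
    enc = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = bert(**enc).last_hidden_state              # (batch, seq_len, 768)
    mask = enc["attention_mask"].unsqueeze(-1).float()      # zero out padding positions
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)     # mean-pooled sentence vectors

sentence_vectors = embed(["an example sentence", "another example"])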

During training, the loss decreases consistently and accuracy on the training set ends up at 0.40 or higher, depending on the hyperparameters.

But accuracy on the dev set is always extremely low, usually below 0.02, which is at or below random-guessing level (chance on 50 classes is 1/50 = 0.02).

Things I’ve tried:

  • Learning rates of [0.1, 0.01, 0.001, 0.0005, 0.0001, 0.00001]
  • Adding extra layers to the network (a representative variant is sketched after this list)
  • Using sigmoid and tanh instead of ReLU
  • Removing the dropout layer
  • Adding a batch norm layer
  • SGD instead of Adam
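
To make the architecture-side items concrete, the variants looked roughly like the class below. This is only a representative sketch (the name NeuralNetVariant is just for illustration; hidden sizes and which pieces were combined varied between runs), not the exact code of any single run:

import torch.nn as nn

# Representative sketch of the architecture variants tried above: an extra
# hidden layer, a batch norm layer, and tanh (or sigmoid) instead of ReLU.
class NeuralNetVariant(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(128, 80)
        self.bn1 = nn.BatchNorm1d(80)        # "adding a batch norm layer"
        self.hidden2 = nn.Linear(80, 80)     # "adding extra layers"
        self.output = nn.Linear(80, 50)
        self.act = nn.Tanh()                 # also swapped for nn.Sigmoid() / nn.ReLU()

    def forward(self, x):
        x = self.act(self.bn1(self.hidden1(x)))
        x = self.act(self.hidden2(x))
        return self.output(x)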

Code:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class NeuralNet(nn.Module):
    def __init__(self):
        super().__init__()
        # 128-dim sentence embeddings -> 80 hidden units -> 50 class logits
        self.hidden1 = nn.Linear(128, 80)
        self.output = nn.Linear(80, 50)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.hidden1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output(x)   # raw logits; CrossEntropyLoss applies log-softmax internally
        return x

nnet = NeuralNet()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(nnet.parameters(), lr=0.001)

nnet.double().to(device)   # model weights in float64, so the feature tensors must be double as well

# DataLoaders for the training set (shuffled) and the dev set (sequential)
train_data = TensorDataset(train_features, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(test_features, test_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)


epochs = 100

for epoch in range(1, epochs + 1):
    nnet.train()
    epoch_loss = 0
    epoch_acc = 0
    for batch in train_dataloader:
        features = batch[0].to(device)
        labels = batch[1].to(device)
        optimizer.zero_grad()

        outputs = nnet(features)
        vals, inds = torch.max(outputs, dim=1)   # predicted class per example

        loss = loss_function(outputs, labels)
        epoch_loss += loss.item()                # .item() so the computation graph isn't kept around

        acc = torch.eq(inds, labels).sum().item() / labels.shape[0]
        epoch_acc += acc

        loss.backward()
        optimizer.step()

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    avg_epoch_acc = epoch_acc / len(train_dataloader)
    print(f"Epoch: [{epoch}/{epochs}], loss: {avg_epoch_loss:.4f}, acc: {avg_epoch_acc:.4f}")


nnet.eval()
total_acc = 0
for batch in validation_dataloader:
    features = batch[0].to(device)
    labels = batch[1].to(device)

    with torch.no_grad():
        outputs = nnet(features)
        vals, inds = torch.max(outputs, dim=1)
        loss = loss_function(outputs, labels)
        print(f"Loss: {loss.item()}")

    acc = torch.eq(inds, labels).sum().item() / labels.shape[0]
    total_acc += acc

avg_acc = total_acc / len(validation_dataloader)   # mean of per-batch accuracies
print(f"Acc: {avg_acc}")