Same model on scikit-learn does well but fails in PyTorch

Ignore the commented-out code.
I created a simple classification model as follows. The classes are 0 and 1 (binary classification).
This model gives a very poor result (58% accuracy), whereas the same binaryFeats from vectorToTrain.npy, trained with scikit-learn's MLPClassifier, gives a very good result (90% accuracy). I have tried changing every parameter. The weights and loss do change with each epoch, but the accuracy never goes above 58.xx%.
Can someone please help me find the error I'm making in this code?
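
For comparison, the scikit-learn side is roughly the following (a sketch; the MLPClassifier settings here are placeholders, not necessarily the exact ones I used). It uses the same column layout as the PyTorch code below: everything except the last two columns is a feature, and the last column is the 0/1 label.

from sklearn.neural_network import MLPClassifier
import numpy as np

binaryFeats = np.load('./vectorToTrain.npy')
testBinaryFeats = np.load('./Test/vectorToTest.npy')

# All but the last two columns are features; the last column is the label.
X_train, y_train = binaryFeats[:, :-2], binaryFeats[:, -1]
X_test, y_test = testBinaryFeats[:, :-2], testBinaryFeats[:, -1]

clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200)
clf.fit(X_train, y_train)
print("MLP accuracy:", clf.score(X_test, y_test))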

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(123)

class pathLSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, binary_dim, comb_dim, tagset_size):
        super(pathLSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.binary_dim = binary_dim
        self.embedding_dim = embedding_dim
        self.comb_dim = comb_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden = self.init_hidden()
        # The linear layer that maps from hidden state space to tag space
        self.hidden2Path = nn.Linear(hidden_dim, comb_dim)
        self.binary2Path = nn.Linear(binary_dim, comb_dim)

        self.comb2tag = nn.Linear(2*comb_dim, tagset_size)


        self.binary2L = nn.Linear(binary_dim, 100)
        self.L2Tag = nn.Linear(100, tagset_size)
        
        

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, lstm_embedding, feature_embedding):
        
        # lstm_out, self.hidden = self.lstm(lstm_embedding, self.hidden)
        
        # pathEmbedding = self.hidden2Path(self.hidden[0].view(1, self.hidden_dim))
        # binaryEmbedding = self.binary2Path(feature_embedding.view(1, self.binary_dim))

        # combinedEmbedding = torch.cat((pathEmbedding, binaryEmbedding), 1)

        # activate = F.relu(combinedEmbedding)

        # tag_space = self.comb2tag(activate)
        
        l_space = self.binary2L(feature_embedding.view(1, self.binary_dim))
        l_space = F.relu(l_space)
        tag_space = self.L2Tag(l_space)
        tag_scores = F.softmax(tag_space, dim=1)
        
        return tag_scores

# Train the model:

binaryFeats = np.load('./vectorToTrain.npy')
pathFeats = np.load('./pathsToTrain.npy')

testBinaryFeats = np.load('./Test/vectorToTest.npy')
testPathFeats = np.load('./Test/pathsToTest.npy')

EMBEDDING_DIM = 102
HIDDEN_DIM = 100
COMB_DIM = 400
BINARY_DIM = binaryFeats.shape[1] - 2

model = pathLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, BINARY_DIM, COMB_DIM, 2)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.00001, momentum=0.9, weight_decay=0.001)

for epoch in range(200):
    running_loss = 0.0
    for k in range(binaryFeats.shape[0]):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance

        if k%1 == 0:
            model.zero_grad()
            optimizer.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        # Step 3. Run our forward pass.
        path_in = torch.tensor(pathFeats[k], dtype=torch.float32)
        path_in = path_in.unsqueeze(1)
        binary_in = torch.tensor(binaryFeats[k][:-2], dtype=torch.float32)
        tag_scores = model(path_in, binary_in)
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        output = torch.tensor([binaryFeats[k][-1]], dtype=torch.long) 
        loss = loss_function(tag_scores, output)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if k % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, k + 1, running_loss / 2000))
            running_loss = 0.0       
    
    
    correct = 0
    with torch.no_grad():
        for k in range(testBinaryFeats.shape[0]):
            path_in = torch.tensor(testPathFeats[k], dtype=torch.float32)
            path_in = path_in.unsqueeze(1)
            binary_in = torch.tensor(testBinaryFeats[k][:-2], dtype=torch.float32)
            tag_scores = model(path_in, binary_in)
            _, pred = torch.max(tag_scores, 1)
            # print(tag_scores, pred)
            actual = torch.tensor([testBinaryFeats[k][-1]], dtype=torch.long)
            if pred == actual:
                correct += 1

    print("Accuracy: ", 100 * float(correct) / float(testBinaryFeats.shape[0]))

    # print model.binary2tag.weight

Your loss_function, CrossEntropyLoss, expects raw logits, but the end of your forward function applies F.softmax. You should either not apply the softmax and just return tag_space, or change it to a log_softmax and change your loss_function to NLLLoss.
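
For example, with the rest of your class unchanged (a sketch of the first option):

    def forward(self, lstm_embedding, feature_embedding):
        l_space = self.binary2L(feature_embedding.view(1, self.binary_dim))
        l_space = F.relu(l_space)
        tag_space = self.L2Tag(l_space)
        # Return raw logits: nn.CrossEntropyLoss applies log_softmax internally.
        return tag_space

Or, for the second option, return log-probabilities instead and swap the loss:

        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

# ...and, outside the class:
loss_function = nn.NLLLoss()

Either way your evaluation loop keeps working: torch.max picks the same index on logits or log-probabilities as it does on probabilities, since softmax is monotonic.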
