Multiclass classification using PyTorch

I'm new to PyTorch. I'm doing sentiment analysis and want to classify reviews into four classes, but my code doesn't return the correct results. Could you help me find where the problem is?
Thanks.
The LSTM model:

import torch
import torch.nn as nn

class SentimentLSTM(nn.Module):
   def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.25):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentLSTM, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.25)
        
        # linear and softmax layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.Softmax(dim=1)
        
   def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)
        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        print("lstm ouuuuuuuut",lstm_out)
        
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # softmax function
        softmax_out = self.softmax(out)
        # reshape to be batch_size first
        softmax_out = softmax_out.view(batch_size, -1, self.output_size)
        softmax_out = softmax_out[:, -1] # take the output at the last time step
        # return the last softmax output and the hidden state
        return softmax_out, hidden
    
    
   def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
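As a side note on the head/loss pairing above: nn.NLLLoss expects log-probabilities, while nn.Softmax produces plain probabilities, and for sequence classification it is usually only the last time step of the LSTM output that gets classified. The code below is not the original model, just a minimal sketch of a 4-class variant illustrating that pattern (the class name SentimentLSTMHead is made up here):

import torch.nn as nn

class SentimentLSTMHead(nn.Module):
    """Sketch only: 4-class head using LogSoftmax and the last time step."""
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.25):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)  # log-probabilities, as nn.NLLLoss expects

    def forward(self, x, hidden=None):
        # hidden=None lets nn.LSTM default to zero initial states
        embeds = self.embedding(x.long())
        lstm_out, hidden = self.lstm(embeds, hidden)
        last_step = lstm_out[:, -1, :]           # (batch, hidden_dim): last time step only
        out = self.fc(self.dropout(last_step))   # (batch, output_size)
        return self.log_softmax(out), hidden

Alternatively, the LogSoftmax can be dropped entirely and the raw fc output passed to nn.CrossEntropyLoss, which combines LogSoftmax and NLLLoss internally.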

Training:

Parameters:

import numpy as np
import torch.optim as optim

vocab_size = len(vocab_to_int)+1
output_size = 4
embedding_dim = 40
hidden_dim = 25

n_layers = 5 

lr = 0.001  # learning rate
#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
optimizer = optim.SGD(net.parameters(), lr=lr)
# training params
epochs = 4 
counter = 0
print_every = 100
clip=5 
net.train()


# train for some number of epochs
for e in range(epochs):   
    # initialize hidden state
    h = net.init_hidden(batch_size)  # initialize the hidden state

    # batch loop
    for inputs, labels in train_loader:
        counter += 1
        
      
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)
    
        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.long())
        
        loss.backward()
        print(loss)
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs.cpu(), labels.cpu()
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.long())
                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


# test the model on the test data

test_losses = [] # track loss
num_correct = 0
# init hidden state
h = net.init_hidden(batch_size)
net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])
   # print("la valeur de h: \n",h)

    inputs, labels = inputs.cpu(), labels.cpu()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.long())
    test_losses.append(test_loss.item())
    #print(test_loss.item())
    
    # convert output probabilities to the predicted class (0-3)
    pred = torch.argmax(output,dim=1, keepdim=True)
    # compare predictions to true label
    correct_tensor = pred.eq(labels.long().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) 
    num_correct += np.sum(correct)

# -- Statistics -- #
# average test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))


from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])
    
    return test_ints
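pad_features is called in predict below but isn't included in the post. The version sketched here is an assumption about what it does (left-pad each tokenized review with zeros and truncate to seq_length), not necessarily the original:

import numpy as np

def pad_features(reviews_ints, seq_length):
    """Left-pad each tokenized review with 0 and truncate to seq_length (assumed behaviour)."""
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    for i, row in enumerate(reviews_ints):
        if len(row) > 0:
            features[i, -len(row):] = np.array(row)[:seq_length]
    return features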

def predict(net, test_review, sequence_length=200):
    
    net.eval()
    # tokenize review
    test_ints = tokenize_review(test_review)
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
  
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(np.asarray(features))
    batch_size = feature_tensor.size(0)
   

    # initialize hidden state
    h = net.init_hidden(batch_size)
        
    # get the output from the model
    #feature_tensor = feature_tensor.type(torch.LongTensor)
    output, h = net(feature_tensor, h)
    # convert output probabilities to the predicted class (0-3)
    #pred = torch.argmax(output.squeeze(), dim=1)
    pred = torch.argmax(output.squeeze())
    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    elif(pred.item()==0):
        print("Negative review detected.")
    elif(pred.item()==2):
        print("Conflict review detected.")
    else:
        print("Neutral review detected.")
                
   
test_review = "Very good quality and well made"
seq_length = 400
predict(net, test_review, seq_length)
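For readability, the if/elif chain in predict can also be replaced with a small lookup. The index-to-label mapping below just mirrors the prints above (0 = negative, 1 = positive, 2 = conflict, 3 = neutral); adjust it if the dataset uses a different encoding:

label_names = {0: "negative", 1: "positive", 2: "conflict", 3: "neutral"}

def class_to_label(pred_index):
    """Map a predicted class index (0-3) to its label name."""
    return label_names[pred_index]

print("{} review detected.".format(class_to_label(1)))  # -> positive review detected.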

Double post from here with potential answer.

Thank you so much for your answer.