RNN training loss doesn't improve until after 300 epochs or more!

I am working on a text classification problem with a binary output of 0 or 1. I am rather new to both NLP and PyTorch. I am trying to build a simple character-level RNN model, but the model doesn't even seem to overfit the training data.

My problem is that the accuracy, train loss, and test loss stay essentially the same across all epochs.
I tried increasing the number of epochs, and the results are strange: the loss only sometimes starts to improve after 300 epochs and then fluctuates, but mostly it just stays around the value it started at (it doesn't really improve; the numbers only fluctuate slightly). In the runs where the loss does decrease after 300 epochs, the test accuracy reaches about 90%.

I have tried adding an activation function, applying gradient clipping, and switching to a GRU or LSTM, but the same problem persists!
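Roughly, the gradient-clipping and GRU variants looked like this (just a sketch; the max_norm value here is only an example, not a tuned choice):

# Gradient clipping, applied between backward() and step() in the training loop:
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # example max_norm
optimizer.step()

# GRU as a drop-in replacement for the vanilla RNN inside TagsRNN.__init__:
self.rnn = nn.GRU(input_size, hidden_dim, n_layers, batch_first=True)

Here is the current model and training code: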

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class TagsRNN(nn.Module):
    """
    The RNN model that will be used to perform classification.

    I think seq_len should be 100 (number of characters in a sequence)
    """
    def __init__(self, input_size, hidden_dim, n_layers, output_size):    
        """
        Initialize the model by setting up the layers
        """
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.input_size = input_size

        # Embedding layer (currently unused) and RNN layer
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, x):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        h0 = torch.zeros(self.n_layers, x.size(0), self.hidden_dim).to(device)

        out, _ = self.rnn(x, h0)
        # out: (batch_size, seq_len, hidden_dim)
        out = out[:, -1, :]  # take the last time step (to follow the many-to-one architecture)
        # out: (batch_size, hidden_dim)
        out = self.fc(out)
        # out: (batch_size, output_size)
        out = self.sigmoid(out)
        return out

sequence_length = 100  # number of characters per example (see seq_len comments in the training loop)
input_size = len(all_letters) + 2  # +2 for padding and unknown
n_layers = 5
hidden_size = 100
output_nodes = 1

modelRNN = TagsRNN(input_size, hidden_size, n_layers, output_nodes).to(device)
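# Quick shape sanity check (illustrative only): a dummy batch of 4 sequences of
# length 100 with random character indices should produce an output of shape (4, 1).
dummy = torch.randint(0, input_size, (4, 100))
dummy = F.one_hot(dummy, num_classes=input_size).float().to(device)
print(modelRNN(dummy).shape)  # expected: torch.Size([4, 1])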

def train(train_loader, val_loader, model, criterion, optimizer, n_epochs):
    all_train_losses = []
    all_val_losses = []

    for epoch in range(n_epochs):
        model.train()
        train_losses = []
        val_losses = []
        for i, (x, labels) in enumerate(train_loader):  
            # shape of x: (batch_size, seq_len) => (N, 100)
            # resize it to (batch_size, seq_len, input_size) => (N, 100, 65)
            x = F.one_hot(x, num_classes=input_size).float().to(device)
            labels = labels.float().to(device)

            # Forward pass
            outputs = model(x)
            loss = criterion(outputs.squeeze(), labels)
        
            # backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        if (epoch+1) % 10 == 0:
            print('Epoch [{}/{}], Train Loss: {:.4f}'
                  .format(epoch+1, n_epochs, np.mean(train_losses)), end=' ')
        
        # validate
        model.eval()
        with torch.no_grad():
            for i, (x, labels) in enumerate(val_loader):  
                # shape of x: (batch_size, seq_len) => (N, 100)
                # resize it to (batch_size, seq_len, input_size) => (N, 100, 65)
                x = F.one_hot(x, num_classes=input_size).float().to(device)
                labels = labels.float().to(device)

                # Forward pass
                outputs = model(x)
                loss = criterion(outputs.squeeze(), labels)
                
                val_losses.append(loss.item())
        if (epoch+1) % 10 == 0:
            print(', Val Loss: {:.4f}'.format(np.mean(val_losses)))
        all_val_losses.append(np.mean(val_losses))
        all_train_losses.append(np.mean(train_losses))
    return all_train_losses, all_val_losses
        
    
batch_size = 64
learning_rate = 0.001
num_epochs = 500

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(modelRNN.parameters(), lr=learning_rate)

all_train_losses, all_val_losses = train(train_loader, val_loader, modelRNN, criterion, optimizer, num_epochs)

Hey,

Try a lower learning rate to start with.
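Something like this (just a sketch; the exact value and the optional scheduler settings are arbitrary starting points, not a known fix):

learning_rate = 1e-4  # e.g. 10x lower than before
optimizer = torch.optim.Adam(modelRNN.parameters(), lr=learning_rate)

# Optionally, drop the LR further when the validation loss plateaus:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       factor=0.5, patience=10)
# then call scheduler.step(np.mean(val_losses)) once per epoch after validation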