LSTM - minimal example issue

Hi folks,
After reading some tutorials, I put together a minimal example that aims to do binary classification of an input sequence:

import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Seq(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, output_size):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers                
        # batch_first=False (the default), so the LSTM expects input of shape
        # (seq_len, batch, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=n_layers, bidirectional=False)

        self.linear = nn.Linear(hidden_size, output_size)
                
    def init_hidden(self, n_layers, batch_size, hidden_size):
        # fresh zero-initialized (h_0, c_0), each of shape (n_layers, batch, hidden_size)
        return (torch.zeros(n_layers, batch_size, hidden_size),
                torch.zeros(n_layers, batch_size, hidden_size))
    
    def forward(self, x):        
        # input should be (sequence, batch, number_of_features)
        x = x.permute(1, 0, 2)
        mini_batch = x.size(1)
        self.hidden = self.init_hidden(self.n_layers, mini_batch, self.hidden_size)
        out, self.hidden = self.lstm(x, self.hidden)        
        out = self.linear(out[-1])        # classify from the last time step only
        out = F.log_softmax(out, dim=1)
        return out

model = LSTM_Seq(input_size=2, hidden_size=100, n_layers=1, output_size=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
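
# Quick sanity check: a dummy batch of 4 sequences (length 500, 2 features)
# should come out as log-probabilities of shape (4, 2).
dummy = torch.rand(4, 500, 2)
print(model(dummy).shape)   # torch.Size([4, 2])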

# %% 
torch.manual_seed(123)

# 10,000 samples, sequence length = 500, number of features = 2
x_train = torch.rand(10000, 500, 2).float()
y_train = (torch.rand(10000) > 0.4).long()
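
# Sanity check: thresholding uniform noise at 0.4 gives roughly 60% ones, 40% zeros
print(y_train.float().mean())   # ~0.60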

batch_size = 32
train_dataset = torch.utils.data.TensorDataset(x_train, y_train) 
train_dataloader = torch.utils.data.DataLoader(train_dataset, 
                                               batch_size=batch_size, shuffle=True) # create your dataloader

total_train_batch = len(train_dataloader)
# %%  
for epoch in range(1):
    running_loss = 0.0
    
    iter_train_dataset = iter(train_dataloader)    
    for k in range(total_train_batch):    
        seqs, labels = next(iter_train_dataset)
        
        optimizer.zero_grad()        
        outputs = model(seqs)
        loss = F.nll_loss(outputs, labels)
        loss.backward()
        optimizer.step()
    
        running_loss += loss.item()
        if k % 50 == 0:    # print every 50 mini-batches
            print('[%d, %3d] loss: %.3f' %
                      (epoch + 1, k + 1, running_loss / 50))
            running_loss = 0.0

However, the model does not seem to be learning; it prints:

[1,   1] loss: 0.014
[1,  51] loss: 0.673
[1, 101] loss: 0.668
[1, 151] loss: 0.665
[1, 201] loss: 0.677
[1, 251] loss: 0.674
[1, 301] loss: 0.680
  1. I verified that the loss has a grad_fn, so the loss should decrease at least to some extent (see the first sketch below).

  2. How do the hidden states work here, and how can I modify my code so that they get transferred from time step to time step, as smth and Danya suggested in another forum thread? (See the second sketch below for my guess.)
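
For point 1, this is roughly how I checked that the loss is attached to the graph and that gradients actually reach the LSTM weights (a rough sketch reusing model, optimizer and train_dataloader from above):

# Rough gradient check (sketch), run on a single batch
seqs, labels = next(iter(train_dataloader))
optimizer.zero_grad()
loss = F.nll_loss(model(seqs), labels)
print(loss.grad_fn)                            # not None -> loss is part of the graph
loss.backward()
for name, p in model.named_parameters():
    print(name, p.grad.abs().sum().item())     # non-zero -> gradients reach this parameter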
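
For point 2, is the idea something like the sketch below? Here the hidden state is stored on the module, re-initialized only for the first batch (or when the batch size changes), and otherwise detached and carried over to the next batch. The detach() call is my assumption for avoiding backprop through previous batches; I am not sure this is what smth and Danya meant.

# Sketch of a "stateful" forward (assumes self.hidden = None is set in __init__)
def forward(self, x):
    x = x.permute(1, 0, 2)                     # (seq_len, batch, n_features)
    mini_batch = x.size(1)
    if self.hidden is None or self.hidden[0].size(1) != mini_batch:
        # first batch, or batch size changed: start from zeros
        self.hidden = self.init_hidden(self.n_layers, mini_batch, self.hidden_size)
    else:
        # carry the state over from the previous batch, but cut the autograd graph
        self.hidden = tuple(h.detach() for h in self.hidden)
    out, self.hidden = self.lstm(x, self.hidden)
    out = self.linear(out[-1])                 # last time step only
    return F.log_softmax(out, dim=1)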