LSTM model output doesn't match input data distribution

I’m trying to build an LSTM just to test it out. I used data made of runs of consecutive alphabet letters, with an equal split of sequences starting with ‘A’, ‘B’, ‘C’, and ‘D’.

So 25% of the data started with ‘ABC…’, 25% with ‘BCD…’, 25% with ‘CDE…’, and 25% with ‘DEF…’. All of the strings were 10 letters long.
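To be concrete, the raw strings look like what this little sketch produces (illustrative only, not my actual generation code; n_per_start is just a placeholder count):

import string

ALPHABET = string.ascii_uppercase  # 'A'..'Z'

def make_toy_data(n_per_start: int = 250, length: int = 10):
    # equal numbers of length-10 runs starting at 'A', 'B', 'C' and 'D'
    data = []
    for start in "ABCD":
        i = ALPHABET.index(start)
        data.extend([ALPHABET[i:i + length]] * n_per_start)
    return data

# make_toy_data()[0] == 'ABCDEFGHIJ'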
This was the model architecture I used:
(AA_LIST is a list of the letters of the alphabet)

import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, device, inputSz = len(AA_LIST) + 2, hiddenSz = 32, out = len(AA_LIST) + 2, batchSz = 1):
        super().__init__()
        self.hiddenSz = hiddenSz
        self.embedA = nn.Embedding(inputSz, hiddenSz)

        self.lstm = nn.LSTM(input_size = hiddenSz, hidden_size = hiddenSz, bidirectional = False)
        self.linear = nn.Linear(hiddenSz, out)
        self.softmax = nn.Softmax(dim = 1)

        # initial (h0, c0) state, reused as the starting hidden state
        self.hid = (
            torch.zeros(1, batchSz, hiddenSz).to(device),
            torch.zeros(1, batchSz, hiddenSz).to(device)
        )

    def forward(self, seq, hidden):
        out = self.embedA(seq)
        out, hidden = self.lstm(out.view(out.shape[1], out.shape[0], -1), hidden)
        out = self.linear(out)
        # return the softmax over the last timestep's logits, plus the new hidden state
        return self.softmax(out[out.shape[0] - 1]), hidden
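
(The setup around the model isn’t shown above; it’s along these lines — the optimizer type and learning rate here are just illustrative placeholders:)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM(device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # placeholder optimizer / lr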

This was how I turned each sequence into data for the LSTM:
Every letter of the alphabet has a corresponding index in AA_LIST, so the letters map to indices 1–26; I used 0 as the “start” token and 27 as the “end” token.
“ABCDEFG” was turned into the windows [0, 1], [0, 1, 2], [0, 1, 2, 3], and so on up to the end of the sequence, and the same was done for every sequence.
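To make that concrete, the encoding step is roughly equivalent to this sketch (not my exact interData; I’m assuming the last index of each window is the prediction target and the rest is the input, which matches how seq/tar are used in the training loop below):

START, END = 0, 27  # special tokens; letters occupy indices 1–26

def encode(seq: str):
    # illustrative stand-in for interData applied to a single string
    idxs = [START] + [AA_LIST.index(ch) + 1 for ch in seq] + [END]
    pairs = []
    for i in range(2, len(idxs) + 1):
        window = idxs[:i]
        # input = everything but the last index, target = the last index
        pairs.append((torch.tensor(window[:-1]), torch.tensor(window[-1:])))
    return pairs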
This was the DataLoader code:
(interData is a function that turns each sequence into data using the above method)

from torch.utils.data import Dataset, DataLoader

class AntibodyData(Dataset):
    def __init__(self, data, windowSize: int, device: torch.device):
        # windowSize == -1 means the data has already been converted, so use it as-is
        if windowSize == -1:
            self.data = data
        else:
            self.data = windowData(data, windowSize, device)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx: int):
        return self.data[idx]

dataloader = DataLoader(
    dataset=AntibodyData(data=interData(h1data[0:datalen-valSz], cpu), windowSize=-1, device=device),
    batch_size=batchSize,
    shuffle=True,
    drop_last=True,
    num_workers = 8,
)
valloader = DataLoader(
    dataset=AntibodyData(data=interData(h1data[datalen-valSz:datalen], cpu), windowSize=-1, device=device), 
    batch_size=batchSize,
    drop_last=True,
    num_workers = 8
)

This was my training method:

def train(dev: torch.device, lossList: list, dataloader = dataloader):
    model.zero_grad()
    loss = 0
    hidden = model.hid
    bcount = 0
    batchloss = 0
    for batch in dataloader:
        seq, tar = batch
        seq = seq.to(dev)
        tar = tar.to(dev)
        output, hidden = model(seq, hidden)
        batchloss = nn.CrossEntropyLoss()(output, tar[0])
        # accumulate the loss across every batch in the epoch
        loss += batchloss
        bcount += 1

    # one backward pass and one optimizer step per epoch, on the summed loss
    loss.backward()
    optimizer.step()
    lossList.append(loss.item() / bcount)

This was my main loop for each epoch:

import time
import math
from torch.optim.lr_scheduler import ReduceLROnPlateau

epochs = 200
epochLoss = 0
trainLosses = []
valLosses = []
torch.autograd.set_detect_anomaly(True)
saveEvery = 3
scheduler = ReduceLROnPlateau(optimizer = optimizer, mode = "min", patience = 2)

for epoch in range(1, epochs + 1):
    start = time.time()
    print("STARTING EPOCH", epoch)
    train(device, trainLosses)
    valLoss(device, valLosses)

    end = time.time()
    mins = math.floor((end - start) / 60)
    sec = round((end - start) - 60*mins, 1)

    print(f"Epoch {epoch}| Time: {mins}m {sec}s | Loss: {trainLosses[-1]}")
    print(f"Validation Loss: {valLosses[-1]}")
    print(f"Learning rate: {optimizer.param_groups[0]['lr']}\n")

    scheduler.step(valLosses[-1])
    
    if epoch % saveEvery == 0:
        torch.save(model.state_dict(), f'./ABmodels/{round(trainLosses[-1], 3)}_loss.pth')
    epochLoss = 0
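
(valLoss isn’t shown above — it’s just the validation counterpart of train, computing the same loss over valloader without any gradient updates. Roughly:)

def valLoss(dev: torch.device, lossList: list, dataloader = valloader):
    # rough sketch, not the exact code: average the per-batch loss on the validation set
    loss = 0
    hidden = model.hid
    bcount = 0
    with torch.no_grad():
        for batch in dataloader:
            seq, tar = batch
            output, hidden = model(seq.to(dev), hidden)
            loss += nn.CrossEntropyLoss()(output, tar.to(dev)[0])
            bcount += 1
    lossList.append(loss.item() / bcount)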

The top 5 output probabilities after giving the start token of zero were the following:

“B” (index 2): 0.84
“C” (index 3): 0.16
“end” (index 27): 0.00035
“D” (index 4): 8.006e-05
“A” (index 1): 4.5e-05
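
For reference, the query looked roughly like this (simplified; batch size 1, using the model’s stored initial hidden state):

model.eval()
with torch.no_grad():
    start = torch.tensor([[0]]).to(device)   # a batch of one, containing only the start token
    probs, _ = model(start, model.hid)       # forward() already applies the softmax
    top = torch.topk(probs.squeeze(), 5)
    print(list(zip(top.indices.tolist(), top.values.tolist())))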

Can anyone explain the discrepancy between the model’s output probabilities and the distribution of the dataset? Is something in my pipeline wrong?