I’m trying to build an LSTM just to test it out. I used data with an equal distribution of consecutive alphabet letters starting with ‘A’ , ‘B’ ‘C’ and ‘D’.

So 25% of the data started with ‘ABC…’, 25% started with ‘BCD…’, 25% started with ‘CDE…’, and 25% started with ‘DEF…’.

All of the strings were of length 10 as well.

This was the model architecture I used:

(AA_LIST is a list of the letters of the alphabet)

```
class LSTM(nn.Module):
def __init__(self, device, inputSz = len(AA_LIST) + 2, hiddenSz = 32, out = len(AA_LIST) + 2, batchSz = 1):
super().__init__()
self.hiddenSz = hiddenSz
self.embedA = nn.Embedding(inputSz, hiddenSz)
self.lstm = nn.LSTM(input_size = hiddenSz, hidden_size = hiddenSz, bidirectional = False)
self.linear = nn.Linear(hiddenSz, out)
self.softmax = nn.Softmax(dim = 1)
self.hid = (
torch.zeros(1, batchSz, hiddenSz).to(device),
torch.zeros(1, batchSz, hiddenSz).to(device)
)
def forward(self, seq, hidden):
out = self.embedA(seq)
out, hidden = self.lstm(out.view(out.shape[1], out.shape[0], -1), hidden)
out = self.linear(out)
return self.softmax(out[out.shape[0]-1]), hidden
```

This was how I turned each sequence into data for the LSTM:

All the letters of the alphabet have a corresponding index in the AA_LIST.

“ABCDEFG” was turned into [0, 1], [0, 1, 2], [0, 1, 2, 3] … until the end of the sequence. This was done for all of the sequences.

I used 0 as the “start” token and 27 as the “end” token.

This was the DataLoader code:

(interData is a function that turns each sequence into data using the above method)

```
from torch.utils.data import Dataset, DataLoader
class AntibodyData(Dataset):
def __init__(self, data, windowSize: int, device: torch.device):
if (windowSize == -1): self.data = data
else: self.data = windowData(data, windowSize, device)
def __len__(self):
return len(self.data)
def __getitem__(self, idx: int):
return self.data[idx]
dataloader = DataLoader(
dataset=AntibodyData(data=interData(h1data[0:datalen-valSz], cpu), windowSize=-1, device=device),
batch_size=batchSize,
shuffle=True,
drop_last=True,
num_workers = 8,
)
valloader = DataLoader(
dataset=AntibodyData(data=interData(h1data[datalen-valSz:datalen], cpu), windowSize=-1, device=device),
batch_size=batchSize,
drop_last=True,
num_workers = 8
)
```

This was my training method:

```
def train(dev: torch.device, lossList: list, dataloader = dataloader):
model.zero_grad()
loss = 0
hidden = model.hid
bcount = 0
batchloss = 0
for batch in dataloader:
seq, tar = batch
seq = seq.to(dev)
tar = tar.to(dev)
output, hidden = model(seq, hidden)
batchloss = nn.CrossEntropyLoss()(output, tar[0])
loss += batchloss
bcount += 1
loss.backward()
optimizer.step()
lossList.append(loss.item() / bcount)
```

This was my main loop for each epoch:

```
import time
import math
from torch.optim.lr_scheduler import ReduceLROnPlateau
epochs = 200
epochLoss = 0
trainLosses = []
valLosses = []
torch.autograd.set_detect_anomaly(True)
saveEvery = 3
scheduler = ReduceLROnPlateau(optimizer = optimizer, mode = "min", patience = 2)
for epoch in range(1, epochs + 1):
start = time.time()
print("STARTING EPOCH", epoch)
train(device, trainLosses)
valLoss(device, valLosses)
end = time.time()
mins = math.floor((end - start) / 60)
sec = round((end - start) - 60*mins, 1)
print(f"Epoch {epoch}| Time: {mins}m {sec}s | Loss: {trainLosses[-1]}")
print(f"Validation Loss: {valLosses[-1]}")
print(f"Learning rate: {optimizer.param_groups[0]['lr']}\n")
scheduler.step(valLosses[-1])
if epoch % saveEvery == 0:
torch.save(model.state_dict(), f'./ABmodels/{round(trainLosses[-1], 3)}_loss.pth')
epochLoss = 0
```

The top 5 output probabilities after giving the start token of zero were the following:

```
"B" "C" "end" "D" "A"
(2, 0.84), (3, 0.16), (27, 0.00035), (4, 8.006e-05), (1, 4.5e-05)]
```

Can anyone explain the discrepancy between the model’s output probabilities and the distribution of the dataset? Is something in my pipeline wrong?