Transformer cannot predict next number in linear sequence

I am experimenting with a transformer to see whether it can predict the next number in a linear sequence. For example, given the training sequence (1, 2, 3, 4, …, 100), I expect 101 as the output. However, the loss does not decrease as I train the model, and I cannot figure out why. Does anyone know if I am doing something fundamentally wrong here?
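To make the setup concrete, this is what a single training pair is supposed to look like (a small illustration, not part of the script below):

import torch

sequence = torch.arange(1.0, 101.0)  # tensor([1., 2., ..., 100.])
target = torch.tensor([101.0])       # the number that should follow the sequence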

My transformer has an embedding dimension of 1, a single attention head, and 10 encoder layers.
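As far as I understand the docs, nn.Transformer with d_model=1 treats every number in the sequence as its own 1-dimensional embedding, with inputs laid out as (seq_len, batch, d_model). This little shape check (my own sketch, separate from the script) behaves as I expect:

import torch
from torch import nn

trans = nn.Transformer(d_model=1, nhead=1, num_encoder_layers=10)
src = torch.arange(1.0, 101.0).reshape(100, 1, 1)  # (seq_len, batch, d_model)
tgt = torch.rand(100, 1, 1)                        # decoder input, same layout
print(trans(src, tgt).shape)                       # torch.Size([100, 1, 1])

Here is my full training script: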

import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

import torch.optim as optim

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
now = datetime.now()
logdir = "runs/" + now.strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(logdir)


class MyModel(nn.Module):
    def __init__(self, seq_len):
        super().__init__()
        # Collapse the decoder output (one value per position) to a single prediction
        self.lin = nn.Linear(seq_len, 1)
        # Fixed random decoder input; a buffer so it moves to the right device with the model
        self.register_buffer('tgt', torch.rand(seq_len, 1))
        self.trans = nn.Transformer(nhead=1, num_encoder_layers=10, d_model=1)

    def forward(self, x):
        y = self.trans(x, self.tgt)   # (seq_len, 1)
        y = torch.transpose(y, 0, 1)  # (1, seq_len)
        out = self.lin(y)             # (1, 1)
        return out


model = MyModel(100)
model.to(device)
model.train()  # train mode, so dropout inside the transformer is active

optimizer = optim.Adam(model.parameters(), lr=0.01)

running_loss = 0.0

for epoch in range(4000):  # training loop
    idx = torch.randint(100, (1,)).item()  # random starting point in [0, 100)
    # 100 consecutive numbers starting at idx, shaped (seq_len, 1)
    training = torch.arange(idx, idx + 100, dtype=torch.float32).to(device)
    training = torch.unsqueeze(training, 1)
    labels = torch.tensor([idx + 100.0]).to(device)  # the number after the sequence
    optimizer.zero_grad()

    outputs = model(training)
    outputs = torch.transpose(outputs, 0, 1)
    loss = torch.norm(outputs - labels)  # L2 distance as a regression loss
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    if epoch % 50 == 0:
        print(f'[{epoch + 1}] loss: {running_loss / 50:.3f}')
        # .item() because add_scalar expects a Python number, not a tensor
        writer.add_scalar('training loss', running_loss / 50, epoch)
        writer.add_scalar('prediction', outputs.item(), epoch)
        writer.add_scalar('labels', labels.item(), epoch)

        running_loss = 0.0

print('Finished Training')
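For reference, this is roughly how I check the prediction once training finishes (a quick sanity check, separate from the loop above):

model.eval()
with torch.no_grad():
    seq = torch.arange(1.0, 101.0).unsqueeze(1).to(device)  # (100, 1)
    print(model(seq))  # I expect something close to 101 here, but never get it

The printed value never gets close to the target, which matches the flat loss curve I see in TensorBoard.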