Bug when optimizing an nn.Transformer with zero decoder layers

I created an `nn.Transformer` with zero decoder layers (`num_decoder_layers=0`). Based on the source code, with zero decoder layers the output should simply be the decoder input `tgt`. This holds until I start optimizing the model.

The second print outputs `False`, which doesn’t make sense to me. Please let me know whether I am doing something wrong or whether this is a bug.

import torch
from torch import nn
import torch.optim as optim
# Run on the first CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device('cuda', 0) if torch.cuda.is_available() else torch.device('cpu')

class MyModel(nn.Module):
    """Wrap an ``nn.Transformer`` that has one encoder layer and no decoder layers.

    The decoder input ``tgt`` is a fixed all-zero tensor of shape
    ``(1, batch, 1)``. The class reads the module-level names ``batch``
    (batch size) and ``device`` (target device) when it is instantiated, so
    both must be defined before ``MyModel()`` is called.

    NOTE(review): ``nn.Transformer`` appears to attach a final ``LayerNorm`` to
    its decoder even when ``num_decoder_layers=0``. If so, the output is
    ``LayerNorm(tgt)`` rather than ``tgt`` itself, and it will drift away from
    ``tgt`` as soon as that norm's affine parameters are trained. Confirm
    against the installed PyTorch version.
    """

    def __init__(self):
        # nn.Module.__init__ must run before any submodule is assigned;
        # without it, ``self.trans = ...`` raises AttributeError.
        super().__init__()
        # Registered as a non-persistent buffer so that ``model.to(...)`` moves
        # it together with the parameters while the state_dict stays unchanged.
        self.register_buffer(
            "tgt",
            torch.zeros(1, batch, 1, device=device),
            persistent=False,
        )
        # Keep the transformer on the same device as ``tgt``; otherwise the
        # forward pass fails with a device mismatch when CUDA is in use.
        self.trans = nn.Transformer(
            d_model=1,
            nhead=1,
            num_encoder_layers=1,
            num_decoder_layers=0,
            dim_feedforward=1,
            dropout=0,
        ).to(device)

    def forward(self, x):
        """Run ``x`` through the transformer with the fixed zero target.

        Args:
            x: Source tensor passed as ``src`` to ``nn.Transformer``; its batch
                dimension has to match ``self.tgt`` (assumed shape
                ``(seq, batch, 1)`` — TODO confirm with callers).

        Returns:
            The tensor returned by ``self.trans(x, self.tgt)``.
        """
        return self.trans(x, self.tgt)

# Batch size read by MyModel (via the module-level name ``batch``) and by the
# data generation below. The original snippet never defined it, which raised
# NameError; the concrete value is arbitrary.
batch = 16
upper = 10000000

# Move the whole model to ``device`` so its weights live where the inputs do.
model = MyModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()  # built once instead of on every iteration
running_loss = 0.0

for epoch in range(2):
    # Batch of random numbers in [0, 1), created directly on ``device`` with
    # the (1, batch, 1) layout the model expects.
    training = torch.randint(low=0, high=upper, size=(1, batch, 1), device=device).float() / upper

    # Learn to add 1. ``labels`` inherits the device of ``training``, so no
    # explicit transfer is needed (the old ``labels.to(device)`` discarded its
    # result and had no effect).
    labels = training + 1

    # Full optimization step: the original loop only evaluated the loss and
    # never updated any parameter.
    optimizer.zero_grad()
    outputs = model(training)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    print(f'[{epoch + 1}] loss: {running_loss:.3f}')
    running_loss = 0.0
print('Finished Training')