I created a Transformer with zero decoder layers. Based on the source code, if there are zero decoder layers, the output should just be the decoder input `tgt`. This holds until I start optimizing the model.
The second print outputs False, which doesn't make sense to me. Let me know if I am doing something wrong or if it's a bug.
import torch
from torch import nn
import torch.optim as optim
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Number of random samples drawn per training step.
batch = 2048
class MyModel(nn.Module):
    """Wrap an ``nn.Transformer`` with one encoder layer and no decoder layers.

    The decoder target is a constant all-zero tensor of shape
    ``(1, batch, 1)``, exposed as ``self.tgt``.
    """

    def __init__(self):
        super().__init__()
        # Register the constant target as a buffer so that ``model.to(...)``
        # moves it together with the parameters instead of leaving it pinned
        # to whatever device it was created on. ``persistent=False`` keeps it
        # out of ``state_dict()``, so the saved keys are unchanged.
        self.register_buffer('tgt', torch.zeros(1, batch, 1), persistent=False)
        self.trans = nn.Transformer(
            nhead=1,
            num_encoder_layers=1,
            d_model=1,
            dim_feedforward=1,
            num_decoder_layers=0,
            dropout=0,
        )

    def forward(self, x):
        """Run the transformer on ``x`` with the fixed zero target.

        Args:
            x: Source tensor passed to ``nn.Transformer`` as ``src``.

        Returns:
            The output of ``self.trans(x, self.tgt)``.
        """
        return self.trans(x, self.tgt)
# Build the model, move it to the target device and set up the optimizer.
model = MyModel()
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()  # built once; no need to re-create it every step
running_loss = 0.0
upper = 10000000
for epoch in range(2):
    # Get batch of random numbers in [0, 1), reshaped to (1, batch, 1).
    training = torch.randint(low=0, high=upper, size=(batch,)).float().to(device) / upper
    training = torch.unsqueeze(training, 1)
    training = torch.unsqueeze(training, 0)

    # Diagnostic: does the model output equal the decoder target?
    # NOTE: even with num_decoder_layers=0, nn.Transformer applies its final
    # decoder LayerNorm to tgt. With d_model=1 the normalized value is always
    # 0, so the output equals that LayerNorm's bias: zero (== tgt) right after
    # initialization, non-zero once the optimizer has updated it. This is why
    # the check prints True on the first epoch and False afterwards.
    with torch.no_grad():  # the check does not need an autograd graph
        print(torch.all(model(training) == model.tgt))

    # Learn to add 1. `labels` is derived from `training`, so it is already on
    # `device`; the original bare `labels.to(device)` discarded its result
    # (Tensor.to is not in-place) and has been dropped.
    labels = training + 1

    optimizer.zero_grad()
    outputs = model(training)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    print(f'[{epoch + 1}] loss: {running_loss:.3f}')
    running_loss = 0.0
print('Finished Training')