Time-Series Prediction Transformer

import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=15):
        super(PositionalEncoding, self).__init__()
        # Precompute the sinusoidal table once, shaped (max_len, 1, d_model)
        # so it broadcasts over the batch dimension of seq-first inputs
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model)
        return x + self.pe[:x.size(0), :]

class TransformerPredictor(nn.Module):
    def __init__(self, input_dim, output_dim, d_model, nhead, num_layers, dim_feedforward, dropout=0.1, max_seq_length=15, device='cuda'):
        super(TransformerPredictor, self).__init__()
        self.device = device
        self.d_model = d_model
        # Project the raw input features into the model dimension
        self.input_embedding = nn.Linear(input_dim, d_model).to(device)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length).to(device)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation='relu').to(device)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, enable_nested_tensor=False).to(device)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, activation='relu').to(device)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers).to(device)
        # Funnel the decoder output down to the target dimension
        self.fc_layers = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Linear(d_model // 2, d_model // 4),
            nn.ReLU(),
            nn.Linear(d_model // 4, d_model // 8),
            nn.ReLU(),
            nn.Linear(d_model // 8, d_model // 16),
            nn.ReLU(),
            nn.Linear(d_model // 16, output_dim)
        ).to(device)
        self._reset_parameters()

    def forward(self, src, tgt=None, max_len=15):
        # src: (batch, src_len, input_dim) -> (src_len, batch, input_dim)
        src = src.permute(1, 0, 2)
        src = self.input_embedding(src)
        src = self.pos_encoder(src)
        memory = self.transformer_encoder(src)

        if tgt is None:
            # Inference: decode autoregressively, feeding each decoder
            # hidden state back in as the next decoder input
            tgt = torch.zeros((max_len, src.size(1), self.d_model), device=self.device)
            tgt[0, :, :] = memory[-1, :, :]  # last encoder state as start-of-sequence token

            outputs = []
            for i in range(max_len):
                tgt_mask = self.generate_square_subsequent_mask(i + 1).to(self.device)
                tgt_emb = self.pos_encoder(tgt[:i + 1, :, :])
                output = self.transformer_decoder(tgt_emb, memory, tgt_mask=tgt_mask)
                output_for_next_step = output[-1, :, :]  # (batch, d_model)
                if i < max_len - 1:
                    tgt[i + 1, :, :] = output_for_next_step
                outputs.append(self.fc_layers(output_for_next_step))

            # (batch, max_len, output_dim), matching the teacher-forced branch
            return torch.stack(outputs, dim=1)
        else:
            # Training: teacher forcing. Shift the target right by one and
            # prepend the last encoder state as the start-of-sequence token.
            sos_token = memory[-1, :, :].unsqueeze(0)
            tgt = tgt.permute(1, 0, 2)
            tgt = self.input_embedding(tgt)
            tgt = self.pos_encoder(tgt)
            tgt = torch.cat([sos_token, tgt[:-1, :, :]], dim=0)
            tgt_mask = self.generate_square_subsequent_mask(tgt.size(0)).to(self.device)
            output = self.transformer_decoder(tgt, memory, tgt_mask=tgt_mask)
            final_output = self.fc_layers(output)
            return final_output.permute(1, 0, 2)

    def _reset_parameters(self):
        # Xavier-initialize every weight matrix (biases keep their defaults)
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def generate_square_subsequent_mask(self, sz):
        # Additive causal mask: position i may attend to positions <= i only
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
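
For reference, generate_square_subsequent_mask(3) returns the additive mask below; 0.0 leaves a position visible and -inf blocks attention to future steps:

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])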

I wrote this code for multi-step time-series prediction with a Transformer, but the model does not perform well.

The training loss is very low when the target is given (teacher forcing), but when the target is not given, the model predicts almost the same value at every step, or values that are completely wrong.
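
Here is a minimal example of how I call the model in both modes; all of the sizes and hyperparameters below are placeholders, chosen only so the shapes line up:

import torch

# Placeholder configuration, not my real hyperparameters
model = TransformerPredictor(input_dim=8, output_dim=1, d_model=64, nhead=4,
                             num_layers=2, dim_feedforward=128,
                             max_seq_length=15, device='cpu')

src = torch.randn(32, 15, 8)   # (batch, src_len, input_dim)
tgt = torch.randn(32, 15, 8)   # (batch, tgt_len, input_dim)

# Training mode: target given, decoder sees the ground truth (teacher forcing)
out_tf = model(src, tgt)       # -> (32, 15, 1)

# Inference mode: no target, decoder feeds its own hidden states back in
with torch.no_grad():
    out_ar = model(src, max_len=15)   # -> (32, 15, 1)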