How to train a model?

I’m using PyTorch to train a model as a time-series predictor.

My model looks like this:

import torch
import torch.nn as nn
import torch.nn.init as init

class AttentionModel(nn.Module):
    """Self-attention regressor that maps a fixed-length sequence to one scalar.

    The pipeline is: per-step linear projection -> multi-head self-attention
    -> flatten all time steps -> linear head producing a single value.

    No input or output normalization is applied inside the model; callers are
    responsible for scaling features and targets before training.

    Args:
        input_size: Number of features per time step.
        hidden_size: Embedding width used by the projection and the attention
            layer. Must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        seq_len: Number of time steps in each input sequence. The output head
            is sized as ``hidden_size * seq_len``, so inputs must have exactly
            this many steps. Defaults to 3.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        num_heads: int,
        seq_len: int = 3,
    ) -> None:
        super().__init__()
        self.seq_len = seq_len

        # Per-step projection into the attention embedding space. No activation
        # follows this layer in forward(), hence the 'linear' gain.
        self.linear = nn.Linear(input_size, hidden_size)
        init.kaiming_normal_(self.linear.weight, nonlinearity='linear')
        init.constant_(self.linear.bias, 0)

        self.attention = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        # Custom initialization of the attention layer: Xavier-uniform for
        # every weight matrix; 1-D parameters (biases) keep their defaults.
        for p in self.attention.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # The head consumes the attention output of every time step at once.
        self.fc = nn.Linear(hidden_size * seq_len, 1)
        init.xavier_normal_(self.fc.weight)
        init.constant_(self.fc.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict one value per sequence.

        Args:
            x: Input of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # Map the input features to the hidden dimension.
        hidden_states = self.linear(x)
        # Self-attention: query, key and value are all the projected sequence.
        attn_output, _ = self.attention(hidden_states, hidden_states, hidden_states)
        # Concatenate the outputs of all time steps into one vector per sample.
        flt = attn_output.flatten(1)
        # Fully connected output layer.
        output = self.fc(flt)
        return output

My data looks like this:
tensor([ 4.6767e+04, 2.7280e+04, 4.6868e+04, 4.6706e+04, 4.6800e+04,
1.3182e+08, 6.3223e+07, 1.3514e+03, 2.8179e+03, 4.6779e+04,
4.6784e+04, -4.0732e+01, -3.6797e+01, -4.2207e+01, 4.7963e+01,
-8.5589e+00, 4.7957e+01, -8.5687e+00, -4.2211e+01, -3.6795e+01,
-3.8831e-03, 6.8489e-03, 1.0001e+02, 1.0732e-02, -7.0962e-02,
9.9963e+01, 9.9975e+01, 4.3305e-01, -1.4060e+00, 2.3138e-01,
1.1266e+00, 1.0921e+00, -1.4012e+00, -1.3839e+00, -1.3821e+00,
-1.3997e+00, 6.9584e-01, 7.2648e-01, -1.3307e+00, -1.3038e+00,
-1.2189e+00, -1.3154e+00, -9.6482e-01, -1.3155e+00, -9.6549e-01,
-1.2189e+00, -1.3041e+00, -1.3487e+00, -1.3384e+00, 1.4074e+00,
4.1653e-01, -1.2966e+00, -1.3047e+00, -1.2857e+00]),

My label looks like this:

tensor([46810.2383]))

I want to use three of these data samples to predict the label. But when I train my model, the loss is always very large. When I plot the model's predictions against the real labels, it looks like this:

test

Traning Loss: 156495051487.5493, Prediction Loss: 289177297578.6667

Can you give me some advice about training?