RuntimeError: The size of tensor a (307) must match the size of tensor b (12) at non-singleton dimension 2

I have been working on changing how DSTGN (Dynamic Spatio-Temporal Graph Network with adaptive propagation mechanism) handles temporal dependencies. The original model uses dilated temporal convolution, and I tried to replace it with multi-head self-attention with a residual connection, taken from DSTAGNN (Dynamic Spatial-Temporal Aware Graph Neural Network). But when I try to train it on the PEMSD4 dataset, which has shape (16992, 307, 3), i.e. 16992 time steps for 307 nodes with 3 features per time step, I get the following error.

Namespace(device='cuda:0', dataset='PEMSD4', gcn_bool=True, addaptadj=True, seq_length=12, nhid=32, in_dim=1, num_nodes=307, batch_size=32, learning_rate=0.001, dropout=0.3, weight_decay=0.0, epochs=200, print_every=50, seed=1734828042, save='/kaggle/working/Methodology/results', expid=1, log_file='/kaggle/working/Methodology/results/pems4_log', embed_dim=10, rate=1, dropout_ingc=0.3, eta=1, gamma=0.01, order=0.1, moco=0.1, layers=3, column_wise=False, test_ratio=0.2, val_ratio=0.2, lag=12, horizon=12, tc_dropout=0.0, dilation_exponential=1, d_model=64, n_heads=8)
Load PEMSD4 Dataset shaped:  (16992, 307, 1) 919.0 0.0 211.7007794815878 180.0
Normalize the dataset by Standard Normalization
Train:  (10173, 12, 307, 1) (10173, 12, 307, 1)
Val:  (3375, 12, 307, 1) (3375, 12, 307, 1)
Test:  (3375, 12, 307, 1) (3375, 12, 307, 1)
/kaggle/working/Methodology/code/DSTGN/dataloader.py:81: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /usr/local/src/pytorch/torch/csrc/tensor/python_tensor.cpp:78.)
  X, Y = TensorFloat(X), TensorFloat(Y)
load dataset done
Namespace(device='cuda:0', dataset='PEMSD4', gcn_bool=True, addaptadj=True, seq_length=12, nhid=32, in_dim=1, num_nodes=307, batch_size=32, learning_rate=0.001, dropout=0.3, weight_decay=0.0, epochs=200, print_every=50, seed=1734828042, save='/kaggle/working/Methodology/results', expid=1, log_file='/kaggle/working/Methodology/results/pems4_log', embed_dim=10, rate=1, dropout_ingc=0.3, eta=1, gamma=0.01, order=0.1, moco=0.1, layers=3, column_wise=False, test_ratio=0.2, val_ratio=0.2, lag=12, horizon=12, tc_dropout=0.0, dilation_exponential=1, d_model=64, n_heads=8)
/opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:60: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.
  warnings.warn(
start training...
 x: torch.Size([32, 13, 307, 32])
self.pos_embed(pos): torch.Size([32, 32, 12, 64])
Traceback (most recent call last):
  File "/kaggle/working/Methodology/code/DSTGN/train_pems4.py", line 188, in <module>
    main()
  File "/kaggle/working/Methodology/code/DSTGN/train_pems4.py", line 99, in main
    metrics = engine.train(trainx, trainy, pred_time_embed=None, iter=iter)
  File "/kaggle/working/Methodology/code/DSTGN/engine.py", line 37, in train
    output, gl_loss, middle_pred, _ = self.model(input, pred_time_embed)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/Methodology/code/DSTGN/model_2.py", line 151, in forward
    x = self.temporal_embedding(x, x.size(0))
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/kaggle/working/Methodology/code/DSTGN/DSTAGNN_my.py", line 67, in forward
    embedding = x + self.pos_embed(pos)
RuntimeError: The size of tensor a (307) must match the size of tensor b (12) at non-singleton dimension 2
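
For reference, here is a minimal snippet that reproduces the failing broadcast using the shapes from the debug prints above (self.projection_layer maps the last dimension of x from 32 to 64 before the addition):

import torch

x = torch.randn(32, 13, 307, 64)   # x after self.projection_layer
pe = torch.randn(32, 32, 12, 64)   # self.pos_embed(pos)
out = x + pe  # RuntimeError: The size of tensor a (307) must match the size of tensor b (12) at non-singleton dimension 2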

This is the code for the MHSA (DSTAGNN_my.py):

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k, num_of_d):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k
        self.num_of_d = num_of_d

    def forward(self, Q, K, V, attn_mask, res_att):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.d_k)
        if res_att is not None:  # res_att defaults to None when no residual attention scores are passed in
            scores = scores + res_att
        if attn_mask is not None:
            scores.masked_fill_(attn_mask, -1e9)
        attn = F.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        return context, scores

class MultiHeadAttention(nn.Module):
    def __init__(self, DEVICE, d_model, d_k, d_v, n_heads, num_of_d):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.num_of_d = num_of_d
        self.DEVICE = DEVICE
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)

    def forward(self, input_Q, input_K, input_V, attn_mask, res_att=None):
        residual, batch_size = input_Q, input_Q.size(0)
        Q = self.W_Q(input_Q).view(batch_size, self.num_of_d, -1, self.n_heads, self.d_k).transpose(2, 3)
        K = self.W_K(input_K).view(batch_size, self.num_of_d, -1, self.n_heads, self.d_k).transpose(2, 3)
        V = self.W_V(input_V).view(batch_size, self.num_of_d, -1, self.n_heads, self.d_v).transpose(2, 3)

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)

        context, res_attn = ScaledDotProductAttention(self.d_k, self.num_of_d)(Q, K, V, attn_mask, res_att)
        context = context.transpose(2, 3).reshape(batch_size, self.num_of_d, -1, self.n_heads * self.d_v)
        output = self.fc(context)
        return nn.LayerNorm(self.d_model).to(self.DEVICE)(output + residual), res_attn

class Embedding(nn.Module):
    def __init__(self, nb_seq, d_Em, num_of_features, Etype):
        super(Embedding, self).__init__()
        self.nb_seq = nb_seq
        self.Etype = Etype
        self.num_of_features = num_of_features
        self.pos_embed = nn.Embedding(nb_seq, d_Em)
        self.norm = nn.LayerNorm(d_Em)
        self.projection_layer = nn.Linear(32, 64)  # hardcoded: residual_channels (32) -> d_model (64)
    def forward(self, x, batch_size):
        if self.Etype == 'T':
            pos = torch.arange(self.nb_seq, dtype=torch.long).to(x.device)
            # pos: (batch_size, num_of_features, nb_seq) = (32, 32, 12)
            pos = pos.unsqueeze(0).unsqueeze(0).expand(batch_size, self.num_of_features, self.nb_seq)
            print(f" x: {x.shape}")  # (32, 13, 307, 32): batch, padded seq length, nodes, channels
            print(f"self.pos_embed(pos): {self.pos_embed(pos).shape}")  # (32, 32, 12, 64)
            x = self.projection_layer(x)  # channels 32 -> 64, x is now (32, 13, 307, 64)
            # (32, 13, 307, 64) + (32, 32, 12, 64) cannot broadcast -> RuntimeError at dimension 2
            embedding = x + self.pos_embed(pos)

        Emx = self.norm(embedding)
        return Emx
            
        # else:
        #     pos = torch.arange(self.nb_seq, dtype=torch.long).to(x.device)
        #     pos = pos.unsqueeze(0).expand(batch_size, self.nb_seq)
        #     print(f"Shape pos: {pos.shape}")
        #     embedding = x + self.pos_embed(pos)
        

def make_model(DEVICE, num_of_d, nb_block, in_channels, K,
               nb_chev_filter, nb_time_filter, time_strides, adj_mx, adj_pa,
               adj_TMD, num_for_predict, len_input, num_of_vertices, d_model, d_k, d_v, n_heads):
    L_tilde = scaled_Laplacian(adj_mx)
    cheb_polynomials = [torch.from_numpy(i).type(torch.FloatTensor).to(DEVICE) for i in cheb_polynomial(L_tilde, K)]
    model = DSTAGNN_submodule(DEVICE, num_of_d, nb_block, in_channels,
                             K, nb_chev_filter, nb_time_filter, time_strides, cheb_polynomials,
                             adj_pa, adj_TMD, num_for_predict, len_input, num_of_vertices, d_model, d_k, d_v, n_heads)

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        else:
            nn.init.uniform_(p)

    return model

And this is the code for the model (model_2.py):

import torch
import torch.nn as nn
import torch.nn.functional as F
from graph_constuct_fgwn_2 import graph_constructor
from layer_mtgnn import *
import numpy as np
from DSTAGNN_my import MultiHeadAttention, ScaledDotProductAttention, Embedding

class dnconv(nn.Module):
    def __init__(self):
        super(dnconv, self).__init__()

    def forward(self, x, A):
        if len(A.size()) == 2:
            A = A.unsqueeze(0).repeat(x.shape[0], 1, 1)
        x = torch.einsum('nvw, ncwl->ncvl', [A, x])
        return x.contiguous()

class linear(nn.Module):
    def __init__(self, c_in, c_out):
        super(linear, self).__init__()
        self.mlp = torch.nn.Conv2d(c_in, c_out, kernel_size=(1, 1), padding=(0, 0), stride=(1, 1), bias=True)

    def forward(self, x):
        return self.mlp(x)

class gcn_modify(nn.Module):
    def __init__(self, c_in, c_out, dropout, support_len=3, order=2):
        super(gcn_modify, self).__init__()
        self.nconv = dnconv()
        c_in = (order * support_len + 1) * c_in
        self.mlp = linear(c_in, c_out)
        self.dropout = dropout
        self.order = order

    def forward(self, x, support):
        out = [x]
        x1 = self.nconv(x, support)
        out.append(x1)
        for k in range(2, self.order + 1):
            x2 = self.nconv(x1, support)
            out.append(x2)
            x1 = x2

        h = torch.cat(out, dim=1)
        h = self.mlp(h)
        h = F.dropout(h, self.dropout, training=self.training)
        return h

class gwnet(nn.Module):
    def __init__(self, device, num_nodes, dropout=0.3, gcn_bool=True, addaptadj=True, seq_length=12,
                 in_dim=1, out_dim=12, residual_channels=64, dilation_channels=64, skip_channels=64, end_channels=128,
                 layers=2, embed_dim=10, dropout_ingc=0.5, eta=1, gamma=0.001,
                 m=0.9, highway=False, batch_size=64, tc_dropout=0.1, n_heads=8, d_model=64):
        super(gwnet, self).__init__()
        self.dropout = dropout
        self.layers = layers
        self.gcn_bool = gcn_bool
        self.addaptadj = addaptadj

        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()
        self.residual_convs = nn.ModuleList()
        self.skip_convs = nn.ModuleList()
        self.bn = nn.ModuleList()
        self.gconv = nn.ModuleList()
        self.gconv_1 = nn.ModuleList()
        self.norm = nn.ModuleList()

        self.attention_layers = nn.ModuleList()
        self.d_model = d_model
        self.n_heads = n_heads

        self.start_conv = nn.Conv2d(in_channels=in_dim,
                                    out_channels=residual_channels,
                                    kernel_size=(1, 1))
        self.seq_length = seq_length
        self.receptive_field = layers * 4 + 1

        self.temporal_embedding = Embedding(seq_length, d_model, residual_channels, 'T')

        for i in range(layers):
            self.attention_layers.append(
                MultiHeadAttention(device, d_model, d_model // n_heads, d_model // n_heads, n_heads, num_nodes)
            )

            self.residual_convs.append(nn.Conv2d(in_channels=d_model,
                                                 out_channels=residual_channels,
                                                 kernel_size=(1, 1)))
            
            if self.seq_length > self.receptive_field:
                self.skip_convs.append(nn.Conv2d(in_channels=d_model,
                                                 out_channels=skip_channels,
                                                 kernel_size=(1, self.seq_length - self.receptive_field + 1)))
            else:
                self.skip_convs.append(nn.Conv2d(in_channels=d_model,
                                                 out_channels=skip_channels,
                                                 kernel_size=(1, 1)))

            if self.gcn_bool:
                self.gconv.append(gcn_modify(d_model, residual_channels, dropout, support_len=1, order=2))
                self.gconv_1.append(gcn_modify(d_model, residual_channels, dropout, support_len=1, order=2))

            if self.seq_length > self.receptive_field:
                self.norm.append(LayerNorm((residual_channels, num_nodes, self.seq_length - self.receptive_field + 1),
                                           elementwise_affine=True))
            else:
                self.norm.append(LayerNorm((residual_channels, num_nodes, 1),
                                           elementwise_affine=True))

        self.end_conv_1 = nn.Conv2d(in_channels=skip_channels,
                                    out_channels=end_channels,
                                    kernel_size=(1, 1),
                                    bias=True)

        self.end_conv_2 = nn.Conv2d(in_channels=end_channels,
                                    out_channels=out_dim,
                                    kernel_size=(1, 1),
                                    bias=True)

        self.skip0 = nn.Conv2d(in_channels=in_dim, out_channels=skip_channels, kernel_size=(1, 1), bias=True)
        self.skipE = nn.Conv2d(in_channels=residual_channels, out_channels=skip_channels, kernel_size=(1, 1), bias=True)

        self.idx = torch.arange(num_nodes).to(device)

        self.graph_construct = graph_constructor(num_nodes, embed_dim, device, seq_length, eta=eta, in_dim=in_dim,
                                                 gamma=gamma, dropout=dropout_ingc, m=m, batch_size=batch_size)

    def forward(self, input, pred_time_embed=None):
        in_len = input.size(3)
        if in_len < self.receptive_field:
            x = nn.functional.pad(input, [self.receptive_field - in_len, 0, 0, 0])
        else:
            x = input

        new_supports = None
        gl_loss = None
        dy_adj = None
        adj_norm = None
        if self.gcn_bool:
            adp, resolution_static, node_embed, gl_loss_from, dy_nodeEmbed, adj_norm = self.graph_construct(input)
            gl_loss = gl_loss_from
            new_supports = resolution_static
            dy_adj = adp

        skip = self.skip0(F.dropout(x, self.dropout, training=self.training))
        x = self.start_conv(x)

        # Apply temporal embedding
        x = x.permute(0, 3, 2, 1)  # [batch_size, time (padded to receptive_field=13), num_nodes, channels]
        x = self.temporal_embedding(x, x.size(0))
        x = x.permute(0, 3, 2, 1)  # [batch_size, channels, num_nodes, time]

        for i in range(self.layers):
            residual = x
            
            # Apply attention instead of dilated convolution
            x = x.permute(0, 3, 2, 1)  # [batch_size, seq_length, num_nodes, channels]
            x, _ = self.attention_layers[i](x, x, x, None)
            x = x.permute(0, 3, 2, 1)  # [batch_size, channels, num_nodes, seq_length]
            
            x = self.residual_convs[i](x)
            x = x + residual[:, :, :, -x.size(3):]
            x = self.norm[i](x, self.idx)
            
            s = self.skip_convs[i](x)
            skip = s + skip

            if self.gcn_bool:
                x_1 = self.gconv_1[i](x, dy_adj)
                x = x_1

        skip = self.skipE(x) + skip
        x = F.relu(skip)
        x = F.relu(self.end_conv_1(x))
        x = self.end_conv_2(x)
        return x, gl_loss, None, dy_adj

def make_model(DEVICE, num_of_d, nb_block, in_channels, K,
               nb_chev_filter, nb_time_filter, time_strides, adj_mx, adj_pa,
               adj_TMD, num_for_predict, len_input, num_of_vertices, d_model, d_k, d_v, n_heads):
    L_tilde = scaled_Laplacian(adj_mx)
    cheb_polynomials = [torch.from_numpy(i).type(torch.FloatTensor).to(DEVICE) for i in cheb_polynomial(L_tilde, K)]
    model = gwnet(DEVICE, num_of_vertices, dropout=0.3, gcn_bool=True, addaptadj=True,
                  in_dim=in_channels, out_dim=num_for_predict, residual_channels=nb_chev_filter,
                  dilation_channels=nb_chev_filter, skip_channels=nb_chev_filter * 8,
                  end_channels=nb_chev_filter * 16, layers=nb_block, seq_length=len_input,
                  embed_dim=10, dropout_ingc=0.5, eta=1, gamma=0.001, m=0.9,
                  batch_size=64, tc_dropout=0.1, n_heads=n_heads, d_model=d_model)

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
        else:
            nn.init.uniform_(p)

    return model

And this is the code for the training script (train_pems4.py):

import torch
import numpy as np
import argparse
import time
import configparser
from torch import nn

from util import *
from engine import trainer
from dataloader import get_dataloader
from metrics import All_Metrics

parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cuda:0', help='')
parser.add_argument('--dataset', type=str, default='PEMSD4', help='dataset')
parser.add_argument('--gcn_bool', type=bool, default=True, help='whether to add graph convolution layer')
parser.add_argument('--addaptadj', type=bool, default=True, help='whether add adaptive adj')
parser.add_argument('--seq_length', type=int, default=12, help='')
parser.add_argument('--nhid', type=int, default=32, help='')
parser.add_argument('--in_dim', type=int, default=1, help='inputs dimension')
parser.add_argument('--num_nodes', type=int, default=307, help='number of nodes')
parser.add_argument('--batch_size', type=int, default=32, help='batch size')
parser.add_argument('--learning_rate', type=float, default=0.001, help='learning rate')
parser.add_argument('--dropout', type=float, default=0.3, help='dropout rate')
parser.add_argument('--weight_decay', type=float, default=0., help='weight decay rate')
parser.add_argument('--epochs', type=int, default=200, help='')
parser.add_argument('--print_every', type=int, default=50, help='')
parser.add_argument('--seed', type=int, default=991, help='random seed')
parser.add_argument('--save', type=str, default='/kaggle/working/Methodology/results', help='save path')
parser.add_argument('--expid', type=int, default=1, help='experiment id')
parser.add_argument('--log_file', type=str, default='/kaggle/working/Methodology/results/pems4_log', help='log file')
parser.add_argument('--embed_dim', type=int, default=10, help='node dim')
parser.add_argument('--rate', type=int, default=1, help='')
parser.add_argument('--dropout_ingc', type=float, default=0.3, help='dynamic relation learning dropout')
parser.add_argument('--eta', type=float, default=1, help='node importance')
parser.add_argument('--gamma', type=float, default=0.01, help='graph sparsity')
parser.add_argument('--order', type=float, default=0.1, help='graph loss ratio in total loss')
parser.add_argument('--moco', type=float, default=0.1, help='middle guidance loss ratio')
parser.add_argument('--layers', type=int, default=3, help='number of layers')
parser.add_argument('--column_wise', type=bool, default=False)
parser.add_argument('--test_ratio', type=float, default=0.2)
parser.add_argument('--val_ratio', type=float, default=0.2)
parser.add_argument('--lag', type=int, default=12, help='input time windows length')
parser.add_argument('--horizon', type=int, default=12, help='predict window length')
parser.add_argument('--tc_dropout', type=float, default=0.,)
parser.add_argument('--dilation_exponential', type=int, default=1)
parser.add_argument('--d_model', type=int, default=64, help='dimension of model')
parser.add_argument('--n_heads', type=int, default=8, help='number of attention heads')

args = parser.parse_args()
log = open(args.log_file, 'w')

torch.set_num_threads(3)

def log_string(string, log=log):
    log.write(string + '\n')
    log.flush()
    print(string)

def main():
    args.seed = int(time.time())
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True

    device = torch.device(args.device)

    train_dataloader, val_dataloader, test_dataloader, scaler, test_realy = get_dataloader(args, 'std', single=False)
    print('load dataset done')

    log_string(str(args))

    engine = trainer(scaler, args.in_dim, args.seq_length, args.num_nodes, args.nhid, args.dropout,
                     args.learning_rate, args.weight_decay, device, args.gcn_bool, args.addaptadj,
                     args.embed_dim, args.dropout_ingc, args.eta, args.gamma, args.order, args.moco,
                     args.layers, args.batch_size, args.tc_dropout, args.dilation_exponential,
                     d_model=args.d_model, n_heads=args.n_heads)

    print("start training...", flush=True)
    his_loss = []
    val_time = []
    train_time = []

    for i in range(1, args.epochs + 1):
        train_loss = []
        train_mape = []
        train_rmse = []
        t1 = time.time()

        for iter, (x, y) in enumerate(train_dataloader):
            trainx = x[..., :1]
            trainx = trainx.transpose(1, 3)
            trainy = y[..., :1]
            trainy = trainy.transpose(1, 3)
            metrics = engine.train(trainx, trainy, pred_time_embed=None, iter=iter)
            train_loss.append(metrics[0])
            train_mape.append(metrics[1])
            train_rmse.append(metrics[2])
            if iter % args.print_every == 0:
                log = 'Iter: {:03d}, {:.4f} :Train Loss, {:.4f}:Train MAPE, {:.4f}: Train RMSE'
                log_string(log.format(iter, train_loss[-1], train_mape[-1], train_rmse[-1]))

        t2 = time.time()
        train_time.append(t2 - t1)

        valid_loss = []
        valid_mape = []
        valid_rmse = []

        s1 = time.time()
        for iter, (x, y) in enumerate(val_dataloader):
            trainx = x[..., :1]
            trainx = trainx.transpose(1, 3)
            trainy = y[..., :1]
            trainy = trainy.transpose(1, 3)
            metrics = engine.eval(trainx, trainy, pred_time_embed=None)
            valid_loss.append(metrics[0])
            valid_mape.append(metrics[1])
            valid_rmse.append(metrics[2])
        engine.scheduler.step(np.mean(valid_loss))

        s2 = time.time()
        val_time.append(s2 - s1)
        mtrain_loss = np.mean(train_loss)
        mtrain_mape = np.mean(train_mape)
        mtrain_rmse = np.mean(train_rmse)

        mvalid_loss = np.mean(valid_loss)
        mvalid_mape = np.mean(valid_mape)
        mvalid_rmse = np.mean(valid_rmse)
        his_loss.append(mvalid_loss)

        log = 'Epoch: {:03d}, Train Loss: {:.4f}, Train MAPE: {:.4f}, Train RMSE: {:.4f}, Valid Loss: {:.4f}, ' \
              'Valid MAPE: {:.4f}, Valid RMSE: {:.4f}, Training Time: {:.4f}/epoch'
        log_string(log.format(i, mtrain_loss, mtrain_mape, mtrain_rmse,
                              mvalid_loss, mvalid_mape, mvalid_rmse, (t2 - t1)))

        torch.save(engine.model.state_dict(),
                   args.save + "_epoch_" + str(i) + ".pth")

    log_string("Average Training Time: {:.4f} secs/epoch".format(np.mean(train_time)))
    log_string("Average Inference Time: {:.4f} secs".format(np.mean(val_time)))

    bestid = np.argmin(his_loss)
    engine.model.load_state_dict(torch.load(args.save + "_epoch_" + str(bestid + 1) + ".pth"))
    engine.model.eval()

    outputs = []
    realy = torch.Tensor(test_realy).to(device)

    for iter, (x, y) in enumerate(test_dataloader):
        testx = x[..., :1]
        testx = testx.transpose(1, 3)
        testx = nn.functional.pad(testx, (1, 0, 0, 0))
        with torch.no_grad():
            preds, _, _, _ = engine.model(testx)
        outputs.append(preds.squeeze())

    yhat = torch.cat(outputs, dim=0)
    yhat = yhat[:realy.size(0), ...]

    log_string("The valid loss on best model is {}".format(str(round(his_loss[bestid], 4))))

    amae = []
    amape = []
    armse = []
    for i in range(12):
        pred = scaler.inverse_transform(yhat[:, :, i])
        real = realy[:, :, i]
        metrics = All_Metrics(pred, real, None, 0.)
        log_string('Evaluate best model on test data for horizon {:d}, Test MAE: {:.4f}, Test MAPE: {:.4f}, Test RMSE: {:.4f}'.format(
            i + 1, metrics[0], metrics[2], metrics[1]))
        amae.append(metrics[0])
        amape.append(metrics[2])
        armse.append(metrics[1])

    log_string('On average over 12 horizons, Test MAE: {:.4f}, Test MAPE: {:.4f}, Test RMSE: {:.4f}'.format(
        np.mean(amae), np.mean(amape), np.mean(armse)))
    torch.save(engine.model.state_dict(),
               args.save + "_exp" + str(args.expid) + "_best_" + str(args.order) + '_' + str(args.seed) + ".pth")

if __name__ == "__main__":
    t1 = time.time()
    main()
    t2 = time.time()
    print("Total time spent: {:.4f}".format(t2 - t1))

Any advice on how to deal with this error would be much appreciated, and if any other details are needed, please let me know.

You need to transform your input into the format the model expects. Your debug prints show the mismatch: x entering the addition is [32, 13, 307, 64] after the projection layer, while self.pos_embed(pos) is [32, 32, 12, 64], so the two tensors cannot broadcast at dimension 2 (307 vs 12). Without knowing any more context about this architecture, a naive way is to use a series of fully-connected layers to embed the 307-dim node axis down to the 12 dims that the model expects.
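
A minimal sketch of that idea (the module and names here are hypothetical, assuming x has shape [batch, seq, 307, d_model] right before the addition, as your prints show):

import torch
import torch.nn as nn

class NodeDownProjection(nn.Module):
    """Hypothetical helper: squeeze the 307-node axis down to 12 with fully-connected layers."""
    def __init__(self, num_nodes=307, target_len=12, hidden=128):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(num_nodes, hidden),
            nn.ReLU(),
            nn.Linear(hidden, target_len),
        )

    def forward(self, x):
        # x: [batch, seq, num_nodes, d_model]
        x = x.transpose(2, 3)         # [batch, seq, d_model, num_nodes]
        x = self.proj(x)              # [batch, seq, d_model, target_len]
        return x.transpose(2, 3)      # [batch, seq, target_len, d_model]

x = torch.randn(32, 13, 307, 64)
print(NodeDownProjection()(x).shape)  # torch.Size([32, 13, 12, 64])

Note that dimension 1 will still disagree afterwards (13 from the receptive-field padding vs 32 from num_of_features in the expanded pos), so you will likely also need to align how pos is expanded, or construct the Embedding with the padded sequence length.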