TBPTT with a multivariate time series

Hello everyone,

I am trying to use TBPTT (truncated backpropagation through time) on a multivariate time series, and I am facing a problem: my loss doesn't decrease, and I don't know what I am doing wrong.

Inputs shape: (Batch_size, 1270, 6)
Output shape: (Batch_size, 1270)

There are a few particularities with the inputs:

  • The 6 features correspond to the pairs A-B, A-C, A-D, where A is the time step.

  • Two inputs (e.g. Inputs[0] and Inputs[1]) don't have the same length, so I padded all of them using torch.nn.utils.rnn.pad_sequence(Mise_en_donnees, padding_value=-1, batch_first=True). (I also tried padding_value=0, but it doesn't change anything.)

  • All inputs are normalized using get_mean_std (see the padding-aware sketch after the code):

    def get_mean_std(loader, ignore_idx=-1.):
        channels_sum, channels_squared_sum, num_batches = 0, 0, 0
        for data in loader:
            # a = number of non-padded time steps (based on the first feature column) minus 1
            a = torch.sum((data[:, 0] != ignore_idx)).item() - 1
            channels_sum += torch.mean(data[:a], dim=[0])
            channels_squared_sum += torch.mean(data[:a] ** 2, dim=[0])
            num_batches += 1
        mean = channels_sum / num_batches
        std = (channels_squared_sum / num_batches - mean ** 2) ** 0.5
        return mean, std
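
For reference, this is the padding-aware computation I was aiming for (just a rough sketch, assuming the whole padded tensor fits in memory and that -1 never appears as a real value in the first feature column):

def masked_mean_std(inputs, ignore_idx=-1.0):
    # inputs: (batch, time, features); padded time steps are filled with ignore_idx
    mask = inputs[..., 0] != ignore_idx   # (batch, time), True on real time steps
    valid = inputs[mask]                  # (n_real_steps, features)
    return valid.mean(dim=0), valid.std(dim=0)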


Here is my model:

# A classic conv block
class conv_block(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(conv_block, self).__init__()
        self.relu = nn.LeakyReLU()
        self.conv = nn.Conv1d(in_channels, out_channels, **kwargs)
        self.batchnorm = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.batchnorm(x)
        return self.relu(x)


class Test(nn.Module):
    def __init__(self, in_channels, num_layers, hidden_size, p, out_size):
        super(Test, self).__init__()

        self.CNN = nn.Sequential(
            # I am trying to apply filters on every pair of columns (A-B, A-C, A-D) using groups
            conv_block(in_channels, 3, kernel_size=2, stride=1, padding=1, groups=3),  #, padding_mode="reflect"
            conv_block(3, 32, kernel_size=2, stride=1, padding=0),
            #SqueezeExcitation(32, 16),  # I tried this but got the same results
            conv_block(32, 16, kernel_size=3, stride=1, padding=1),
            conv_block(16, 8, kernel_size=3, stride=1, padding=1),
        )

        self.rnn = nn.LSTM(8, hidden_size, num_layers)
        self.rnn1 = nn.LSTM(hidden_size, hidden_size, num_layers)
        #self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)  # in case of using bidirectional
        #self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)
        self.num_layers = num_layers
        self.fc_f = nn.Linear(out_size * hidden_size, out_size)

    def forward(self, x, hidden, cell):
        x = x.permute(0, 2, 1)           # (batch, features, time) for Conv1d
        x = self.CNN(x)
        x = x.permute(2, 0, 1)           # (time, batch, channels) for the LSTM
        x, (hidden, cell) = self.rnn(x)  # I tried bidirectional but got the same results
        #hidden = self.dropout(self.fc_hidden(torch.cat((hidden[0:self.num_layers], hidden[self.num_layers:2*self.num_layers]), dim=2)))
        #cell = self.dropout(self.fc_cell(torch.cat((cell[0:self.num_layers], cell[self.num_layers:2*self.num_layers]), dim=2)))
        x, (hidden, cell) = self.rnn1(x, (hidden, cell))
        #hidden = hidden.repeat(2, 1, 1)
        #cell = cell.repeat(2, 1, 1)
        x = x.permute(1, 0, 2)           # back to (batch, time, hidden_size)
        x = x.reshape(x.shape[0], -1)    # flatten to (batch, time * hidden_size)
        x = self.fc_f(x)                 # final result: (batch, out_size)

        return x, hidden, cell
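
To make the shapes explicit, this is how I sanity-check the forward pass on dummy data (just a sketch with made-up sizes, not my real hyperparameters; note that the chunk length has to equal out_size because of the final Linear layer):

# Shape check on dummy data (hypothetical small sizes)
_model = Test(in_channels=6, num_layers=2, hidden_size=16, p=0.2, out_size=20)
_dummy = torch.randn(4, 20, 6)   # (batch, tbptt_steps, features)

with torch.no_grad():
    _y, _h, _c = _model(_dummy, None, None)

print(_y.shape)   # torch.Size([4, 20])    -> one value per time step of the chunk
print(_h.shape)   # torch.Size([2, 4, 16]) -> (num_layers, batch, hidden_size)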

# Hyperparameters

in_channels = 6
num_layers = 64
hidden_size = 90
p = 0.2
out_size = tbptt_steps = 20  # truncated BPTT steps
split_dim = 1
nb_epoch = 100
learning_rate = 3e-4

Model = Test(in_channels, num_layers, hidden_size, p, out_size).to(device)
optimizer = optim.Adam(Model.parameters(), lr=learning_rate)

# I tried to test my model by overfitting it on the same few inputs

X = Inputs[:5, :500, :-1].to(device)
Y = Inputs[:5, :500, -1].to(device)

# Training loop

hidden = None
cell = None

for ep in range(nb_epoch):

    Losses = 0
    for i, (x_, y_) in enumerate(zip(X.split(tbptt_steps, dim=split_dim), Y.split(tbptt_steps, dim=split_dim))):
        optimizer.zero_grad()
        #Model.train()

        # Detach the last hidden state, so the backprop graph will be cut
        if hidden is not None:
            hidden.detach_()
        if cell is not None:
            cell.detach_()

        # Forward pass
        y_pred, hidden, cell = Model(x_, hidden, cell)
        #print("predict", y_pred.shape, y_.shape)
        # Compute loss
        loss = nn.functional.mse_loss(y_pred, y_)
        # Backward pass
        loss.backward()
        Losses += loss.item()
        # Update weights
        optimizer.step()
        if i == 0:
            print("Epoch ", ep, " Loss ", loss.item())

    print("#################################################")
    print(Losses)
    print("#################################################")
    

There are two problems with this model:
- It doesn't take the padding_value into account (see the masked-loss sketch below),
- The loss is high and doesn't decrease.
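
For the padding problem, the only workaround I can think of is masking the loss myself (just a sketch, assuming -1 marks the padded target positions and never occurs as a real value):

def masked_mse(y_true, y_pred, ignore_idx=-1.0):
    # Average the squared error only over the time steps that are not padding
    mask = (y_true != ignore_idx)
    return nn.functional.mse_loss(y_pred[mask], y_true[mask])

# In the training loop above, this would replace the plain MSE:
# loss = masked_mse(y_, y_pred)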

I really hope the model is understandable and that we can correct it together.
As you can see, I am not a professional in machine learning, but I am really eager to understand my errors.

Thank you very much for your help