Hi, I am new to PyTorch. I am trying to solve a time-series problem using a Conv-LSTM, but the training loss does not seem to be decreasing. Can someone help me understand the reason? My model's architecture is shown below.

I have a scalar target in the range [-1, 1], and the loss I am using is MAPE (mean absolute percentage error).

My optimizer and scheduler are:

```
num_epochs = 25

# NOTE(review): Adam's documented default learning rate is 1e-3; lr=1e-1 is
# 100x higher and is worth checking first when the loss does not decrease.
# weight_decay is left at its default (a 1e-3 setting was commented out).
optimizer = optim.Adam(model_to_train.parameters(), lr=1e-1)

# Multiply the learning rate by 0.1 at 25%, 50%, 75% and 90% of the epochs.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[int(num_epochs * fraction) for fraction in (0.25, 0.5, 0.75, 0.9)],
    gamma=1e-1,
)
```

I am calling `optimizer.zero_grad()` and `scheduler.step()`.

My data has approximately 71k samples, each with around 100 features and 72 time steps.

Thanks in advance; I really appreciate the help.

```
class Conv2DLSTMModel(nn.Module):
    """Conv2d feature extractor followed by an LSTM and a two-layer head.

    The 2D convolutions slide along the height and width axes; time is
    mapped to height and features to width, so each kernel covers a
    ``time_window x feature_window`` patch.

    Args:
        in_channels: Number of channels of the input tensor fed to the
            first convolution.
        hidden_size_lstm: Number of hidden state units of the LSTM.
        num_layers_lstm: Number of stacked LSTM layers.
        output_size: Size of the prediction.
        feature_num_extract: Number of channels the convolution stack
            produces; this is the per-step input size of the LSTM.
        feature_window: Kernel extent along the feature (width) axis.
        time_window: Kernel extent along the time (height) axis.
        droup_out_prec: Dropout probability used by all dropout layers.
    """

    def __init__(
        self,
        in_channels: int,
        hidden_size_lstm: int,
        num_layers_lstm: int,
        output_size: int,
        feature_num_extract: int,
        feature_window: int,
        time_window: int,
        droup_out_prec: float = 0.2,
    ) -> None:
        super().__init__()
        self.hidden_size_lstm = hidden_size_lstm
        self.num_layers_lstm = num_layers_lstm

        kernel_size = (time_window, feature_window)
        # Honour the ``in_channels`` argument (it used to be ignored in
        # favour of a hard-coded 1).
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=2 * feature_num_extract,
            kernel_size=kernel_size,
            stride=1,
        )
        self.dropout1 = nn.Dropout(droup_out_prec)
        self.conv2 = nn.Conv2d(
            in_channels=2 * feature_num_extract,
            out_channels=feature_num_extract,
            kernel_size=kernel_size,
            stride=2,
        )
        self.max_pool = nn.MaxPool2d(kernel_size=(4, 4))
        self.dropout2 = nn.Dropout(droup_out_prec)

        self.lstm = nn.LSTM(
            input_size=feature_num_extract,
            hidden_size=hidden_size_lstm,
            num_layers=num_layers_lstm,
            batch_first=True,
            bidirectional=False,
        )
        self.dropout3 = nn.Dropout(droup_out_prec)

        head_width = hidden_size_lstm // 2
        self.fc1 = nn.Linear(hidden_size_lstm, head_width)
        # Plain Python attribute kept for compatibility; nothing in this
        # class reads it.
        self.fc1.time_distributed = True
        self.dropout4 = nn.Dropout(droup_out_prec)
        self.fc2 = nn.Linear(head_width, output_size)

        self.tanh = nn.Tanh()
        self.tanh1 = nn.Tanh()
        self.tanh2 = nn.Tanh()
        # Not used by forward(); kept so the attribute still exists.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the conv stack, the LSTM and the regression head.

        Args:
            x: Input batch; assumed to be ``(N, in_channels, time,
                features)`` to match the Conv2d layout described on the
                class.

        Returns:
            Tensor of shape ``(N, output_size)`` with values in [-1, 1]
            (the last layer is a tanh).
        """
        # NOTE(review): conv1 and conv2 are applied back to back with only
        # dropout in between, i.e. without a non-linearity.
        x = self.conv1(x)
        x = self.dropout1(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = self.max_pool(x)

        # (N, C, H, W) -> (N, C, H*W) -> (N, H*W, C): the LSTM expects
        # (batch, sequence, features) and the channel count is the only
        # dimension with a fixed size.
        x = x.flatten(start_dim=2).permute(0, 2, 1)

        # Without an explicit state nn.LSTM starts from zero hidden and cell
        # states on the input's device and dtype, so none is built by hand.
        out, _ = self.lstm(x)
        out = self.tanh(out)
        out = self.dropout3(out)

        # Only the output of the last sequence step feeds the head.
        out_reg = self.fc1(out[:, -1, :])
        out_reg = self.tanh1(out_reg)
        out_reg = self.dropout4(out_reg)
        out_reg = self.fc2(out_reg)
        out_reg = self.tanh2(out_reg)
        return out_reg
```