Conv-LSTM NOT reducing loss during training

Hi, I am new to PyTorch. I am trying to solve a time-series problem using a Conv-LSTM, but the training loss does not seem to be decreasing. Can someone help me understand the reason? Below is my model's architecture.

I have a scalar target in the range [-1, 1], and the loss I am using is MAPE.
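The MAPE I compute is essentially the following (a minimal sketch, not my exact code; the eps guard is an assumption I add here just to keep the ratio finite):

import torch

def mape_loss(pred, target, eps=1e-8):
    # Mean Absolute Percentage Error: mean of |pred - target| / |target|
    # eps (an assumption in this sketch) avoids division by zero when targets are near 0
    return torch.mean(torch.abs(pred - target) / (torch.abs(target) + eps))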

My optimizer and scheduler are:

num_epochs = 25
optimizer = optim.Adam(model_to_train.parameters(), lr=1e-1)  # , weight_decay=1e-3
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[int(num_epochs * 0.25), int(num_epochs * 0.5), int(num_epochs * 0.75), int(num_epochs * 0.9)],
    gamma=1e-1,
)

I am calling optimizer.zero_grad() and scheduler.step() in my training loop, which looks roughly like the sketch below.
My data has approximately 71k samples, each with around 100 features over 72 time steps.
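For completeness, the loop is roughly this (a sketch only; train_loader is a placeholder, mape_loss is the function sketched above, and I have omitted device handling):

for epoch in range(num_epochs):
    model_to_train.train()
    for x_batch, y_batch in train_loader:   # x_batch shape: (batch, 1, 72, 100)
        optimizer.zero_grad()                # clear gradients from the previous step
        preds = model_to_train(x_batch)      # forward pass
        loss = mape_loss(preds, y_batch)     # MAPE against the scalar target
        loss.backward()                      # backpropagate
        optimizer.step()                     # update parameters
    scheduler.step()                         # decay the LR at the milestones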

Thanks in advance; I really appreciate the help.

import torch
import torch.nn as nn


class Conv2DLSTMModel(nn.Module):
    '''
    This class defines a Conv-LSTM.

    Parameters:
    in_channels = Number of input channels (note: conv1 currently hardcodes in_channels=1, so this argument is unused)
    hidden_size_lstm = Number of hidden state units for the LSTM
    num_layers_lstm = Number of layers in the LSTM
    output_size = Size of the prediction
    feature_num_extract = Number of feature maps the conv layers extract as input to the LSTM layer
    time_window = Time window (kernel height) considered at a time by the conv layers while extracting feature_num_extract feature maps
    feature_window = Number of features (kernel width) combined at a time by the conv layers while extracting feature_num_extract feature maps
    droup_out_prec = Dropout probability (default 0.2)
    '''

    def __init__(self,in_channels, hidden_size_lstm, num_layers_lstm, output_size,feature_num_extract,feature_window,time_window,droup_out_prec = 0.2):

        super(Conv2DLSTMModel, self).__init__()
        self.hidden_size_lstm = hidden_size_lstm
        self.num_layers_lstm = num_layers_lstm
        
        # The 2D conv slides along the height and width axes, so we treat time as the height
        # and features as the width; each kernel then covers a (time_window x feature_window) patch.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=2 * feature_num_extract, kernel_size=(time_window, feature_window), stride=1)
        self.dropout1 = nn.Dropout(droup_out_prec)
        self.conv2 = nn.Conv2d(in_channels=2 * feature_num_extract, out_channels=feature_num_extract, kernel_size=(time_window, feature_window), stride=2)
        
        self.max_pool = nn.MaxPool2d(kernel_size=(4,4))

        self.dropout2 = nn.Dropout(droup_out_prec)

        self.lstm = nn.LSTM(input_size=feature_num_extract, hidden_size=hidden_size_lstm, num_layers=num_layers_lstm, batch_first=True, bidirectional=False)

        self.dropout3 = nn.Dropout(droup_out_prec)
        
        # self.tdd = nn.Conv2d(1, num_of_output_channels, (num_of_input_channels, 1))

        self.fc1 = nn.Linear(hidden_size_lstm, int(hidden_size_lstm/2))
        self.fc1.time_distributed = True

        self.dropout4 = nn.Dropout(droup_out_prec)

        self.fc2 = nn.Linear(int(hidden_size_lstm/2), output_size)

        self.tanh = nn.Tanh()
        self.tanh1 = nn.Tanh()
        self.tanh2 = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):

        # The conv layer
        x = self.conv1(x)
        x = self.dropout1(x)

        x = self.conv2(x)
        x = self.dropout2(x)

        x = self.max_pool(x)

        # Reshape without copying memory to flatten the last two dimensions (H and W) into one
        x = x.view(x.size(0), x.size(1), -1)

        # MaxPool2d outputs (N, channels, H, W), which we flattened to (N, channels, H*W).
        # The LSTM expects (N, seq_len, features) with a fixed feature size,
        # so we permute to (N, H*W, channels).
        x = torch.permute(x, (0, 2, 1))  # equivalently: x.permute(0, 2, 1)

        
        # Initial hidden and cell states: one vector of hidden_size_lstm units per sample in the
        # batch for every LSTM layer, hence shape (num_layers_lstm, batch, hidden_size_lstm)
        h0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device)  # hidden state
        c0 = torch.zeros(self.num_layers_lstm, x.size(0), self.hidden_size_lstm).to(device=x.device)  # cell state

        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.tanh(out)
        out = self.dropout3(out)
        
        # LSTM output shape is (batch, seq_len, hidden_size); we index -1 on seq_len to feed only the last time step to the fully connected head

        out_reg = self.fc1(out[:, -1, :])
        out_reg = self.tanh1(out_reg)
        out_reg = self.dropout4(out_reg)

        out_reg = self.fc2(out_reg)
        out_reg = self.tanh2(out_reg)

        return out_reg
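For reference, this is roughly how I instantiate and call the model; the hyperparameter values below are placeholders rather than my exact settings, and the dummy tensor just mirrors the (batch, 1, 72 time steps, 100 features) shape of my data:

model_to_train = Conv2DLSTMModel(
    in_channels=1,
    hidden_size_lstm=64,
    num_layers_lstm=2,
    output_size=1,
    feature_num_extract=16,
    feature_window=3,
    time_window=3,
)

dummy = torch.randn(8, 1, 72, 100)      # (batch, channel, time steps, features)
print(model_to_train(dummy).shape)      # -> torch.Size([8, 1])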