Can you check my attention-based Bi-LSTM model problem?

I’m studying stock prediction using an embedding layer and an attention-based Bi-LSTM, but the model loss is not decreasing. I don’t think my train and test datasets have any problem. My model code is:

import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class attbilstm(nn.Module):
    def __init__(self, label_size, embedding_dim, hidden_dim):
        # label_size = vocab size (104), embedding_dim = 30, hidden_dim = 128
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(label_size, embedding_dim)
        # note: PyTorch ignores the dropout argument when num_layers=1 (it only warns)
        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim,
                              num_layers=1, bidirectional=True, dropout=0.2,
                              batch_first=True)

        self.fc = nn.Linear(self.hidden_dim, 3)
        self.act = nn.Softmax(dim=1)

        self.dropout = nn.Dropout(0.2)

    def attnetwork(self, bilstm_out, final_hidden):
        hidden = final_hidden.squeeze(0)  # [batch, hidden]
        # similarity scores: query (outputs over all time steps) @ key (final hidden state)
        attn_weights = torch.bmm(bilstm_out, hidden.unsqueeze(2)).squeeze(2)  # [batch, seq_len]
        soft_attn_weights = F.softmax(attn_weights, 1)
        # attended vector: values (the LSTM outputs) weighted by the softmaxed scores
        new_hidden = torch.bmm(bilstm_out.transpose(1, 2),
                               soft_attn_weights.unsqueeze(2)).squeeze(2)  # [batch, hidden]
        attn_out = torch.tanh(new_hidden)
        return attn_out

    def init_hidden(self, batch_size):
        # shape: (num_layers * num_directions, batch_size, hidden_size)
        h0 = torch.zeros(1 * 2, batch_size, self.hidden_dim, device=device)
        c0 = torch.zeros(1 * 2, batch_size, self.hidden_dim, device=device)
        return h0, c0

    def forward(self, x):
        # input x is [500 batch size x 30 time steps x 6 features] of integer indices
        x = self.embedding(x)                 # [500, 30, 6, 30]
        x = x.view(x.size(0), -1, x.size(3))  # flatten to [500, 180, 30]
        x = self.dropout(x)
        out, (hn, cn) = self.bilstm(x, self.init_hidden(x.size(0)))
        fbout = out[:, :, :self.hidden_dim] + out[:, :, self.hidden_dim:]  # forward + backward outputs
        fbhn = (hn[-2, :, :] + hn[-1, :, :]).unsqueeze(0)  # sum of the final forward and backward hidden states

        attn_out = self.attnetwork(fbout, fbhn)
        output = self.fc(attn_out)
        output = self.act(output)

        return output
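
For reference, here is a quick shape check I can run on random data (the sizes mirror the constructor comment; the values are placeholders, not my real data):

model = attbilstm(label_size=104, embedding_dim=30, hidden_dim=128).to(device)
dummy = torch.randint(0, 104, (500, 30, 6), device=device)  # fake index tensor with my input shape
print(model(dummy).shape)  # torch.Size([500, 3]), one softmax row per sample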

and my training code is (model_train is a method of my trainer class):

from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

def model_train(self, model, x_train, y_train):
    model.train()
    dataset = TensorDataset(x_train, y_train)
    dataloader = DataLoader(dataset, batch_size=x_train.size(0) // 2, shuffle=True)  # split into two batches
    if not self.do_retrain:
        print("Training once...")
        optimizer = optim.SGD(model.parameters(), lr=0.001)  # , weight_decay=1e-5
        criterion = nn.MSELoss()
        for epoch in range(self.epochs):
            for x_batch, y_batch in dataloader:
                optimizer.zero_grad()
                output = model(x_batch)
                loss = criterion(output, y_batch)
                loss.backward()
                optimizer.step()
            if (epoch + 1) % 100 == 0:
                print('Epoch: %d, loss: %1.5f' % (epoch + 1, loss.item()))
        return model
    else:
        print("Re-training...")
        optimizer = optim.SGD(model.parameters(), lr=0.0001)  # , weight_decay=1e-6
        criterion = nn.MSELoss()
        for epoch in range(self.epochs_r):
            for x_batch, y_batch in dataloader:
                optimizer.zero_grad()
                output = model(x_batch)
                loss = criterion(output, y_batch)
                loss.backward()
                optimizer.step()
            if (epoch + 1) % 10 == 0:
                print('Epoch: %d, loss: %1.5f' % (epoch + 1, loss.item()))
        return model
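
For completeness, the attributes model_train reads from self can be stubbed to reproduce a run (SimpleNamespace and the epoch counts here are placeholders for my real trainer object; x_train and y_train are the tensors described below):

from types import SimpleNamespace

trainer = SimpleNamespace(do_retrain=False, epochs=1000, epochs_r=100)  # placeholder trainer stub
model = attbilstm(104, 30, 128).to(device)
model = model_train(trainer, model, x_train.to(device), y_train.to(device))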

and train_x shape = [500, 30, 6], train_y shape = [500, 3], test_x shape = [1, 30, 6]; test_y is shown in the attached image.
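
To make these shapes concrete, placeholder tensors (random values; I assume one-hot targets over the 3 classes, matching the softmax output) can be built like this:

x_train = torch.randint(0, 104, (500, 30, 6))                             # integer vocab indices
y_train = F.one_hot(torch.randint(0, 3, (500,)), num_classes=3).float()  # [500, 3] one-hot targets
x_test = torch.randint(0, 104, (1, 30, 6))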

Thank you!