Training does not progress using the PyTorch LBFGS optimizer

Hello everyone. I want to run the L-BFGS algorithm as an additional fine-tuning stage after pre-training with the Adam algorithm on an LSTM-based model. To check whether the model works, I trained for 50 epochs with each optimizer. Training goes well with the Adam optimizer, but the loss stops decreasing once the LBFGS optimizer takes over.
Could you please check whether there is a problem in the L-BFGS part of the training loop?
(Note: the loss values look large because each is the sum of several component losses.)
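
For reference, here is the basic LBFGS closure pattern I followed, reduced to a minimal sketch on a toy model (toy_model, x, and y are placeholders for illustration only; lr, max_iter, and history_size match the values I use below):

import torch
import torch.nn as nn
import torch.optim as optim

toy_model = nn.Linear(4, 1)   # placeholder model
x = torch.randn(32, 4)        # placeholder input
y = torch.randn(32, 1)        # placeholder target

criterion = nn.MSELoss()
optimizer = optim.LBFGS(toy_model.parameters(), lr=0.001, max_iter=20, history_size=10)

for epoch in range(50):
	def closure():
		optimizer.zero_grad()              # reset gradients before re-evaluating
		loss = criterion(toy_model(x), y)
		loss.backward()                    # LBFGS needs fresh gradients on every call
		return loss

	# step() may call the closure several times and returns the loss
	loss = optimizer.step(closure)
	print(epoch, loss.item())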

Here is my code:

import torch
import torch.nn as nn     
import torch.optim as optim                     

import numpy as np
import scipy.io
import time
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from random import shuffle

if torch.cuda.is_available():
	device = torch.device('cuda')
else:
	device = torch.device('cpu')

torch.backends.cudnn.benchmark = True  # let cuDNN auto-tune kernels for the fixed input shapes


class DeepPhyLSTM(nn.Module):
	#-------------------------------
	# 1. Define Variables
	#-------------------------------
	def __init__(self, u, ut, g, ag, ag_c, lift_c, Phi_t):
		super(DeepPhyLSTM, self).__init__()
		
		# 1) Define required data; Training data
		self.u         = u
		self.ut        = ut
		self.g         = g
		self.ag        = ag
		self.ag_c      = ag_c
		self.lift_c    = lift_c
		self.Phi_t     = Phi_t
		
		# 2) Device setup
		self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
		
		# 3) Layer
		# 3-1) network1 layer
		self.lstm1 = nn.LSTM(1, 100, batch_first=True).to(self.device)
		self.fc1   = nn.Linear(100, 3 * u.shape[2]).to(self.device)
		
		# 3-2) network2 layer
		self.lstm2 = nn.LSTM(3 * self.u.shape[2], 100, batch_first=True).to(self.device)
		self.fc2   = nn.Linear(100, self.u.shape[2]).to(self.device)

		# 3-3) common layer
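		# (these layers are shared by LSTM1 and LSTM2 below)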
		self.lstm = nn.LSTM(100, 100, batch_first=True).to(self.device)
		self.fc   = nn.Linear(100, 100).to(self.device)
		self.relu = nn.ReLU()
		
		# 4) data to device
		self.u_py      = u.clone().detach().to(self.device)
		self.ut_py     = ut.clone().detach().to(self.device)
		self.g_py      = g.clone().detach().to(self.device)
		self.ag_py     = ag.clone().detach().to(self.device)
		self.lift_c_py = lift_c.clone().detach().to(self.device)
		self.ag_c_py   = ag_c.clone().detach().to(self.device)
		self.Phi_py    = Phi_t.clone().detach().to(self.device)
		
		# 5) Neural network
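		# NOTE: these forward passes run once at construction time, so the
		# *_pred attributes below hold predictions from the untrained model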
		# 5-1) Train ag sample(10)
		self.u_pred, self.ut_pred, self.utt_pred, self.u_dot_pred, self.z_pred = self.network1(self.ag_py)

		# 5-2) Collocation ag sample(50)
		self.ut_c_pred, self.u_dot_c_pred, self.lift_c_pred = self.network2(self.ag_c_py)
		

	#-------------------------------    
	# 2. Define layer
	#-------------------------------
	def LSTM1(self,X):
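		# input LSTM -> three passes through the shared self.lstm -> shared
		# self.fc -> output head self.fc1 (LSTM2 below mirrors this with fc2)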
		out, _ = self.lstm1(X)
		out    = self.relu(out)

		out, _ = self.lstm(out)
		out    = self.relu(out)

		out, _ = self.lstm(out)
		out    = self.relu(out)

		out, _ = self.lstm(out)
		out    = self.relu(out)
		
		out    = self.fc(out)
		out    = self.relu(out)
		output = self.fc1(out)
		#print(output.shape)
		return output
		
	
	def LSTM2(self,X):
		out, _ = self.lstm2(X)
		out    = self.relu(out)

		out, _ = self.lstm(out)
		out    = self.relu(out)

		out, _ = self.lstm(out)
		out    = self.relu(out)

		out, _ = self.lstm(out)
		out    = self.relu(out)
		
		out    = self.fc(out)
		out    = self.relu(out)
		output = self.fc2(out)
		return output 
	
	
	#-------------------------------    
	# 3. Define network architecture
	#-------------------------------
	def network1(self, ag):
		output  = self.LSTM1(ag).to(self.device)                     
		u_pred     = output[:, :, 0:self.u.shape[2]]                    # u
		u_dot_pred = output[:, :, self.u.shape[2]:2*self.u.shape[2]]    # u_dot
		z_pred     = output[:, :, 2*self.u.shape[2]:]                   # z
		
		batch_size  = ag.shape[0]
		#self.Phi_py = torch.empty((batch_size, self.u.shape[1], self.u.shape[1]), dtype=torch.float32).to(self.device)
		self.Phi_py = self.Phi_t.repeat(batch_size, 1 ,1).to(self.device)
		ut_pred  = torch.matmul(self.Phi_py, u_pred)    # time derivative via Phi_t
		utt_pred = torch.matmul(self.Phi_py, ut_pred)   # second time derivative
		return u_pred, ut_pred, utt_pred, u_dot_pred, z_pred
	
	
	def network2(self, ag_c):
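		# evaluate network1 on the collocation samples, learn a correction
		# term f with LSTM2, and predict the lift as utt + f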
		u_c_pred, ut_c_pred, utt_c_pred, u_dot_c_pred, z_c_pred = self.network1(ag_c)
		u_dot_c_pred1 = u_dot_c_pred[:,:,0:1]
		
		f = self.LSTM2(torch.cat([u_c_pred, u_dot_c_pred1, z_c_pred], dim=2))
		lift_c_pred = utt_c_pred + f
		return ut_c_pred, u_dot_c_pred, lift_c_pred
	
	
	#-------------------------------    
	# 4. Define predict function
	#-------------------------------
	def predict_z(self, ag_star):
		u_star, ut_star, utt_star, u_dot_star, g_star = self.network1(ag_star)
			
		return (u_star.detach().cpu().numpy(),
				ut_star.detach().cpu().numpy(), 
				utt_star.detach().cpu().numpy(), 
				u_dot_star.detach().cpu().numpy(), 
				g_star.detach().cpu().numpy())
			
	def predict_g(self, ag_star):
		_,_, lift_star = self.network2(ag_star)
			
		return lift_star.detach().cpu().numpy()

#%%--------------------------------"Model"-------------------------------------
# (u_train, ut_train, g_train, ag_train, ag_c_train, lift_c_train, and
#  Phi_t0_train are tensors prepared earlier in the full script)
model = DeepPhyLSTM(u_train, ut_train, g_train, ag_train, ag_c_train, lift_c_train, Phi_t0_train).to(device)

# 1) Define the train function
def train(model, adam_epochs, lbfgs_epochs, learning_rate):
	model.train()
	
	Loss_u     = []
	Loss_udot  = []
	Loss_g     = []
	Loss_ut_c  = []
	Loss_e     = []
	Total_Loss = []
	Loss_val   = []
	best_loss  = float('inf')  # ensures the first epoch sets a best loss
	
	Ind_Train = []
	
	# 1-1) Define optimizers
	optim_adam  = optim.Adam(model.parameters(), lr=learning_rate)
	optim_lbfgs = optim.LBFGS(model.parameters(), lr=learning_rate, max_iter=20, history_size=10)
	# 1-2) Define the loss
	criterion = torch.nn.MSELoss()

	for epoch in range(adam_epochs + lbfgs_epochs):
		
		# Train, Valid data split; changes every epoch 
		Ind = list(range(ag_train.shape[0]))  # one index per training sample (here 10)
		shuffle(Ind)
		ratio_split = 0.8  # [Train/Valid] = [0.8/0.2] ratio
		Ind_Tr  = Ind[0:round(ratio_split*ag_train.shape[0])]
		Ind_Val = Ind[round(ratio_split*ag_train.shape[0]):]
		
		Ind_Train.append(Ind_Tr)
		
		# Train data
		ag_Tr  = ag_train[Ind_Tr].to(device)
		u_Tr   = u_train[Ind_Tr].to(device)
		ut_Tr  = ut_train[Ind_Tr].to(device)
		g_Tr   = g_train[Ind_Tr].to(device)
		
		# Valid data
		ag_Val = ag_train[Ind_Val].to(device)
		u_Val  = u_train[Ind_Val].to(device)
		ut_Val = ut_train[Ind_Val].to(device)
		g_Val  = g_train[Ind_Val].to(device)
		
		# Start time for timing this epoch
		start_time = time.time()
		
		
		# Adam phase: epochs 0 .. adam_epochs-1, then switch to L-BFGS
		if epoch < adam_epochs:
			# Gradient initialize
			optim_adam.zero_grad()
		
			# Train; Network predict
			u_pred   , ut_pred     , utt_pred   , u_dot_pred, z_pred = model.network1(ag_Tr)
			ut_c_pred, u_dot_c_pred, lift_c_pred = model.network2(ag_c_train)
		
			# Loss calculation
			loss_u_value    = criterion(u_pred     , u_Tr)
			loss_udot_value = criterion(u_dot_pred , ut_Tr)
			loss_g_value    = criterion(z_pred     , g_Tr)
			loss_ut_c_value = criterion(ut_c_pred  , u_dot_c_pred)
			loss_e_value    = criterion(lift_c_pred, lift_c_train)  
			total_loss      = loss_u_value + loss_udot_value + loss_ut_c_value + loss_e_value
		
			# Gradient calculation
			total_loss.backward()
		
			# Gradient update
			optim_adam.step()

		else:
			def closure():
				# Expose the loss values to the enclosing scope so that the
				# logging below records this L-BFGS step instead of stale
				# values left over from the Adam phase
				nonlocal loss_u_value, loss_udot_value, loss_g_value
				nonlocal loss_ut_c_value, loss_e_value, total_loss

				optim_lbfgs.zero_grad()

				u_pred, ut_pred, utt_pred, u_dot_pred, z_pred = model.network1(ag_train)
				ut_c_pred, u_dot_c_pred, lift_c_pred = model.network2(ag_c_train)

				# Loss calculation
				loss_u_value    = criterion(u_pred     , u_train)
				loss_udot_value = criterion(u_dot_pred , ut_train)
				loss_g_value    = criterion(z_pred     , g_train)
				loss_ut_c_value = criterion(ut_c_pred  , u_dot_c_pred)
				loss_e_value    = criterion(lift_c_pred, lift_c_train)
				total_loss      = loss_u_value + loss_udot_value + loss_ut_c_value + loss_e_value

				# Gradient calculation
				total_loss.backward()
				return total_loss

			# step() may evaluate the closure several times per call
			optim_lbfgs.step(closure)
			
			
		# Loss append
		Loss_u.append(loss_u_value.item())
		Loss_udot.append(loss_udot_value.item())
		Loss_g.append(loss_g_value.item())
		Loss_ut_c.append(loss_ut_c_value.item())
		Loss_e.append(loss_e_value.item())
		Total_Loss.append(total_loss.item())
		
		# Valid
		with torch.no_grad():
			u_val_pred, ut_val_pred, utt_val_pred, u_dot_val_pred, z_val_pred = model.network1(ag_Val.to(device))
			ut_c_val_pred, u_dot_c_val_pred, lift_val_pred = model.network2(ag_c_train.to(device))
			
			loss_u_valid    = criterion(u_val_pred     , u_Val)
			loss_udot_valid = criterion(u_dot_val_pred , ut_Val)
			loss_g_valid    = criterion(z_val_pred     , g_Val)
			loss_ut_c_valid = criterion(ut_c_val_pred , u_dot_c_val_pred)
			loss_e_valid    = criterion(lift_val_pred  , lift_c_train)
			valid_loss      = loss_u_valid + loss_udot_valid + loss_ut_c_valid + loss_e_valid
			Loss_val.append(valid_loss.item())
			
		if total_loss.item() < best_loss:
			best_loss = total_loss.item()
		
		# Time taken per epoch
		elapsed = time.time() - start_time
		if epoch % 10 == 0:
			print('[epoch: %d] loss: %.4f / Best loss: %.4f / Time: %.2f' % (epoch, total_loss.item(), best_loss, elapsed))
			

	return Loss_u, Loss_udot, Loss_g, Loss_ut_c, Loss_e, Total_Loss, Ind_Train, Loss_val, best_loss
	
	
	
#%%--------------------------------"Train"-------------------------------------
# 1) Train
Loss_u, Loss_udot, Loss_g, Loss_ut_c, Loss_e, Total_Loss, Ind_Train, Loss_val, best_loss = train(model = model, adam_epochs = 50, lbfgs_epochs = 50, learning_rate = 1e-3)
train_loss = Total_Loss
valid_loss = Loss_val


# 2) Loss plot
plt.figure(figsize=(20, 10))
plt.plot(train_loss, label='train loss')
plt.plot(valid_loss, label='valid loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()