Hello everyone, I want to run L-BFGS algorithm additionally after pre-training with ADAM algorithm using LSTM. However, to check if this model works well, I trained it using 50 epochs each. Learning is going well with ADAM optimizer, but loss is not decreasing with LBFGS optimizer.
Please check if there is a problem in the learning process using LBFGS.
(Note that the reason the loss value is large is because it is the sum of many losses.)
Here is my code :
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import scipy.io
import time
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from random import shuffle
if torch.cuda.is_available():
device = torch.device('cuda')
else:
device = torch.device('cpu')
torch.backends.cudnn.benchmark = True
class DeepPhyLSTM(nn.Module):
#-------------------------------
# 1. Define Variables
#-------------------------------
def __init__ (self, u, ut, g, ag, ag_c, lift_c, Phi_t):
super(DeepPhyLSTM, self).__init__()
# 1) Define required data; Training data
self.u = u
self.ut = ut
self.g = g
self.ag = ag
self.ag_c = ag_c
self.lift_c = lift_c
self.Phi_t = Phi_t
# 2) Device setup
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 3) Layer
# 3-1) network1 layer
self.lstm1 = nn.LSTM(1, 100, batch_first=True).to(self.device)
self.fc1 = nn.Linear(100, 3 * u.shape[2]).to(self.device)
#3-2) network2 layer
self.lstm2 = nn.LSTM(3*self.u.shape[2],100,batch_first=True).to(self.device)
self.fc2 = nn.Linear(100,self.u.shape[2]).to(self.device)
# 3-3) common layer
self.lstm = nn.LSTM(100, 100, batch_first=True).to(self.device)
self.fc = nn.Linear(100, 100).to(self.device)
self.relu = nn.ReLU()
# 4) data to device
self.u_py = u.clone().detach().to(self.device)
self.ut_py = ut.clone().detach().to(self.device)
self.g_py = g.clone().detach().to(self.device)
self.ag_py = ag.clone().detach().to(self.device)
self.lift_c_py = lift_c.clone().detach().to(self.device)
self.ag_c_py = ag_c.clone().detach().to(self.device)
self.Phi_py = Phi_t.clone().detach().to(self.device)
# 5) Neural network
# 5-1) Train ag sample(10)
self.u_pred , self.ut_pred , self.utt_pred ,self.u_dot_pred, self.z_pred = self.network1(self.ag_py)
# 5-2) Collocation ag sample(50)
self.ut_c_pred , self.u_dot_c_pred, self.lift_c_pred = self.network2(self.ag_c_py)
#-------------------------------
# 2. Define layer
#-------------------------------
def LSTM1(self,X):
out, _ = self.lstm1(X)
out = self.relu(out)
out, _ = self.lstm(out)
out = self.relu(out)
out, _ = self.lstm(out)
out = self.relu(out)
out, _ = self.lstm(out)
out = self.relu(out)
out = self.fc(out)
out = self.relu(out)
output = self.fc1(out)
#print(output.shape)
return output
def LSTM2(self,X):
out, _ = self.lstm2(X)
out = self.relu(out)
out, _ = self.lstm(out)
out = self.relu(out)
out, _ = self.lstm(out)
out = self.relu(out)
out, _ = self.lstm(out)
out = self.relu(out)
out = self.fc(out)
out = self.relu(out)
output = self.fc2(out)
return output
#-------------------------------
# 3. Define network architecture
#-------------------------------
def network1(self, ag):
output = self.LSTM1(ag).to(self.device)
u_pred = output[:, :, 0:self.u.shape[2]]
u_dot_pred = output[:, :, self.u.shape[2]:2*self.u.shape[2]]
z_pred = output[:, :, 2*self.u.shape[2]:]
batch_size = ag.shape[0]
#self.Phi_py = torch.empty((batch_size, self.u.shape[1], self.u.shape[1]), dtype=torch.float32).to(self.device)
self.Phi_py = self.Phi_t.repeat(batch_size, 1 ,1).to(self.device)
ut_pred = torch.matmul(self.Phi_py, u_pred)
utt_pred = torch.matmul(self.Phi_py, ut_pred)
return u_pred, ut_pred, utt_pred, u_dot_pred, z_pred
def network2(self, ag_c):
u_c_pred, ut_c_pred, utt_c_pred, u_dot_c_pred, z_c_pred = self.network1(ag_c)
u_dot_c_pred1 = u_dot_c_pred[:,:,0:1]
f = self.LSTM2(torch.concat([u_c_pred, u_dot_c_pred1, z_c_pred], 2)).to(self.device)
lift_c_pred = utt_c_pred + f
return ut_c_pred, u_dot_c_pred, lift_c_pred
#-------------------------------
# 4. Define predict function
#-------------------------------
def predict_z(self, ag_star):
u_star, ut_star, utt_star, u_dot_star, g_star = self.network1(ag_star)
return (u_star.detach().cpu().numpy(),
ut_star.detach().cpu().numpy(),
utt_star.detach().cpu().numpy(),
u_dot_star.detach().cpu().numpy(),
g_star.detach().cpu().numpy())
def predict_g(self, ag_star):
_,_, lift_star = self.network2(ag_star)
return lift_star.detach().cpu().numpy()
model = DeepPhyLSTM(u_train, ut_train, g_train, ag_train, ag_c_train, lift_c_train, Phi_t0_train).to(device)
# 1) Train function define
def train(model, adam_epochs, lbfgs_epochs, learning_rate):
model.train()
Loss_u = []
Loss_udot = []
Loss_g = []
Loss_ut_c = []
Loss_e = []
Total_Loss = []
Loss_val = []
best_loss = 100
Ind_Train = []
# 1-1) optimizer define
optim_adam = optim.Adam(model.parameters(), lr=1e-3)
optim_lbfgs = optim.LBFGS(model.parameters(), lr = 0.001, max_iter = 20, history_size = 10)
# 1-2) loss define
criterion = torch.nn.MSELoss()
for epoch in range(adam_epochs + lbfgs_epochs):
# Train, Valid data split; changes every epoch
Ind = list(range(ag_train.shape[0])) # Ind = 10(train data num)
shuffle(Ind)
ratio_split = 0.8 # [Train/Valid] = [0.8/0.2] ratio
Ind_Tr = Ind[0:round(ratio_split*ag_train.shape[0])]
Ind_Val = Ind[round(ratio_split*ag_train.shape[0]):]
Ind_Train.append(Ind_Tr)
# Train data
ag_Tr = ag_train[Ind_Tr].to(device)
u_Tr = u_train[Ind_Tr].to(device)
ut_Tr = ut_train[Ind_Tr].to(device)
g_Tr = g_train[Ind_Tr].to(device)
# Valid data
ag_Val = ag_train[Ind_Val].to(device)
u_Val = u_train[Ind_Val].to(device)
ut_Val = ut_train[Ind_Val].to(device)
g_Val = g_train[Ind_Val].to(device)
# Start time; as of 1970/1/1(UTC)
start_time = time.time()
if epoch <= adam_epochs:
# Gradient initialize
optim_adam.zero_grad()
# Train; Network predict
u_pred , ut_pred , utt_pred , u_dot_pred, z_pred = model.network1(ag_Tr)
ut_c_pred, u_dot_c_pred, lift_c_pred = model.network2(ag_c_train)
# Loss calculation
loss_u_value = criterion(u_pred , u_Tr)
loss_udot_value = criterion(u_dot_pred , ut_Tr)
loss_g_value = criterion(z_pred , g_Tr)
loss_ut_c_value = criterion(ut_c_pred , u_dot_c_pred)
loss_e_value = criterion(lift_c_pred, lift_c_train)
total_loss = loss_u_value + loss_udot_value + loss_ut_c_value + loss_e_value
# Gradient calculation
total_loss.backward()
# Gradient update
optim_adam.step()
else:
def closure():
optim_lbfgs.zero_grad()
u_pred , ut_pred , utt_pred , u_dot_pred, z_pred = model.network1(ag_train)
ut_c_pred, u_dot_c_pred, lift_c_pred = model.network2(ag_c_train)
# Loss calculation
loss_u_value = criterion(u_pred , u_train)
loss_udot_value = criterion(u_dot_pred , ut_train)
loss_g_value = criterion(z_pred , g_train)
loss_ut_c_value = criterion(ut_c_pred , u_dot_c_pred)
loss_e_value = criterion(lift_c_pred, lift_c_train)
total_loss = loss_u_value + loss_udot_value + loss_ut_c_value + loss_e_value
# Gradient calculation
total_loss.backward()
return total_loss
optim_lbfgs.step(closure)
# Loss append
Loss_u.append(loss_u_value.item())
Loss_udot.append(loss_udot_value.item())
Loss_g.append(loss_g_value.item())
Loss_ut_c.append(loss_ut_c_value.item())
Loss_e.append(loss_e_value.item())
Total_Loss.append(total_loss.item())
# Valid
with torch.no_grad():
u_val_pred, ut_val_pred, utt_val_pred, u_dot_val_pred, z_val_pred = model.network1(ag_Val.to(device))
ut_c_val_pred, u_dot_c_val_pred, lift_val_pred = model.network2(ag_c_train.to(device))
loss_u_valid = criterion(u_val_pred , u_Val)
loss_udot_valid = criterion(u_dot_val_pred , ut_Val)
loss_g_valid = criterion(z_val_pred , g_Val)
loss_ut_c_valid = criterion(ut_val_pred , u_dot_val_pred)
loss_e_valid = criterion(lift_val_pred , lift_c_train)
valid_loss = loss_u_valid + loss_udot_valid + loss_ut_c_valid + loss_e_valid
Loss_val.append(valid_loss.item())
if total_loss.item() < best_loss:
best_loss = total_loss.item()
# Time taken per epoch
elapsed = time.time() - start_time
if epoch % 10==0:
print('[epoch : %d] loss: %.4f / Best loss: %.4f /Time: %.2f' %(epoch, total_loss.item(), best_loss, elapsed))
return Loss_u, Loss_udot, Loss_g, Loss_ut_c, Loss_e, Total_Loss, Ind_Train, Loss_val, best_loss
#%%--------------------------------"Train"-------------------------------------
# 1) Train
Loss_u, Loss_udot, Loss_g, Loss_ut_c, Loss_e, Total_Loss, Ind_Train, Loss_val, best_loss = train(model = model, adam_epochs = 50, lbfgs_epochs = 50, learning_rate = 1e-3)
train_loss = Total_Loss
valid_loss = Loss_val
# 2) Loss plot
plt.figure(figsize=(20,10))
plt.plot(train_loss)
plt.plot(valid_loss)
plt.show()