TypeError: sub(): argument 'other' (position 1) must be Tensor, not NoneType

I wrote a program that saves a checkpoint partway through training and then resumes training by loading it, as shown below. However, the error above occurs and I cannot resolve it. Does anyone know a good solution?

torch == 1.12.1

import torch
import torch.autograd as autograd
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import time
import copy

torch.set_default_dtype(torch.float64)

torch.manual_seed(123)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#When to resume learning : switch = 1
switch = 1

learning = torch.rand((1000, 5))
teacher = torch.rand((1000, 2))

class Sequentialmodel(nn.Module):
    def __init__(self,layers):
        super(Sequentialmodel, self).__init__()

        self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1)])

        # activation function
        self.activation = nn.Tanh()

        # loss function
        self.loss_function = nn.MSELoss(reduction='mean')

        self.loss_hist = []

        self.iter = 0

    def forward(self, x):
        
        # apply all hidden layers with tanh; use self.linears instead of the global `layers`
        for i in range(len(self.linears) - 1):
            z = self.linears[i](x)
            x = self.activation(z)

        x = self.linears[-1](x)
        
        return x
    
    def loss_cal(self,learning, teacher):
        learning.requires_grad = True
        output = self.forward(learning)

        return self.loss_function(output, teacher)
    
    def closure(self):
        optimizer.zero_grad()

        loss = self.loss_cal(learning, teacher)
        
        self.loss_hist.append(loss.item())
      
        loss.backward()
                
        self.iter += 1
        
        loss = loss.to('cpu').detach().numpy()

        if self.iter % 1 == 0:
            print("------------------------------------------------")
            print("epoch", self.iter, "\nloss : {:.015f}".format(loss))

        if self.iter % 500 == 0:
            torch.save(
                {
                    "epoch": self.iter,
                    "model_state_dict": self.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "loss": loss,
                },
                "model_for_quetion_site.pth"
            )
        return loss

LBFGS_epochs = 100000000

layers = np.array([5,20,20,20,20,20,20,20,20,2])
NN = Sequentialmodel(layers).to(device)

if switch == 1:
    checkpoint = torch.load("model_for_quetion_site.pth", map_location=device)
    NN.load_state_dict(checkpoint["model_state_dict"], strict=False)
    print(NN.load_state_dict(checkpoint["model_state_dict"]))

optimizer = torch.optim.LBFGS(NN.parameters(),lr=1, 
                              max_iter = LBFGS_epochs, 
                              #max_eval = 100000, 
                              tolerance_grad = 1e-10, 
                              tolerance_change = 1e-10, 
                              history_size = 100, 
                              line_search_fn = 'strong_wolfe')


if switch == 1:
    NN.iter = checkpoint["epoch"]
    NN.loss_hist = checkpoint["loss"]
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    # move any restored optimizer state tensors to the active device
    # (use .to(device) so this also works on CPU-only machines)
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)

    NN.loss_hist = np.load("loss.npy", allow_pickle=True).tolist()

optimizer.step(NN.closure)

Based on the error message it seems prev_flat_grad is None in this line of code. I tried to simplify your code a bit, but cannot reproduce the issue with a simple model (however, your code does reproduce it after fixing a few undefined variables).
I’m not familiar enough with this optimizer, but maybe @albanD would know more about if and when prev_flat_grad is part of the optimizer.state, as it seems optimizer.state[optimizer._params[0]]["prev_flat_grad"] is empty.
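
For anyone who wants to confirm this on their side, here is a minimal debugging sketch (it pokes at LBFGS internals via the private _params attribute, so treat it as inspection only), run right after optimizer.load_state_dict(...):

# inspection sketch: LBFGS keeps all of its history in a single state entry,
# keyed by the first parameter
state = optimizer.state[optimizer._params[0]]
print(list(state.keys()))
print(state.get("prev_flat_grad"))  # None here would explain the sub() TypeError in step()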

Oh, that is weird…
LBFGS is not used much, so there might be a bug there :confused: Could you open an issue about this, please?
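
In the meantime, a possible workaround sketch (assumption: it is acceptable to lose the LBFGS curvature history between runs) is to restore only the model weights and the epoch counter and recreate the optimizer from scratch, so it rebuilds prev_flat_grad and the rest of its internal state on the next step():

# workaround sketch: resume the model, but start LBFGS with fresh internal state
checkpoint = torch.load("model_for_quetion_site.pth", map_location=device)
NN.load_state_dict(checkpoint["model_state_dict"])
NN.iter = checkpoint["epoch"]

optimizer = torch.optim.LBFGS(NN.parameters(), lr=1,
                              max_iter=LBFGS_epochs,
                              tolerance_grad=1e-10,
                              tolerance_change=1e-10,
                              history_size=100,
                              line_search_fn='strong_wolfe')
# optimizer.load_state_dict(...) is intentionally skipped here

optimizer.step(NN.closure)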