Using PyTorch as an optimizer for parameter estimation, but I get a "backward through the graph a second time" error I can't resolve (with a toy example)

Hello,

I am trying to use PyTorch as an optimizer for parameter estimation.

I have a parameter vector theta = [C1, C2, R1, R2]. A 2x2 tensor A and a 2x1 tensor B are created, and C1, C2, R1, and R2 are assigned to entries of A and B.
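
For context, the model is a linear state-space system that I discretize with the matrix exponential (zero-order hold with Ts = 1, as far as I understand), exactly as in forward() below. Here is a standalone sketch of just that step, using made-up placeholder values for C1, C2, R1, R2 (the real values are what I want to estimate):

```python
import torch

# Placeholder values only for illustration; the real C1, C2, R1, R2 are unknown
C1, C2, R1, R2 = 100.0, 200.0, 5.0, 10.0

A = torch.tensor([[-1 / (C1 * R1),                   1 / (C1 * R1)],
                  [-1 / (C2 * R1), -1 / (C2 * R1) - 1 / (C2 * R2)]])
B = torch.tensor([[0.0],
                  [1 / (C2 * R2)]])

Ts = 1.0
Ad = torch.matrix_exp(A * Ts)                      # Ad = exp(A*Ts)
Bd = torch.inverse(A) @ (Ad - torch.eye(2)) @ B    # Bd = A^-1 (Ad - I) B
# Discrete-time update used in the prediction loop: x[k+1] = Ad @ x[k] + Bd @ u[k]
```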

When I run the model I get “Trying to backward through the graph a second time … Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.”

I tried several things such as scatter_mask, but I can't figure out what is wrong.
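
My current guess (I am not sure it is the real cause) is that part of the graph is built only once, outside the training loop, and is then reused in every epoch. I think this minimal sketch, separate from my actual model, shows the same kind of situation and raises the same message on the second backward():

```python
import torch
import torch.nn as nn

p = nn.Parameter(torch.randn(4))
scale = torch.tensor([10.0, 10.0, 5.0, 5.0])
theta = p * scale                  # graph node created ONCE, like self.theta in __init__

opt = torch.optim.SGD([p], lr=0.1)
for epoch in range(2):
    loss = (theta ** 2).sum()      # every epoch reuses the same theta node
    opt.zero_grad()
    loss.backward()                # 2nd epoch: "Trying to backward through the graph a second time"
    opt.step()
```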


```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Input data u and output data y
y=np.array([0.        , 0.        , 0.01158425, 0.02643878, 0.04200459,
       0.05750639, 0.07272101, 0.08759547, 0.10212857, 0.11633427,
       0.13023035, 0.14383496, 0.15716559, 0.17023881, 0.18307024,
       0.19567455, 0.20806552, 0.22025612, 0.23225851, 0.24408412,
       0.25574368, 0.26724727, 0.27860436, 0.28982385]).reshape(-1,1)

u=(np.arange(y.shape[0])/10).reshape(-1,1)

# model
class TEST(nn.Module):
    def __init__(self,theta_min,theta_max):
        super().__init__()

        self.n_x=2
        self.n_y=1
        self.n_u=1
        n_theta=theta_min.shape[0]
        # initialize weights with random numbers   
        # parameter is bounded by tanh and scaled back to original scale.     
        s_theta_ = torch.distributions.Uniform(-1,1).sample((n_theta,)) 
        self.s_theta=nn.Parameter(torch.tanh(s_theta_))
        self.theta=(self.s_theta*(theta_max-theta_min)+theta_min)
        self.x0=torch.zeros([self.n_x])
    
    def forward(self, u,y):
        """
       forward (including model)
        """
        
        theta=self.theta
        C1=theta[0]
        C2=theta[1]
        R1=theta[2]
        R2=theta[3]
        # create A and B matrix.
        A=torch.zeros((self.n_x,self.n_x))
        B=torch.zeros((self.n_x,self.n_u))
        A[0,0]=-1/(C1*R1)
        A[0,1]=1/(C1*R1)
        A[1,0]=-1/(C2*R1)
        A[1,1]=-1/(C2*R1)-1/(C2*R2)
        B[1,0]=1/(C2*R2)
        
        Ad=torch.matrix_exp(A*1)
        Bd=torch.matmul(torch.matmul(torch.inverse(A),(Ad-torch.eye(self.n_x))),B)
        
        n_tk=u.shape[0]
        
        xhatk=self.x0
        
        xhatk_list=[]
        # prediction y
        for tk in torch.arange(1,n_tk+1):
            xhatk_list.append(xhatk.unsqueeze(0))
            xhatk=torch.matmul(Ad,xhatk)+torch.matmul(Bd,u[tk-1,:]) #x[t+1]=Ax[t]+Bu[t]
        xhatk_out=torch.cat(xhatk_list,dim=0)
        # xhatk_out[:,1] is prediction of y
        return xhatk_out

# training 
theta_min=np.array([0.1, 0.1, 0.1, 0.1]) #min of theta
theta_max=np.array([600, 600, 50, 50]) #max of theta
theta_min=torch.tensor(theta_min,dtype=torch.float32)
theta_max=torch.tensor(theta_max,dtype=torch.float32)

# model
mm=TEST(theta_min=theta_min,theta_max=theta_max)
device=torch.device('cpu')
mm=mm.to(device)

optimizer = torch.optim.Adam(mm.parameters(), lr=0.01)
scheduler=optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
loss_func = torch.nn.MSELoss()  # this is for regression mean squared loss

n_epochs=1000
for epoch in range(1, n_epochs + 1):
    loss_train = 0.0

    tu = torch.tensor(u, dtype=torch.float32).to(device=device)
    ty = torch.tensor(y, dtype=torch.float32).to(device=device)

    # torch.autograd.set_detect_anomaly(True)
    out = mm(u=tu, y=ty)

    loss = loss_func(out[:, 1], ty.flatten())   # must be (1. nn output, 2. target)

    optimizer.zero_grad()   # clear gradients for this step
    # loss.backward(retain_graph=True)
    loss.backward(retain_graph=False)
    optimizer.step()

    loss_train += loss.item()
    print(loss.item())

    if epoch % 100 == 0:
        scheduler.step()

```


Running this produces:
Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.


I am not sure whether the problem is in how I create the system matrices. Any links, suggestions, or keywords would be helpful. Thanks!
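
One thing I noticed while debugging (not sure whether it is relevant): self.theta already has a grad_fn right after the model is constructed, before forward() is ever called, so I assume its piece of the graph is created once in __init__ and then shared across all epochs:

```python
# run right after constructing the model above, before any forward/backward
print(mm.theta.grad_fn)    # already something like <AddBackward0 ...>
print(mm.s_theta.grad_fn)  # None, since s_theta is a leaf nn.Parameter
```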

The parameters C and R are merged again to compute Bd, even though each of them already has its own piece of the graph.

I think the code would run with requires_grad=False on C and R, but is that really what I want here?
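
One workaround I am considering (not sure it is the intended fix) is to keep only the raw nn.Parameter in __init__ and recompute the scaled theta inside forward(), so that this part of the graph is rebuilt on every call instead of being reused. Applied to the small standalone example from the beginning of the post, it would look like this:

```python
import torch
import torch.nn as nn

p = nn.Parameter(torch.randn(4))
scale = torch.tensor([10.0, 10.0, 5.0, 5.0])

opt = torch.optim.SGD([p], lr=0.1)
for epoch in range(2):
    theta = p * scale          # rebuilt every epoch -> a fresh graph for each backward()
    loss = (theta ** 2).sum()
    opt.zero_grad()
    loss.backward()            # no retain_graph needed now
    opt.step()
```

For my model, that would mean computing theta from self.s_theta, theta_min, and theta_max inside forward() rather than storing self.theta in __init__. Does that sound like the right direction?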