Variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [1]]

Hi, I have experience with PyTorch, but it’s the first time I’m generating values of stochastic variables by recursion, and including initial values in the optimizer. The procedure roughly follows the paper [1709.05963] Machine learning approximation algorithms for high-dimensional fully nonlinear partial differential equations and second-order backward stochastic differential equations, with some differences.

import torch
import numpy as np
import scipy.integrate as integrate
import matplotlib.pyplot as plt

name = 'Merton' # 'Merton', 'Allen-Cahn', 'Black-Scholes', 'HJB'

# Define variables
T = 1                        # time horizon
timeDim = 2                  # the networks take the time pair (t, s) as input
spaceDim = 1                 # dimension of the space variable y = X(s)
d = max(5, spaceDim)         # hidden-layer width of the networks
if name == 'Merton':
    # Merton-problem parameters (presumably drift, rate, volatility, risk
    # aversion — confirm against the referenced paper)
    mu = 0.3
    r = 0.05
    sigma = 0.4
    eta = 1.0
    coeff = (mu-r)/sigma # a scalar term that arises frequently in the code
n = 20                       # number of time steps
h = T/n                      # time-step size
sqrt_h = float(np.sqrt(h))   # std of one Brownian increment over a step
n_maxstep = 10000            # number of optimizer iterations
batchSize = 4
lrParam = 0.002              # Adam learning rate

class NeuralNetwork(torch.nn.Module):
    """Fully connected network: Linear layers with ReLU between them.

    `structure` lists the layer widths, e.g. [in, d, d, out]; the final
    Linear layer has no activation.

    NOTE(review): the pasted original was garbled (left-hand fragments of
    several lines were lost), so the hidden activation is reconstructed as
    ReLU — confirm against the working code / the paper.
    """
    def __init__(self, structure):
        super().__init__()
        layers = []
        for inFeature, outFeature in zip(structure, structure[1:]):
            layers.extend([
                torch.nn.Linear(inFeature, outFeature),
                torch.nn.ReLU(),
            ])
        layers.pop()  # drop the activation after the output layer
        self.stack = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.stack(x)

# neural network architectures
n_neuronForM = [timeDim + spaceDim, d, d, spaceDim**2] # U_yy
n_neuronForN = [timeDim + spaceDim, d, d, spaceDim] # U_sy + 1/2 U_yyy

n_neuronForU = [timeDim + spaceDim, d, d, 1]

# One network per unknown; each takes a [t, s, y] row (timeDim + spaceDim wide).
modelM = NeuralNetwork(n_neuronForM)  # output dim spaceDim^2 (U_yy)
modelN = NeuralNetwork(n_neuronForN)  # output dim spaceDim (U_sy + 1/2 U_yyy)
modelU = NeuralNetwork(n_neuronForU)  # scalar output, compared against Y in the 'U' loss

# Pick the fastest available backend: CUDA GPU, then Apple MPS, then CPU.
# Bug fix: the original had no `else:`, so `device = "cpu"` unconditionally
# overwrote the "mps" assignment whenever MPS was selected.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Initial/Terminal Condition
def g(t, y): # y has [batchSize, n+1], but t is just i*h
    """Terminal condition; for Merton: -exp(-eta*y) / (1 + r*(T - t))."""
    if name == 'Merton':
        discount = 1 + r*(T - t)
        return -torch.exp(-eta*y) / discount

def w(t, s): # This is ok to be scalar function
    """Weight 1 / (1 + r*(s - t)) used by the driver F for the Merton case."""
    if name == 'Merton':
        denom = 1 + r*(s - t)
        return 1/denom

# Form of PDE (the driver of the BSVIE).
# Argument roles: t, s, y, U(t,s), U(s,s), U_y(t,s), U_y(s,s), U_yy(t,s), U_yy(s,s)
def F(t, s, X, Y1, Y2, Z1, Z2, M1, M2, name): # This needs to be a torch tensor function, except in t and s
    """Evaluate the driver F on tensor arguments (t and s are plain scalars).

    X and Y2 are currently unused in the Merton branch but are kept so the
    call sites stay unchanged.  Raises ValueError for an unsupported `name`
    (the original fell through and hit a NameError on the undefined `y`);
    dead commented-out code removed.
    """
    if name == 'Merton':
        ratio = torch.div(Z2, M2)  # Z(s,s)/M(s,s), elementwise
        return (0.5*coeff**2*torch.mul(torch.square(ratio), M1)
                - coeff**2*torch.mul(ratio, Z1)
                - w(t, s)*Y1
                - 0.5*M1)
    raise ValueError(f"unsupported PDE name: {name!r}")

# Comparison function, if we want one
def u(t,s,y,name): 
    # Closed-form reference solution used to sanity-check the learned U.
    # NOTE(review): scipy's integrate.quad calls f(x, *args), so inside the
    # lambda the first parameter `t` receives the *integration variable*
    # (ranging over [s, T]) while `tau` receives the outer t from args=(t,).
    # Confirm this argument order — and the 0.5*coeff/sigma term (not
    # coeff**2) — against the intended formula.
    if name == 'Merton':
        f = lambda t, tau: 0.5*coeff/sigma+w(t,tau)
        return np.exp(-integrate.quad(f,s,T,args = (t,))[0])*g(t,y)

#initial value for X0, initial guess for Y(0,0,X0) and Z(0,0,X0)
X0 = torch.ones(1)   # deterministic start point of the forward process
Y0 = -torch.rand(1)  # random negative guess (g is negative for Merton, so Y should be too)
Z0 = torch.rand(1)   # random positive guess

# All neural network input dimensions are [batchSize, timeDim + spaceDim]. The array of [t, s, y] goes in each row.
# The time and space substitution will be done by t=i*h, s=j*h, y=X(s)
# All neural network output dimensions are [batchSize, outputdim], output dim is either 1, spaceDim or spaceDim^2
def generateBSVIE(Y0, Z0):
    """Simulate one batch of the discretized BSVIE driven by the networks.

    Returns (YArray, ZArray, UArray, terminalValue).  Several lines the
    pasted original lost (the torch.cat calls building tt/ts/ss and the
    `else` branches of the spaceDim checks) are reconstructed below and
    marked — confirm them against the working code.
    """
    # Brownian increments and the forward process X (unit diffusion).
    dW = torch.normal(mean = 0.0, std = sqrt_h, size = [batchSize, n]) # [B, n]
    X = torch.ones((batchSize, n+1))*X0 #[B, n+1], representing X_0, X_1, ..., X_n
    for i in range(n):
        X[:,i+1] = X[:,i] + dW[:,i]
    # Diagonal processes Y(i,i), Z(i,i), started from the trainable guesses.
    Yii = torch.ones((batchSize, n+1, 1))*Y0
    Zii = torch.ones((batchSize, n+1, spaceDim))*Z0
    terminalValue = torch.zeros((batchSize, n+1))
    # NOTE(review): every slice assignment below (X[:,i+1]=..., Yii[b,i+1]=...,
    # YArray[...]=...) is an in-place write into a tensor that is part of the
    # autograd graph once Y0/Z0/network outputs flow into it.  That is the
    # classic cause of the "modified by an inplace operation" RuntimeError at
    # backward(); accumulating per-step tensors in Python lists and
    # torch.stack-ing them would avoid it.
    for b in range(batchSize):
        for i in range(n-1):
            # network input rows [t, s, y] with t = s = i*h, y = X_i
            tt = torch.cat((torch.ones((batchSize, 2))*i*h,
                            X[:, i].reshape([batchSize, 1])), 1)  # reconstructed line
            if spaceDim == 1:
                Mii = modelM(tt).reshape([batchSize, 1]) # T, spaceDim^2 as output
                Nii = modelN(tt).reshape([batchSize, 1]) # A, spaceDim as output
            else:  # reconstructed branch (lost in the paste)
                Mii = modelM(tt) # T, spaceDim^2 as output
                Nii = modelN(tt) # A, spaceDim as output
            # Euler step for the diagonal processes.
            Yii[b,i+1] = Yii[b,i] + F(i*h, i*h, X[b,i], Yii[b,i,0], Yii[b,i,0], Zii[b,i,:], Zii[b,i,:], Mii[b,0], Mii[b,0], name)*h + dW[b,i]*Zii[b,i,:]
            Zii[b,i+1] = Zii[b,i] + Nii[b,:]*h + Mii[b,:]*dW[b,i]
            terminalValue[b,i] = g(i*h, X[b,-1])
    # Computing Yij and Zij (off-diagonal) by a second Euler sweep.
    for b in range(batchSize):
        # NOTE(review): these buffers are re-created on every b iteration, so
        # results written for earlier b are discarded — they probably belong
        # before the loop; confirm.
        YArray = torch.zeros((batchSize,n+1,n+1,1))
        ZArray = torch.zeros((batchSize,n+1,n+1,spaceDim))
        UArray = torch.zeros((batchSize,n+1,n+1,1))
        for i in range(n):
            YArray[b,i,i,0], ZArray[b,i,i,:] = Yii[b,i,0], Zii[b,i,:]
            for j in range(i,n):
                # inputs [t, s, y] = [i*h, j*h, X_j] and [j*h, j*h, X_j]
                ts = torch.cat((torch.ones((batchSize,1))*i*h, torch.ones((batchSize,1))*j*h, X[:,j].reshape([batchSize,1])), 1)  # reconstructed line
                ss = torch.cat((torch.ones((batchSize,2))*j*h, X[:,j].reshape([batchSize,1])), 1)  # reconstructed line
                if spaceDim == 1:
                    Mij = modelM(ts).reshape([batchSize, 1])
                    Mjj = modelM(ss).reshape([batchSize, 1])
                    Nij = modelN(ts).reshape([batchSize, 1])
                    Njj = modelN(ss).reshape([batchSize, 1])
                    UArray[b,i,j,0] = modelU(ts)[b,0]
                else:  # reconstructed branch (lost in the paste)
                    Mij = modelM(ts)
                    Mjj = modelM(ss)
                    Nij = modelN(ts)
                    Njj = modelN(ss)
                    UArray[b,i,j,0] = modelU(ts)[b,0]
                YArray[b,i,j+1,0] = YArray[b,i,j,0] - F(i*h, j*h, X[b,j], YArray[b,i,j,0], YArray[b,j,j,0], ZArray[b,i,j,:], ZArray[b,j,j,:], Mij[b,:], Mjj[b,:], name)*h+ZArray[b,i,j,:]*dW[b,j] # [B, 1]
                ZArray[b,i,j+1,:] = ZArray[b,i,j,:] + Nij[b,:]*h + Mij[b,:]*dW[b,j] # [B, spaceDim]
    return YArray, ZArray, UArray, terminalValue

# Make the initial guesses trainable and optimize them jointly with the
# network weights.
# NOTE(review): modelU.parameters() is not passed to this optimizer, so
# modelU is never trained by it — confirm whether that is intentional (e.g.
# trained by a separate optimizer in the 'U' phase).
Y0.requires_grad = True
Z0.requires_grad = True
YZLearningHistory = []  # per-step records: [step, loss, Y0, lr]
optimizer = torch.optim.Adam([Y0, Z0] + list(modelM.parameters()) + list(modelN.parameters()), lr = lrParam)

def computeLoss(Y0, Z0, mode):
    """Loss for one freshly simulated batch.

    mode 'XY': mean squared gap between Y(., t_n) and the terminal condition.
    mode 'U' : mean squared gap between the modelU outputs and Y.
    """
    if mode == 'XY':
        Y, Z, U, terminalValue = generateBSVIE(Y0, Z0)
        residual = Y[:,:,n,0].reshape([batchSize, n+1]) - terminalValue
        loss = torch.mean(torch.square(residual))
    elif mode == 'U':
        Y, Z, U, terminalValue = generateBSVIE(Y0, Z0)
        gap = U - Y
        loss = torch.mean(torch.square(gap))
    return loss

# Training loop.  The pasted original lost the backward/step lines; the prose
# below ("I tried with and without retain_graph=True in lossY.backward()")
# confirms backward() exists in the real code, so both are reconstructed here.
for step in range(n_maxstep):
    optimizer.zero_grad() # sets all gradients back to 0
    currentLR = optimizer.param_groups[-1]["lr"]
    lossY = computeLoss(Y0,Z0, mode='XY')
    YZLearningHistory.append([step, lossY.item(), Y0.item(), currentLR])
    lossY.backward()   # reconstructed: backpropagate through the simulation
    optimizer.step()   # reconstructed: without it no parameter ever updates
    #print some things
    print(lossY, float(Y0))

I tried with and without “retain_graph=True” in lossY.backward(); the error message from that line is:

“RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.DoubleTensor [1]], which is output 0 of AsStridedBackward0, is at version 230; expected version 229 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!”

I suspect the cause could be the code for F, or the recursion process, triggering an in-place modification of either Y0, Z0, or the parameters in modelM or modelN, but I am not certain. What might be the cause of the error?

you are allowed to do this:

import torch
x = torch.randn(10,10)
y = torch.randn(10,10)
l = torch.sum((x-y)**2)

This is allowed as well:

import torch
re = torch.randn(5,5)

x = torch.randn(10,10)
y = torch.randn(10,10)
s = (x-y)
s[:5,:5] = 0
l = torch.sum(s**2)


but not this:

import torch
re = torch.randn(5,5)

x = torch.randn(10,10)
y = torch.randn(10,10)
s = (x-y)
l1 = torch.sum(s**2)
s[:5,:5] = 0
l2 = torch.sum(s**2)
l = l1 +l2 
RuntimeError: one of the variables needed for gradient computation has been modified by an 
inplace operation: [torch.FloatTensor [10, 10]], which is output 0 of torch::autograd::CopySlices, 
is at version 1; expected version 0 instead. Hint: enable anomaly detection to find the operation
 that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In the last snippet, we compute the loss in two steps. However, modifying s in-place between the two computations causes issues during backpropagation: in-place operations on data needed for gradient computation lead to the error above. It is essential to avoid such operations on any tensor that autograd still needs in order to compute gradients.