RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed)

My custom model looks like this: I have weight matrices that are sent through conv layers, and I use those weight matrices to calculate a loss function.

import torch
import torch.nn as nn

class MatrixModel(nn.Module):
  def __init__(self, num_matrices=10, layers=10, img_shape=(32, 32), lamda=1.0) -> None:
    super().__init__()
    self.num_matrices = num_matrices
    self.layers = layers
    self.l = lamda
    self.img_shape = img_shape
    self.modules = []
    self.W = torch.randn((num_matrices, img_shape[1], img_shape[1]))
    for i in range(layers):
      self.modules.append(nn.Conv2d(self.num_matrices, self.num_matrices, kernel_size=3, stride=1, padding=1))
    self.model = nn.Sequential(*self.modules)

  def calculate_loss(self, X, X_w, W, lamda=1.0):
    # data term: 0.5 * ||X - X_w||_F^2
    X_w = X - X_w
    X_w = torch.einsum("ijk,ijk->ijk", X_w, X_w)  # element-wise square
    X_w = torch.einsum("ijk->", X_w)              # sum over all elements
    X_w = 0.5 * X_w
    # penalty term: lamda * sum_{j,k} sqrt(sum_i W[i,j,k]^2)
    W = torch.einsum("ijk,ijk->ijk", W, W)
    W = torch.einsum("ijk->jk", W)
    W = torch.sqrt(W)
    W = torch.einsum("jk->", W)
    W = lamda * W
    return W + X_w

  def forward(self, x):
    self.W = torch.unsqueeze(self.W, 0)
    self.W = self.model(self.W)
    x = torch.mean(x, dim=1)
    self.x_expanded = x.unsqueeze(1)
    self.X_w = torch.matmul(self.x_expanded, self.W).sum(dim=1)
    self.W = torch.squeeze(self.W)
    loss = self.calculate_loss(x, self.X_w, self.W, self.l)
    return loss

My training code looks like this:

import torch.optim as optim

model = MatrixModel(num_matrices=num_matrices, layers=layers, img_shape=(32, 32), lamda=lamda)
optimizer = optim.SGD(model.parameters(), lr=1)

for i in range(epochs):
  model.train()
  train_running_loss = 0.0
  test_running_loss = 0.0
  best_test_loss = float('inf')
  for i,(img,tar) in enumerate(train_loader):
    optimizer.zero_grad()
    img = img.to(device)
    loss = model(img)
    loss.backward()
    optimizer.step()

When I run this code I get the error in the title, and if I add loss.backward(retain_graph=True) I instead get the following error:
" RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [10, 10, 3, 3]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True)."

What is the solution for this?

It seems you are assigning some intermediate tensors to self, which would keep (and grow) the computation graph between iterations. I’m not in front of my workstation so I cannot check which variable is causing the issue, but you could try to detach these tensors before the assignment.
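For example, a minimal sketch of the pattern (the Toy module, its shapes, and the optimizer settings are made up for illustration; the point is the .detach() before the assignment):

import torch
import torch.nn as nn

class Toy(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin = nn.Linear(4, 4)
    self.W = torch.randn(1, 4)

  def forward(self):
    out = self.lin(self.W)
    # detach before storing on the module, so the next iteration only reuses
    # the values and does not try to backprop through the old, already freed graph
    self.W = out.detach()
    return out.sum()

toy = Toy()
optimizer = torch.optim.SGD(toy.parameters(), lr=0.1)
for _ in range(3):
  optimizer.zero_grad()
  loss = toy()
  loss.backward()  # without the .detach() the 2nd call would raise the error above
  optimizer.step()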

I replaced some of the self initialisations, and the current code looks like this:

class MatrixModel(nn.Module):
  def __init__(self, num_matrices=10, layers=10, img_shape=(32, 32), lamda=1.0) -> None:
    super().__init__()
    self.l = lamda
    modules = []
    self.W = torch.randn((num_matrices, img_shape[1], img_shape[1]))
    for i in range(layers):
      modules.append(nn.Conv2d(num_matrices, num_matrices, kernel_size=3, stride=1, padding=1))
    self.model = nn.Sequential(*modules)

  def calculate_loss(self, X, X_w, W, lamda=1.0):
    X_w = X - X_w
    X_w = torch.einsum("ijk,ijk->ijk", X_w, X_w)
    X_w = torch.einsum("ijk->", X_w)
    X_w = 0.5 * X_w
    W = torch.einsum("ijk,ijk->ijk", W, W)
    W = torch.einsum("ijk->jk", W)
    W = torch.sqrt(W + 1e-6)
    W = torch.einsum("jk->", W)
    W = lamda * W
    return W + X_w

  def forward(self, x):
    W_exp = torch.unsqueeze(self.W, 0)
    W_exp = torch.nan_to_num(W_exp, nan=0.0)
    W_exp = self.model(W_exp)
    W_exp = torch.nan_to_num(W_exp, nan=0.0)
    x = torch.mean(x, dim=1)
    x_expanded = x.unsqueeze(1)
    X_w = torch.matmul(x_expanded, W_exp).sum(dim=1)
    self.W = torch.squeeze(W_exp)
    loss = self.calculate_loss(x, X_w, W_exp.squeeze(), self.l)
    return loss

But I’m getting this error when I run with torch.autograd.set_detect_anomaly(True):
"RuntimeError: Function 'ConvolutionBackward0' returned nan values in its 1th output."
I have replaced all the nan values before I pass them to the conv layers. What could be the issue here?

I would really appreciate a solution for this.

Calling nan_to_num won’t fix issues in the backward pass if the forward pass is already broken:

import torch

x = torch.zeros(2, requires_grad=True)
output = torch.log(x)                       # log(0) -> -inf
output = torch.log(output)                  # log(-inf) -> nan
output = torch.nan_to_num(output, nan=0.0)  # masks the nan in the forward pass only
output.mean().backward()
print(x.grad)
# tensor([nan, nan])
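The invalid value has to be avoided in the forward pass itself, e.g. (just a sketch with an arbitrary eps) by keeping the argument of log away from zero instead of masking the result afterwards:

import torch

eps = 1e-6
x = torch.zeros(2, requires_grad=True)
output = torch.log(x + eps)             # argument stays positive, no -inf
output = torch.log(output.abs() + eps)  # keep the second log valid as well
output.mean().backward()
print(torch.isfinite(x.grad).all())
# tensor(True)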

But I’m passing a tensor which has no nan values to the convolutional filters, so why is this happening?

I don’t know; you would need to narrow down which operation creates the NaN outputs, e.g. by checking the intermediate activations via torch.isfinite(x).all().

Thanks Peter. One small question: how do I ensure that there are no NaNs in the forward pass?

You could use the approach I proposed above and check the input as well as all intermediate activations via torch.isfinite(x).all().
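E.g. something like this (a rough sketch using forward hooks; it assumes every submodule returns a single tensor, which is the case for the Conv2d layers here):

import torch

def make_check(name):
  def hook(module, inputs, output):
    # raise as soon as any activation contains NaN or Inf
    if not torch.isfinite(output).all():
      raise RuntimeError(f"non-finite values in the output of {name}")
  return hook

for name, module in model.named_modules():
  module.register_forward_hook(make_check(name))

# also check the input itself before the forward pass
assert torch.isfinite(img).all()
loss = model(img)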

Thanks a lot, Peter, for the quick reply!