RuntimeError: Trying to backward through the graph a second time when using a DataLoader

I built a small model:

import torch
from torch import nn

class TinyVGG(nn.Module):
    def __init__(self,input_features,
                 output_features,
                 hidden_units,
                 len_classes) -> None:
        super().__init__()

        self.conv_relu_layer_1 = nn.Sequential(
            nn.Conv2d(in_channels=input_features,
                      out_channels=output_features,
                      kernel_size=3,
                      padding=1,
                      stride=1),
            nn.ReLU()
        )
        self.conv_relu_maxpool_1 = nn.Sequential(
            nn.Conv2d(in_channels=output_features,
                      out_channels=output_features,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3)
        )
        self.conv_relu_layer_2 = nn.Sequential(
            nn.Conv2d(in_channels=output_features,
                      out_channels=output_features,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU()
        )
        self.conv_relu_maxpool_2 = nn.Sequential(
            nn.Conv2d(in_channels=output_features,
                      out_channels=output_features,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=hidden_units*3*3,
                      out_features=len_classes)
        )
    def forward(self, x):
        x = self.conv_relu_layer_1(x)
        x = self.conv_relu_maxpool_1(x)
        x = self.conv_relu_layer_2(x)
        x = self.conv_relu_maxpool_2(x)
        x = self.classifier(x)

        return x

I tried to train it with:

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=tinyvgg_GPU.parameters(),
                             lr=0.01)

epochs = 5

for epoch in range(epochs):
    accuracy=0; loss=0
    for data, target in train_dataloader:
        tinyvgg_GPU.train()
        
        data = data.to(device)
        target = target.to(device)
    
        y_logits = tinyvgg_GPU(data)
        
        loss += loss_fn(y_logits, target)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()
    
    loss = loss/len(train_dataloader)

    tinyvgg_GPU.eval()
    with torch.inference_mode():
        test_loss = 0; test_accuracy = 0
        for data, target in test_dataloader:

            data = data.to(device)
            target = target.to(device)

            test_logits = tinyvgg_GPU(data)

            test_loss += loss_fn(test_logits, target)

        test_loss = test_loss/len(test_dataloader)

    print(f"Epoch: {epoch} | loss: {loss} | test_loss : {test_loss}")

This threw:
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
I found a reference model online:

import torch
from torch import nn
class MNIST_model(torch.nn.Module):
  """Model capable of predicting on MNIST dataset.
  """
  def __init__(self, input_shape: int, hidden_units: int, output_shape: int): 
    super().__init__()
    self.conv_block_1 = nn.Sequential(
      nn.Conv2d(in_channels=input_shape, 
                out_channels=hidden_units,
                kernel_size=3,
                stride=1,
                padding=1), 
      nn.ReLU(),
      nn.Conv2d(in_channels=hidden_units,
                out_channels=hidden_units,
                kernel_size=3,
                stride=1,
                padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2)
    )
    self.conv_block_2 = nn.Sequential(
      nn.Conv2d(in_channels=hidden_units,
                out_channels=hidden_units,
                kernel_size=3,
                stride=1,
                padding=1),
      nn.ReLU(),
      nn.Conv2d(in_channels=hidden_units,
                out_channels=hidden_units,
                kernel_size=3,
                stride=1,
                padding=1),
      nn.ReLU(),
      nn.MaxPool2d(kernel_size=2)                   
    )
    self.classifier = nn.Sequential(
      nn.Flatten(),
      nn.Linear(in_features=hidden_units*7*7,
                out_features=output_shape)
    )

  def forward(self, x):
    x = self.conv_block_1(x)
    x = self.conv_block_2(x)
    x = self.classifier(x)
    return x

and the code to train it:

# %%time
from tqdm.auto import tqdm 

device = "cuda" if torch.cuda.is_available() else "cpu"

# Train on GPU
model_gpu = MNIST_model(input_shape=1,
                        hidden_units=10,
                        output_shape=10).to(device)

# Create a loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_gpu.parameters(), lr=0.1)

# Training loop
epochs = 5
for epoch in tqdm(range(epochs)):
  train_loss = 0
  model_gpu.train()
  for batch, (X, y) in enumerate(train_dataloader):
    # Put data on target device
    X, y = X.to(device), y.to(device)

    # Forward pass
    y_pred = model_gpu(X)

    # Loss calculation
    loss = loss_fn(y_pred, y)
    train_loss += loss

    # Optimizer zero grad
    optimizer.zero_grad()

    # Loss backward
    loss.backward()

    # Step the optimizer
    optimizer.step()
  
  # Adjust train loss to number of batches
  train_loss /= len(train_dataloader)

  ### Testing loop
  test_loss_total = 0
  # Put model in eval mode and turn on inference mode
  model_gpu.eval()
  with torch.inference_mode():
    for batch, (X_test, y_test) in enumerate(test_dataloader):
      # Make sure test data on target device
      X_test, y_test = X_test.to(device), y_test.to(device)
      
      test_pred = model_gpu(X_test)
      test_loss = loss_fn(test_pred, y_test)

      test_loss_total += test_loss

    # Adjust test loss total for number of batches
    test_loss_total /= len(test_dataloader)
  
  # Print out what's happening
  print(f"Epoch: {epoch} | Loss: {train_loss:.3f} | Test loss: {test_loss_total:.3f}")

The reference model trains without any problem, so I don't understand what is going wrong in mine. Can someone please explain? Thanks in advance.

I have solved the problem myself. Changing

loss += loss_fn(y_logits, target)

to

loss = loss_fn(y_logits, target) 
train_loss += loss

fixed it, but can someone explain the difference, please?

loss += loss_fn(...) accumulates the loss tensor together with its whole history and thus keeps the computation graph of every previous batch alive. The next loss.backward() call then tries to backpropagate through all of these iterations and raises the error, since the intermediate activations of the previous iterations were already freed by the earlier backward() calls.
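Here is a minimal, self-contained sketch (a toy example, not the poster's model) of the same pattern; the second iteration fails for exactly this reason:

import torch

w = torch.randn(3, requires_grad=True)
loss = 0
for step in range(2):
    x = torch.randn(3)
    # the running sum drags along the graph of every earlier iteration
    loss += (w * x).sum()
    # step 0 works; step 1 raises the RuntimeError, because the graph
    # built in step 0 was already freed by the first backward() call
    loss.backward()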

Your new code still accumulates the loss tensor in train_loss and will thus keep all computation graphs alive and increase memory usage. If you only want to track the loss value, use train_loss += loss.detach() or train_loss += loss.item() instead.
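Applied to the loop above, a sketch of the corrected epoch (same variable names as in the question, with the running loss kept as a plain Python float) could look like:

train_loss = 0
for data, target in train_dataloader:
    tinyvgg_GPU.train()

    data, target = data.to(device), target.to(device)

    y_logits = tinyvgg_GPU(data)
    loss = loss_fn(y_logits, target)   # fresh loss tensor, graph covers only this batch

    optimizer.zero_grad()
    loss.backward()                    # frees only this batch's graph
    optimizer.step()

    train_loss += loss.item()          # .item() returns a float, so no graph is kept alive

train_loss /= len(train_dataloader)    # average training loss for the epoch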
