Creating redundant variables in the forward pass does not incur additional memory cost?

Here I intentionally added an extra dummy variable, but the memory reported by torch.cuda.memory_allocated() is the same before and after adding it.
I’ve also observed that using del to delete intermediates that are no longer needed doesn’t reduce the reported memory usage.
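
For example (a minimal standalone sketch, not my actual model code), this is roughly what I mean by del not helping:

import torch

a = torch.randn(1024, 1024, device='cuda', requires_grad=True)
b = torch.sin(a)
c = torch.sin(b)
print(torch.cuda.memory_allocated())  # reflects a, b and c
del b                                 # drop the Python name...
print(torch.cuda.memory_allocated())  # ...but the reported number does not go down

The model where I added the dummy variable is below: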

import torch
import torch.nn as nn
import torch.nn.functional as F
# from pytorch_wavelets import Learnable2D
import math
import os
import sys
import pdb
import pywt
from pytorch_wavelets import DWTForward, DWTInverse

class volumeNet(nn.Module):
	def __init__(self, nlevel, wave, inchannel, outchannel, learnable_wave, transform, mode):
		super(volumeNet, self).__init__()
		self.nlevel = nlevel
		self.wave = wave
		self.mode = mode
		self.learnable_wave = learnable_wave
		
		self.transform = transform                                          
		self.inchannel = inchannel
		self.outchannel = outchannel
		print("model inchannel:", inchannel)
		print("model outchannel:", outchannel)

		self.approx_conv1 = nn.Conv3d(inchannel, 64, 3, 1, padding='same')
		self.approx_conv2 = nn.Conv3d(64, 128, 3, 1, padding='same')
		self.approx_conv3 = nn.Conv3d(128, 128, 3, 1, padding='same')
		self.approx_conv4 = nn.Conv3d(128, 64, 3, 1, padding='same')
		self.approx_conv5 = nn.Conv3d(64, outchannel, 3, 1, padding='same')

	def forward(self, x, verbose=False, autocast=False):
		device = "cuda" if x.is_cuda else "cpu"
		with torch.autocast(device, enabled=autocast):
			return self._forward(x)

	def _forward(self, x):
		la = x
		la = self.approx_conv1(la)
		# la = F.relu(la)
		la = torch.sin(la)
		la = self.approx_conv2(la)
		# la = F.relu(la)
		la = torch.sin(la)
		la = self.approx_conv3(la)
		# la = F.relu(la)
		la = torch.sin(la)
		la = self.approx_conv4(la)
		# la = F.relu(la)
		dummy = torch.sin(la)
		dummy = self.approx_conv5(dummy)
		signal = dummy
		breakpoint()
		return signal
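
For reference, this is roughly how I compare the memory (the constructor arguments and input shape below are placeholders, not my real configuration): I run the forward pass, read torch.cuda.memory_allocated() at the breakpoint(), and compare that value against a run where the last two assignments use la instead of dummy.

# placeholder arguments and input shape -- not my real configuration
net = volumeNet(nlevel=2, wave='db2', inchannel=1, outchannel=1,
                learnable_wave=False, transform=None, mode='zero').cuda()
x = torch.randn(1, 1, 64, 64, 64, device='cuda')
out = net(x)  # execution stops at the breakpoint() in _forward,
              # where I call torch.cuda.memory_allocated()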

Which redundant variables are you referring to?
In the forward method you are re-assigning the intermediates to la and later to dummy.
Are you expecting to see a difference if you replaced dummy with la? If so, note that Autograd will keep intermediate tensors alive as long as they are needed for the gradient computation in the backward call.
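
As a minimal, self-contained illustration (not your model; it needs a CUDA device): whether you keep rebinding la or introduce a new name such as dummy, exactly the same tensors are created, and the intermediate stays allocated either way because torch.sin saves its input for the backward pass:

import torch

def measure(reuse_name):
    base = torch.cuda.memory_allocated()
    x = torch.randn(64, 64, 64, device='cuda', requires_grad=True)
    la = torch.sin(x)          # sin saves its input (x) for backward
    if reuse_name:
        la = torch.sin(la)     # rebinding 'la' does not free the previous output:
                               # it was saved for this sin's backward
    else:
        dummy = torch.sin(la)  # the extra name allocates nothing on top of that
    return torch.cuda.memory_allocated() - base

print(measure(reuse_name=True))   # both variants report the same delta:
print(measure(reuse_name=False))  # x plus two sin outputs

torch.cuda.memory_allocated() only drops once these intermediates are no longer referenced, e.g. after the backward call frees the graph or once all references to the outputs are gone.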

Thanks for replying. Yes, I was expecting to see an increase in memory from adding dummy, but that doesn’t seem to be the case.