Hi, running the model with the code below gives me a memory leak when I’m running on CPU.
I created a fake dataloader to remove it from the possible causes.
At each batch, RAM usage increases slightly until it reaches full capacity and the process is killed.
It’s even worse when I add the other parts of my network (a generator and a discriminator based on the same blocks).
I tried with and without backprop and the optimizer step; the problem is still the same. When I try to backprop twice I get the usual error, which means the graph is being freed.
I guess it comes from some variable in the graph that is not freed, but after reading the code many times I cannot find where…
Thanks in advance for your help.
class testLoader(Dataset):
    """Synthetic dataset that yields freshly sampled random tensors.

    It stands in for the real data pipeline so that data loading can be
    excluded as a cause of the memory growth. Every item is a 4-tuple that
    the training loop unpacks as
    ``(gt_im, gt_landmarks, context_tensors, itemIds)``.
    """

    def __init__(self):
        super().__init__()

    def __len__(self):
        # Fixed, arbitrary number of samples reported to the DataLoader.
        return 36237

    def __getitem__(self, index):
        # ``index`` is ignored: each access draws new random values.
        # The draws are made in the same order as the returned tuple.
        gt_im = torch.rand(3, 224, 224)
        gt_landmarks = torch.rand(3, 224, 224)
        context_tensors = torch.rand(6, 224, 224)
        item_id = torch.randint(0, 118, (1,))  # integer in [0, 118)
        return (gt_im, gt_landmarks, context_tensors, item_id)
# Random data instead of the real dataset, served by 8 worker processes
# in shuffled batches of 2.
datas = testLoader()
train_loader = DataLoader(
    datas,
    batch_size=2,
    shuffle=True,
    num_workers=8,
)

# NOTE(review): anomaly detection is a debugging aid that records extra
# information for every forward op. Consider turning it off while
# measuring memory growth, to confirm whether it contributes.
torch.autograd.set_detect_anomaly(True)

optimizerEmb = Adam(emb.parameters(), lr=0.001)

for i_epoch in range(2):
    for i_batch, batch in enumerate(train_loader):
        # Only ``context_tensors`` is consumed below; the other three
        # entries are unpacked but unused in this reduced example.
        gt_im, gt_landmarks, context_tensors, itemIds = batch

        optimizerEmb.zero_grad()
        embeddings = emb(context_tensors)

        # Dummy objective (norm of the embeddings) used only to drive a
        # backward pass.
        loss = embeddings.norm()
        loss.backward()

        # The optimizer step was tried as well.
        optimizerEmb.step()
Here, emb is this model:
class Embedder(nn.Module):
    """Map a 6-channel image tensor to a 512-dimensional embedding.

    The network is five ``ResidualBlockDown`` stages with an ``Attention``
    layer inserted after the second stage, followed by a sum over all
    spatial positions.
    """

    def __init__(self):
        super().__init__()
        # (in_channels, out_channels) of residual1 .. residual5, registered
        # in this order so parameter ordering is unchanged.
        channel_plan = ((6, 64), (64, 128), (128, 256), (256, 512), (512, 512))
        for idx, (c_in, c_out) in enumerate(channel_plan, start=1):
            stage = ResidualBlockDown(c_in, c_out, norm=True, learn=False)
            setattr(self, f"residual{idx}", stage)
        # Attention operates on the 128-channel output of residual2.
        self.attention = Attention(128)

    def forward(self, x):
        """Return the embedding for ``x``.

        ``x`` has 6 channels (b, 6, H, W). The shape comments below assume
        a 224x224 input, with each residual stage halving the spatial size.
        The result has shape (b, 512).
        """
        out = self.residual1(x)      # b, 64, 112, 112
        out = self.residual2(out)    # b, 128, 56, 56
        out = self.attention(out)    # b, 128, 56, 56
        for stage in (self.residual3, self.residual4, self.residual5):
            out = stage(out)         # ends at b, 512, 7, 7
        batch, channels = out.size(0), out.size(1)
        # Collapse the spatial grid and sum over it: b, 512
        return out.view(batch, channels, -1).sum(dim=2)
My attention class (spectral_norm is the one from PyTorch, torch.nn.utils.spectral_norm):
class Attention(nn.Module):
    """Self-attention over spatial positions with a residual connection.

    Three spectrally-normalised 1x1 convolutions produce the maps ``f``,
    ``g`` and ``h``. Each map is flattened to (b, c, n) with ``n = H * W``,
    so the attention map relates every spatial position to every other one.

    The previous implementation called ``torch.matmul`` directly on the 4-D
    conv outputs. That multiplied each HxW channel map as a matrix, only ran
    when H == W, and did not compute attention between positions.

    Args:
        in_channels: number of channels of the input (and of the output).
    """

    def __init__(self, in_channels):
        super(Attention, self).__init__()
        self.convF = spectral_norm(nn.Conv2d(in_channels, in_channels,
                                             kernel_size=1, padding=0,
                                             stride=1, bias=False))
        self.convG = spectral_norm(nn.Conv2d(in_channels, in_channels,
                                             kernel_size=1, padding=0,
                                             stride=1, bias=False))
        self.convH = spectral_norm(nn.Conv2d(in_channels, in_channels,
                                             kernel_size=1, padding=0,
                                             stride=1, bias=False))
        # Applied to the (b, n, n) energy: normalises over the first
        # position axis, so each column of the attention map sums to 1.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return ``x + attention(x)``; the output has the shape of ``x``.

        Args:
            x: tensor of shape (b, c, H, W); H and W may differ.
        """
        batch, channels, height, width = x.shape

        # Flatten the spatial grid: (b, c, H, W) -> (b, c, n).
        f = self.convF(x).flatten(2)
        g = self.convG(x).flatten(2)
        h = self.convH(x).flatten(2)

        # energy[b, i, j] = <f[b, :, i], g[b, :, j]>  -> (b, n, n).
        # NOTE: this map grows with n**2 (n = H * W), which dominates the
        # memory cost of this layer for large feature maps.
        energy = torch.matmul(f.transpose(1, 2), g)
        attn_map = self.softmax(energy)

        # Weighted sum of h over positions i for every output position j.
        attn = torch.matmul(h, attn_map)  # b, c, n
        return x + attn.reshape(batch, channels, height, width)
Finally, the ResidualBlockDown:
class ResidualBlockDown(nn.Module):
    """Residual block that halves the spatial resolution.

    Main path: conv3x3 -> [InstanceNorm] -> ReLU -> conv3x3 ->
    [InstanceNorm] -> ReLU -> 2x2 average pooling.
    Shortcut path: conv1x1 (to match the channel count) -> 2x2 average
    pooling. The two paths are summed.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the output tensor.
        norm: apply the instance-norm layers when True.
        learn: passed as ``affine`` to the instance-norm layers.
    """

    def __init__(self, in_channels, out_channels, norm=True, learn=True):
        super().__init__()
        self.norm = norm

        def sn_conv(c_in, kernel_size, padding):
            # Bias-free convolution to ``out_channels`` with spectral norm.
            conv = nn.Conv2d(c_in, out_channels, kernel_size=kernel_size,
                             padding=padding, bias=False)
            return spectral_norm(conv)

        # Creation order is kept so parameter initialisation is unchanged.
        self.conv1 = sn_conv(in_channels, 3, 1)
        self.conv2 = sn_conv(out_channels, 3, 1)
        self.adaDim = sn_conv(in_channels, 1, 0)

        self.relu = nn.ReLU()
        self.avgPool = nn.AvgPool2d(kernel_size=2)

        self.in1 = nn.InstanceNorm2d(out_channels, affine=learn)
        self.in2 = nn.InstanceNorm2d(out_channels, affine=learn)

    def _norm_relu(self, tensor, norm_layer):
        # Optional instance norm followed by ReLU.
        if self.norm:
            tensor = norm_layer(tensor)
        return self.relu(tensor)

    def forward(self, x):
        """Return the downsampled sum of the main and shortcut paths."""
        shortcut = self.avgPool(self.adaDim(x))

        main = self._norm_relu(self.conv1(x), self.in1)
        main = self._norm_relu(self.conv2(main), self.in2)
        main = self.avgPool(main)

        main += shortcut
        return main