GPU Memory increases after each fold

After each fold, my GPU memory usage increases. This is the code I used:

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader,Dataset
import numpy as np
from tqdm import tqdm

class TestDataset(Dataset):
    def __init__(self, X, y, weights=None):
        self.X = X
        self.y = y
        self.weights = weights
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        ret = dict(X=self.X[i], y=self.y[i])
        if self.weights is not None:
            ret.update(dict(weights=self.weights[i]))
        return ret

class MyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(128, 64, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(32, 3, kernel_size=3, stride=1, padding=1)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        return x

X = np.random.rand(100, 3, 512, 512).astype(np.float32)
y = np.random.randint(3, size=(100, 512, 512))
weights = np.ones_like(y).astype(np.float32)

If I don’t provide the weight maps and instead create them on the fly via the batch.get("weights", …) default, the GPU memory usage increases after each fold (each iteration of the outer loop):

for _ in range(10):
    model = MyNet().cuda()
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.NLLLoss(reduce=False)
    train_dl = DataLoader(TestDataset(X, y), batch_size=1)
    for epoch in range(1):
        for batch in tqdm(train_dl):
            inp = Variable(batch['X']).cuda()
            pred = F.log_softmax(model(inp), dim=1)
            target = Variable(batch['y']).long().cuda()
            loss = criterion(pred, target.squeeze(1))
            # the dataset did not provide weights, so the ones_like(loss) default is used
            weights_ = batch.get("weights", torch.ones_like(loss)).cuda()
            optimizer.zero_grad()
            torch.autograd.backward(loss, weights_)
            optimizer.step()

But when the weight maps are loaded from the dataset, everything is fine:

for _ in range(10):
    model = MyNet().cuda()
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.NLLLoss(reduce=False) 
    train_dl = DataLoader(TestDataset(X, y, weights=weights), batch_size=1)
    for epoch in range(1):
        for batch in tqdm(train_dl):
            inp = Variable(batch['X']).cuda()
            pred = F.log_softmax(model(inp), dim=1)
            target = Variable(batch['y']).long().cuda()
            loss = criterion(pred, target.squeeze(1))        
            # the dataset provides weights here, so batch["weights"] is used instead of the default
            weights_ = batch.get("weights", torch.ones_like(loss)).cuda()
            optimizer.zero_grad()
            torch.autograd.backward(loss, weights_)
            optimizer.step()

What is causing this to happen? I’m using Python 3.6.4 and PyTorch 0.3.1.post2.


Hi,

I ran both your code samples with both Python 2.7.13 and 3.6.5, with PyTorch compiled from master, and could not reproduce the memory increase. So this has possibly already been fixed in master.
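If you want to verify whether the increase is still there on a given build, one option is to print the allocated memory after each fold. Note this is only a sketch: torch.cuda.memory_allocated() and torch.cuda.memory_cached() exist on newer builds (0.4 / master), not on 0.3.1, where you would have to watch nvidia-smi instead.

import torch

# Rough sketch for tracking GPU memory per fold on a recent build (0.4 / master);
# these functions are not available in 0.3.1.
def report_gpu_memory(tag):
    allocated = torch.cuda.memory_allocated() / 1024 ** 2  # MiB currently held by tensors
    cached = torch.cuda.memory_cached() / 1024 ** 2        # MiB held by the caching allocator
    print("{}: allocated {:.1f} MiB, cached {:.1f} MiB".format(tag, allocated, cached))

for fold in range(10):
    # ... build the model and run the training loop exactly as in the snippets above ...
    report_gpu_memory("after fold {}".format(fold))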
I guess a temporary fix would be to use an if statement to check whether "weights" is in the batch, or to make your dataset always fill this field, even if it’s just a tensor full of ones.
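A minimal sketch of both options, reusing the imports and setup from your snippets; the class name WeightedTestDataset is only illustrative, not something from your code:

# Option 1: replace the batch.get(...) line in the training loop with an explicit
# check, so the all-ones default is only created when it is really needed.
if "weights" in batch:
    weights_ = batch["weights"].cuda()
else:
    weights_ = torch.ones(loss.size()).cuda()

# Option 2: have the Dataset always return a "weights" entry, defaulting to an
# all-ones map, so the key is always present in the batch.
class WeightedTestDataset(Dataset):
    def __init__(self, X, y, weights=None):
        self.X = X
        self.y = y
        self.weights = weights
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        if self.weights is not None:
            w = self.weights[i]
        else:
            w = np.ones(self.y[i].shape, dtype=np.float32)
        return dict(X=self.X[i], y=self.y[i], weights=w)

With option 2 the original training loop can stay exactly as it is, since batch["weights"] is always filled.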