I am using PyTorch (v1.4.0) for an experiment with the CLEVR dataset and keep running into the GPU going out of memory. The code runs on a V100 GPU with 32 GB of memory. To pinpoint where the memory leak comes from, I kept reducing my code until I arrived at the version below.
In summary, the code creates the data loader and loops over all samples (batch size = 1), doing nothing except printing the index of each sample. In addition, the sample is explicitly deleted and torch.cuda.empty_cache() is called after every iteration.
After about 17,000 samples, the GPU's memory is completely full. How is this possible? It seems the samples are not properly freed, even though, as far as I can tell, I do not keep any references to the tensors.
import json
import os
import random

import torch
from skimage import io  # provides io.imread used in the dataset below
from torch.utils.data import DataLoader, Dataset, Subset


def main(args):
    # Make the dataloader
    loader_kwargs = {
        'questions': args.questions_json,
        'images_dir': args.images_dir,
        'device': args.device,
        'max_samples': args.max_samples,
        'shuffle': args.shuffle_data == 1,
        'batch_size': 1,
        'num_workers': args.num_workers,
        'collate_fn': clevr_collate_fn
    }
    data_loader = get_evaluation_dataloader(**loader_kwargs)

    with torch.no_grad():
        # This function loads .pt files
        # These take 270MB of GPU memory
        modules = get_evaluation_modules()
        # Network holds the modules in a dictionary
        # but is currently not being used
        network = Network(modules)
        evaluator = Evaluator(args, network, data_loader)
        evaluator.run()

def get_evaluation_dataloader(**kwargs):
    dataset_kwargs = {
        'questions': kwargs.pop('questions'),
        'images_dir': kwargs.pop('images_dir'),
        'device': kwargs.pop('device')
    }
    ds = CLEVRDataset(**dataset_kwargs)

    max_samples = kwargs.pop('max_samples')
    if max_samples is not None:
        all_indices = list(range(0, len(ds)))
        subset = random.sample(all_indices, max_samples)
        ds = Subset(ds, subset)

    loader = DataLoader(dataset=ds, **kwargs)
    return loader

class CLEVRDataset(Dataset):

    def __init__(self, questions, images_dir, device, *args, **kwargs):
        super(CLEVRDataset, self).__init__(*args, **kwargs)
        self.images_dir = images_dir
        self.device = device
        with open(questions, 'r') as f:
            data = json.load(f)
        self.questions = data['questions']
        self.image2tensor = ImageToTensor(device)

    def __getitem__(self, idx):
        entry = self.questions[idx]
        # get the image
        img_filename = entry['image_filename']
        img_path = os.path.join(self.images_dir, img_filename)
        image = io.imread(img_path)
        image = self.image2tensor(image)
        # add the image to entry
        entry['image'] = image
        return entry

    def __len__(self):
        return len(self.questions)

def clevr_collate_fn(entries):
    batch = {}
    for key in entries[0].keys():
        if isinstance(entries[0][key], torch.Tensor):
            batch[key] = torch.stack([entry[key] for entry in entries])
        else:
            batch[key] = [entry[key] for entry in entries]
    return batch

class Evaluator(object):

    def __init__(self, args, network, data_loader):
        self.network = network
        self.data_loader = data_loader

    def run(self):
        print("===== Start evaluation =====")
        for i, sample in enumerate(self.data_loader):
            print(i)
            # Explicitly drop the sample and release cached blocks
            del sample
            torch.cuda.empty_cache()
        print("===== Finished evaluation =====")