Hello,
I’ve got my model set up, but I seem to have made a mistake somewhere: CPU RAM usage keeps growing (by roughly 4.2 - 4.4 GB per epoch) while VRAM stays stable.
I’ve looked at similar posts on this forum and used what I found to try to identify and stop the memory leak. The leak occurs during the training step: commenting out the validation step doesn’t change the rate at which memory fills up. Additionally, my validation step is wrapped in a @torch.no_grad() decorator.
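For completeness, the wrapping looks like this (the validation body is omitted; the eval() call is just the usual pattern, not necessarily verbatim from my code):

    @torch.no_grad()
    def validate(self, dataloaders, module):
        module.eval()
        # ... run the validation batches; no graph is built inside no_grad ...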
While I am storing certain output metrics in a list, I made sure they are detached from the graph before being stored. I also tried adding explicit del statements and calling the garbage collector manually, but neither helped.
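For reference, the detach-before-store and cleanup pattern I tried looks roughly like this (simplified from the training loop below):

    import gc

    # store only a detached CPU copy of the loss
    output['loss'] = output_gpu['loss'].to('cpu').detach()
    outputs.append(output)
    # drop the GPU-side references explicitly and force a collection pass
    del output_gpu, batch
    gc.collect()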
To monitor this, I wrote a little logging tool:
    import gc
    import psutil
    import torch

    def log(epoch, fname):
        # Count all live torch tensors (including parameters via .data)
        count = 0
        for obj in gc.get_objects():
            try:
                if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                    count += 1
            except Exception:
                pass
        # Percentage of system RAM currently in use
        perc_used = psutil.virtual_memory().percent
        # Number of Python variables in local and global scope
        var_num = len(locals()) + len(globals())
        with open(fname, 'a') as log_file:
            log_file.write("{0},{1},{2},{3}\n".format(epoch, perc_used, count, var_num))
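I call this once per epoch, directly after the training step (see the loop below).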
The log showed me that while RAM usage increases linearly with every epoch (~4.2-4.4 GB) until my 32 GB are full, the number of torch tensors and Python variables stays constant (2517 and 21, respectively).
The relevant part of my code looks like this:
    for epoch in range(module.current_epoch, self.max_epochs):
        # Train
        train_output = self.train(train_dataloader, module, optimizer, pointcloud)
        log(epoch, "memory_log_{}.txt".format(timestr))
        # Validation
        validation_output = self.validate(val_dataloaders, module)
        # Check and save model
        if 'poses' in train_output:
            self.check_and_save(module, validation_output, train_output=train_output)
        else:
            self.check_and_save(module, validation_output)
        # Update current epoch
        module.current_epoch += 1
        # Take a scheduler step
        scheduler.step()
        # Delete validation and train output
        del train_output
        del validation_output
    def train(self, dataloader, module, optimizer, pointcloud=None):
        # Set module to train
        module.train()
        # Shuffle dataloader sampler / update epoch
        if hasattr(dataloader.sampler, "set_epoch"):
            dataloader.sampler.set_epoch(module.current_epoch)
        # Prepare progress bar
        progress_bar = self.train_progress_bar(
            dataloader, module.config.datasets.train)
        # Start training loop
        outputs = []
        # Save pose file name
        pose_file = None
        for i, batch in progress_bar:
            # Reset optimizer
            optimizer.zero_grad()
            if module.model.train_requirements['gt_pose']:
                # Check if the pose file is still the same, load a new pose otherwise
                if batch['pose_file'][0] != pose_file:
                    if 'gpu_pose' in locals():
                        del gpu_pose
                    pose_file = batch['pose_file'][0]
                    gpu_pose = batch['full_pose'].to('cuda')
                del batch['full_pose']
            # Send samples to GPU and take a training step
            batch = sample_to_cuda(batch)
            if module.model.train_requirements['gt_pose']:
                batch.update({'full_pose': gpu_pose})
            if pointcloud is not None:
                # Add reference to pointcloud on GPU
                batch.update({'pointcloud': pointcloud})
                output_gpu = module.training_step(
                    batch,
                    sampling_alg=module.config.datasets.pretrain.sampling_alg,
                    pixels_per_frame=module.config.datasets.pretrain.num_pixels_per_frame)
            else:
                output_gpu = module.training_step(batch)
            # Backprop through loss and take an optimizer step
            output_gpu['loss'].backward()
            optimizer.step()
            # Keep only detached CPU copies of the outputs
            output = {}
            output['loss'] = output_gpu['loss'].to('cpu').detach()
            output['metrics'] = {}
            output['metrics']['photometric_loss'] = output_gpu['metrics']['photometric_loss'].to('cpu').detach()
            output['metrics']['smoothness_loss'] = output_gpu['metrics']['smoothness_loss'].to('cpu').detach()
            output['poses'] = [x.to('cpu').detach() for x in output_gpu['poses']]
            del output_gpu
            del batch
            # Append output to list of outputs
            outputs.append(output)
            # Update progress bar if in rank 0
            if self.is_rank_0:
                progress_bar.set_description(
                    'Memory used: {}% | Epoch {} | Avg. Loss {:.4f}'.format(
                        memory_used(), module.current_epoch, self.avg_loss(output['loss'].item())))
        # Return outputs for epoch end
        return module.training_epoch_end(outputs)
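In case it matters: sample_to_cuda and memory_used are small helpers, and I don’t think they are the culprit. A minimal sketch of what they do (not the verbatim implementations):

    def sample_to_cuda(sample):
        # Recursively move all tensors in a (possibly nested) batch to the GPU
        if torch.is_tensor(sample):
            return sample.to('cuda')
        if isinstance(sample, dict):
            return {key: sample_to_cuda(value) for key, value in sample.items()}
        if isinstance(sample, list):
            return [sample_to_cuda(value) for value in sample]
        return sample

    def memory_used():
        # Percentage of system RAM in use, same figure as in the log file
        return psutil.virtual_memory().percent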
What can I do to further investigate the issue?