Hi
I am currently facing a memory leak when training multiple models with different architectures. I use the same DataLoader for every model and sample new student models with different architectures in a loop; the current student network is then trained in the training loop and evaluated.
I noticed that the leak occurs within the training loop, which occupies more resources with each iteration, even though I delete the student architecture at the end…
Here is my outer loop where the student gets sampled:
while not self.controller.has_converged():
    rollouts = []
    logging.info("Iteration {}".format(iteration))
    logging.info("Memory reserved early start: {}".format(torch.cuda.memory_reserved(self.gpu_id)))
    for t in range(self.num_rollouts_per_iteration):
        logging.info("Loading student...")
        model_params, hp_params = self.controller.sample()
        hp_state = [self.controller.hpspace.get_hyperparameters(tuple([int(h[0]), int(h[1])])) for h in hp_params]
        hp_state = {'optimizer': hp_state[0][0], 'lr': hp_state[0][1]}
        # sample new student here
        student, model_dict, student_actions = self.controller.archspace[[int(i) for i in model_params]]
        student_id = t  # the rollout index serves as the student id
        logging.info("Timestep {}".format(t))
        logging.info("Training student {} for {} epochs".format(t, self.train_epochs))
        logging.info("Student is: {}".format(student_actions))
        logging.info("Memory reserved before training: {}".format(torch.cuda.memory_reserved(self.gpu_id)))
        for epoch in range(self.train_epochs):
            self._train_student(student, hp_state, epoch, self.train_epochs, student_id)
        logging.info("Memory reserved after training: {}".format(torch.cuda.memory_reserved(self.gpu_id)))
        logging.info("Evaluating student quality...")
        acc_top1, acc_top5, cm = self._eval_student(student, student_id)
        quality = acc_top1
        # rollouts.append([model_params, hp_params, quality, student_actions])
        logging.info("Student quality is {}".format(quality))
        # free gpu
        del model_params, hp_params
        student = student.cpu()
        del student
        torch.cuda.empty_cache()
        gc.collect()
        logging.info("Memory reserved after cleanup: {}".format(torch.cuda.memory_reserved(self.gpu_id)))
And here is my “normal” training loop:
def _train_student(self, student, hyperparameters, epoch, max_epoch, student_id):
    if self.gpu_id is not None:  # explicit None check so device id 0 also works
        student.to(self.gpu_id)
    # logging.info("Memory allocated in training loop: {}".format(torch.cuda.memory_allocated(self.gpu_id)))
    student.train()
    # logging.info("Memory allocated in training loop: {}".format(torch.cuda.max_memory_allocated(self.gpu_id)))
    # sample optimizer
    optimizer_fn = hyperparameters['optimizer']
    optimizer_args = self.args.optimizer_args[hyperparameters['optimizer'].__name__]
    optimizer_args['lr'] = hyperparameters['lr']
    optimizer = optimizer_fn(params=student.parameters(), **optimizer_args)
    start_train_time = time()
    num_top1, num_sample = 0, 0
    train_iter = self.train_loader if self.no_progress_bar else \
        tqdm(self.train_loader, leave=True, desc="Train student {}".format(student_id))
    for num, (x, y, _) in enumerate(train_iter):
        optimizer.zero_grad()
        x = x.float().to(self.gpu_id)
        y = y.long().to(self.gpu_id)
        # put data through student
        out, _ = student(x)
        loss = self.loss_func(out, y)
        loss.backward()
        optimizer.step()
        # Calculating accuracies
        num_sample += x.size(0)
        reco_top1 = out.max(1)[1]
        num_top1 += reco_top1.eq(y).sum().item()
        # Progress logging
        ...
    # Zero out gradients and optimizer states
    student.zero_grad()
    optimizer.zero_grad()
    optimizer.state.clear()
    del loss, out, x, y
    gc.collect()
    torch.cuda.empty_cache()
    # Showing Train Results
    ...
    del optimizer
    gc.collect()
    torch.cuda.empty_cache()
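On top of the commented-out memory_allocated logging above, I could also dump the caching allocator's state between students with torch.cuda.memory_summary, to see which pools keep growing (again just a diagnostic sketch, not part of my training code):

def log_gpu_state(self):
    # Abbreviated summary of PyTorch's caching allocator on this device;
    # useful for spotting blocks that survive a student's cleanup.
    logging.info(torch.cuda.memory_summary(self.gpu_id, abbreviated=True))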
And here is my log file, which shows that each new student keeps allocating more and more resources on the GPU after each epoch.
[ 2023-03-28 16:49:12,542 ] Iteration 0
[ 2023-03-28 16:49:12,542 ] Memory reserved early start: 2097152
[ 2023-03-28 16:49:12,542 ] Loading student...
[ 2023-03-28 16:49:12,556 ] Timestep 0
[ 2023-03-28 16:49:12,556 ] Training student 0 for 1 epochs
[ 2023-03-28 16:49:12,556 ] Student is: {'blocks_in': 3, 'depth_in': 3, 'stride_in': 1, 'blocks_main': 3, 'depth_main': 1, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 3, 'reduct_ratio': 1, 'act': 'relu', 'att_lay': 'fa', 'conv_lay': 'Bottleneck', 'init_lay': 64, 'drop_prob': 0.25}
[ 2023-03-28 16:49:12,556 ] Memory reserved before training: 2097152
[ 2023-03-28 16:49:25,708 ] Epoch: 1/1, Training accuracy: 2/200(1.00%), Training time: 13.15s
[ 2023-03-28 16:49:25,708 ]
[ 2023-03-28 16:49:25,727 ] Memory reserved after training: 100663296
[ 2023-03-28 16:49:25,727 ] Evaluating student quality...
[ 2023-03-28 16:49:28,192 ] Top-1 accuracy: 4/200(2.00%), Top-5 accuracy: 21/200(10.50%), Mean loss:5.4947
[ 2023-03-28 16:49:28,192 ] Evaluating time: 2.46s, Speed: 162.29 sequnces/(second*GPU)
[ 2023-03-28 16:49:28,192 ]
[ 2023-03-28 16:49:28,201 ] Student quality is 0.02
[ 2023-03-28 16:49:28,239 ] Memory reserved after del: 2097152
[ 2023-03-28 16:49:28,239 ] Loading student...
[ 2023-03-28 16:49:28,248 ] Timestep 1
[ 2023-03-28 16:49:28,248 ] Training student 1 for 1 epochs
[ 2023-03-28 16:49:28,248 ] Student is: {'blocks_in': 3, 'depth_in': 3, 'stride_in': 1, 'blocks_main': 1, 'depth_main': 3, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 1, 'reduct_ratio': 3, 'act': 'relu6', 'att_lay': 'fa', 'conv_lay': 'Sep', 'init_lay': 64, 'drop_prob': 0.2}
[ 2023-03-28 16:49:28,248 ] Memory reserved before training: 2097152
[ 2023-03-28 16:49:32,312 ] Epoch: 1/1, Training accuracy: 4/200(2.00%), Training time: 4.06s
[ 2023-03-28 16:49:32,312 ]
[ 2023-03-28 16:49:32,331 ] Memory reserved after training: 14680064
[ 2023-03-28 16:49:32,331 ] Evaluating student quality...
[ 2023-03-28 16:49:33,031 ] Top-1 accuracy: 1/200(0.50%), Top-5 accuracy: 17/200(8.50%), Mean loss:4.0946
[ 2023-03-28 16:49:33,031 ] Evaluating time: 0.70s, Speed: 571.85 sequnces/(second*GPU)
[ 2023-03-28 16:49:33,031 ]
[ 2023-03-28 16:49:33,035 ] Student quality is 0.005
[ 2023-03-28 16:49:33,065 ] Memory reserved after del: 2097152
[ 2023-03-28 16:49:33,065 ] Loading student...
[ 2023-03-28 16:49:33,076 ] Timestep 2
[ 2023-03-28 16:49:33,076 ] Training student 2 for 1 epochs
[ 2023-03-28 16:49:33,076 ] Student is: {'blocks_in': 3, 'depth_in': 1, 'stride_in': 1, 'blocks_main': 2, 'depth_main': 2, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 1, 'reduct_ratio': 1, 'act': 'hardswish', 'att_lay': 'ca', 'conv_lay': 'Bottleneck', 'init_lay': 32, 'drop_prob': 0.15}
[ 2023-03-28 16:49:33,076 ] Memory reserved before training: 2097152
[ 2023-03-28 16:49:39,102 ] Epoch: 1/1, Training accuracy: 7/200(3.50%), Training time: 6.02s
[ 2023-03-28 16:49:39,102 ]
[ 2023-03-28 16:49:39,120 ] Memory reserved after training: 48234496
[ 2023-03-28 16:49:39,120 ] Evaluating student quality...
[ 2023-03-28 16:49:40,039 ] Top-1 accuracy: 3/200(1.50%), Top-5 accuracy: 11/200(5.50%), Mean loss:14.6428
[ 2023-03-28 16:49:40,039 ] Evaluating time: 0.92s, Speed: 435.81 sequnces/(second*GPU)
[ 2023-03-28 16:49:40,039 ]
[ 2023-03-28 16:49:40,045 ] Student quality is 0.015
[ 2023-03-28 16:49:40,072 ] Memory reserved after del: 2097152
[ 2023-03-28 16:49:40,072 ] Loading student...
[ 2023-03-28 16:49:40,083 ] Timestep 3
[ 2023-03-28 16:49:40,083 ] Training student 3 for 1 epochs
[ 2023-03-28 16:49:40,083 ] Student is: {'blocks_in': 1, 'depth_in': 2, 'stride_in': 1, 'blocks_main': 2, 'depth_main': 3, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 1, 'reduct_ratio': 1, 'act': 'relu', 'att_lay': 'ja', 'conv_lay': 'Basic', 'init_lay': 64, 'drop_prob': 0.25}
[ 2023-03-28 16:49:40,083 ] Memory reserved before training: 2097152
[ 2023-03-28 16:49:44,795 ] Epoch: 1/1, Training accuracy: 2/200(1.00%), Training time: 4.71s
[ 2023-03-28 16:49:44,795 ]
[ 2023-03-28 16:49:44,813 ] Memory reserved after training: 96468992
[ 2023-03-28 16:49:44,813 ] Evaluating student quality...
[ 2023-03-28 16:49:45,793 ] Top-1 accuracy: 3/200(1.50%), Top-5 accuracy: 18/200(9.00%), Mean loss:4.1387
[ 2023-03-28 16:49:45,794 ] Evaluating time: 0.98s, Speed: 408.32 sequnces/(second*GPU)
[ 2023-03-28 16:49:45,794 ]
[ 2023-03-28 16:49:45,803 ] Student quality is 0.015
[ 2023-03-28 16:49:45,834 ] Memory reserved after del: 2097152
[ 2023-03-28 16:49:45,834 ] Loading student...
[ 2023-03-28 16:49:45,844 ] Timestep 4
[ 2023-03-28 16:49:45,844 ] Training student 4 for 1 epochs
[ 2023-03-28 16:49:45,844 ] Student is: {'blocks_in': 3, 'depth_in': 1, 'stride_in': 1, 'blocks_main': 2, 'depth_main': 3, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 1, 'reduct_ratio': 2, 'act': 'hardswish', 'att_lay': 'stja', 'conv_lay': 'Sep', 'init_lay': 64, 'drop_prob': 0.25}
[ 2023-03-28 16:49:45,844 ] Memory reserved before training: 2097152
[ 2023-03-28 16:49:50,049 ] Epoch: 1/1, Training accuracy: 5/200(2.50%), Training time: 4.20s
[ 2023-03-28 16:49:50,049 ]
[ 2023-03-28 16:49:50,067 ] Memory reserved after training: 33554432
[ 2023-03-28 16:49:50,067 ] Evaluating student quality...
[ 2023-03-28 16:49:50,768 ] Top-1 accuracy: 1/200(0.50%), Top-5 accuracy: 14/200(7.00%), Mean loss:4.0968
[ 2023-03-28 16:49:50,769 ] Evaluating time: 0.70s, Speed: 571.18 sequnces/(second*GPU)
[ 2023-03-28 16:49:50,769 ]
[ 2023-03-28 16:49:50,773 ] Student quality is 0.005
[ 2023-03-28 16:49:50,804 ] Memory reserved after del: 2097152
[ 2023-03-28 16:49:50,804 ] Loading student...
[ 2023-03-28 16:49:50,818 ] Timestep 5
[ 2023-03-28 16:49:50,818 ] Training student 5 for 1 epochs
[ 2023-03-28 16:49:50,818 ] Student is: {'blocks_in': 1, 'depth_in': 3, 'stride_in': 1, 'blocks_main': 2, 'depth_main': 3, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 3, 'reduct_ratio': 1, 'act': 'hardswish', 'att_lay': 'ca', 'conv_lay': 'SG', 'init_lay': 64, 'drop_prob': 0.15}
[ 2023-03-28 16:49:50,819 ] Memory reserved before training: 2097152
[ 2023-03-28 16:49:51,139 ] Epoch: 1/1, Batch: 1/25, Loss: 4.1028, LR: 0.0001
[ 2023-03-28 16:50:01,470 ] Epoch: 1/1, Training accuracy: 5/200(2.50%), Training time: 10.65s
[ 2023-03-28 16:50:01,471 ]
[ 2023-03-28 16:50:01,489 ] Memory reserved after training: 98566144
[ 2023-03-28 16:50:01,489 ] Evaluating student quality...
[ 2023-03-28 16:50:03,044 ] Top-1 accuracy: 1/200(0.50%), Top-5 accuracy: 25/200(12.50%), Mean loss:4.1401
[ 2023-03-28 16:50:03,045 ] Evaluating time: 1.55s, Speed: 257.32 sequnces/(second*GPU)
[ 2023-03-28 16:50:03,045 ]
[ 2023-03-28 16:50:03,053 ] Student quality is 0.005
[ 2023-03-28 16:50:03,089 ] Memory reserved after del: 2097152
[ 2023-03-28 16:50:03,089 ] Loading student...
[ 2023-03-28 16:50:03,146 ] Timestep 6
[ 2023-03-28 16:50:03,146 ] Training student 6 for 1 epochs
[ 2023-03-28 16:50:03,146 ] Student is: {'blocks_in': 1, 'depth_in': 2, 'stride_in': 1, 'blocks_main': 3, 'depth_main': 2, 'stride_main': 1, 'temp_win': 1, 'graph_dist': 1, 'expand_ratio': 3, 'reduct_ratio': 1, 'act': 'relu', 'att_lay': 'ca', 'conv_lay': 'Sep', 'init_lay': 128, 'drop_prob': 0.2}
[ 2023-03-28 16:50:03,146 ] Memory reserved before training: 2097152
Traceback (most recent call last):
  File "/home/espen/PycharmProjects/GNN_NAS/main.py", line 38, in <module>
    main()
  File "/home/espen/PycharmProjects/GNN_NAS/main.py", line 24, in main
    trainer_has.train_controller()
  File "/home/espen/PycharmProjects/GNN_NAS/src/train_has.py", line 53, in train_controller
    self._train_student(student, hp_state, epoch, self.train_epochs, student_id)
  File "/home/espen/PycharmProjects/GNN_NAS/src/train_has.py", line 167, in _train_student
    out, _ = student(x)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/espen/PycharmProjects/GNN_NAS/src/model/student.py", line 87, in forward
    x = self.main_stream(x)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/container.py", line 204, in forward
    input = module(input)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/espen/PycharmProjects/GNN_NAS/src/model/layers.py", line 239, in forward
    x = self.act(self.depth_conv(x))
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/container.py", line 204, in forward
    input = module(input)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/modules/batchnorm.py", line 171, in forward
    return F.batch_norm(
  File "/home/espen/PycharmProjects/GNN_NAS/venv/lib/python3.8/site-packages/torch/nn/functional.py", line 2450, in batch_norm
    return torch.batch_norm(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 528.00 MiB (GPU 0; 23.69 GiB total capacity; 21.19 GiB already allocated; 342.56 MiB free; 21.90 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
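The OOM message mentions max_split_size_mb against fragmentation, which I could set through the allocator config before the first CUDA allocation (512 is an arbitrary example value):

import os

# must run before any CUDA memory is allocated, e.g. at the top of main.py
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

But the message says this helps when reserved memory is much larger than allocated, which isn't my case (21.90 GiB reserved vs. 21.19 GiB allocated), so this looks like real retention rather than fragmentation.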
Any suggestions on how I can completely remove the old student from the GPU to make room for the newly sampled one?
Thanks a lot