I am following the torchvision tutorial for object detection… BUT
I have a small GPU with only 6 GB of VRAM, and I'm seeing an odd pattern: training runs fine for most of an epoch, then memory usage seems to surge momentarily and the run dies with a CUDA out-of-memory error. Is there something I can do to pause, or wait while the GPU frees memory, to smooth this spike out?
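For reference, this is the sort of thing I was thinking of calling between epochs — though as far as I understand, empty_cache() only returns unused cached blocks to the driver and can't free memory that live tensors still hold, so I'm not sure it's the right fix:

import torch

def report_and_flush(device=0):
    # Memory occupied by live tensors vs. what the caching allocator
    # has reserved from the driver (the OOM message reports both).
    allocated_mb = torch.cuda.memory_allocated(device) / 1024 ** 2
    reserved_mb = torch.cuda.memory_reserved(device) / 1024 ** 2
    print("allocated: {:.0f} MiB, reserved: {:.0f} MiB".format(allocated_mb, reserved_mb))

    # Return unused cached blocks to the driver. This cannot free memory
    # still referenced by tensors, so it smooths nvidia-smi readings more
    # than it prevents a genuine out-of-memory error.
    torch.cuda.empty_cache()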
/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/functional.py:2854: UserWarning: The default behavior for interpolate/upsample with float scale_factor will change in 1.6.0 to align with other frameworks/libraries, and use scale_factor directly, instead of relying on the computed output size. If you wish to keep the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
warnings.warn("The default behavior for interpolate/upsample with float scale_factor will change "
/opt/conda/conda-bld/pytorch_1587428266983/work/torch/csrc/utils/python_arg_parser.cpp:756: UserWarning: This overload of nonzero is deprecated:
nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
nonzero(Tensor input, *, bool as_tuple)
Epoch: [0] [ 0/674] eta: 0:13:03 lr: 0.050000 loss: 20.7203 (20.7203) loss_classifier: 1.6230 (1.6230) loss_box_reg: 0.0011 (0.0011) loss_objectness: 0.6846 (0.6846) loss_rpn_box_reg: 18.4116 (18.4116) time: 1.1628 data: 0.3461 max mem: 4046
Epoch: [0] [100/674] eta: 0:06:04 lr: 0.050000 loss: 2.6173 (10.3594) loss_classifier: 0.6032 (2.4633) loss_box_reg: 0.0099 (0.9467) loss_objectness: 0.0117 (0.0408) loss_rpn_box_reg: 1.9455 (6.9086) time: 0.6297 data: 0.0130 max mem: 5263
Epoch: [0] [200/674] eta: 0:04:59 lr: 0.050000 loss: 2.8273 (7.9594) loss_classifier: 0.3240 (1.5040) loss_box_reg: 0.0056 (0.5082) loss_objectness: 0.0048 (0.0250) loss_rpn_box_reg: 2.3093 (5.9221) time: 0.6287 data: 0.0132 max mem: 5263
Epoch: [0] [300/674] eta: 0:03:55 lr: 0.050000 loss: 3.3294 (6.6886) loss_classifier: 0.1793 (1.1149) loss_box_reg: 0.0038 (0.3947) loss_objectness: 0.0064 (0.0186) loss_rpn_box_reg: 3.1317 (5.1605) time: 0.6309 data: 0.0134 max mem: 5263
Epoch: [0] [400/674] eta: 0:02:52 lr: 0.050000 loss: 3.8322 (6.3478) loss_classifier: 0.0586 (0.9138) loss_box_reg: 0.0054 (0.3241) loss_objectness: 0.0061 (0.0154) loss_rpn_box_reg: 3.8009 (5.0944) time: 0.6301 data: 0.0131 max mem: 5263
Traceback (most recent call last):
  File "training.py", line 147, in <module>
    train(data_conf=config_json, model_conf=model_conf)
  File "training.py", line 104, in train
    tfb_logger=logger)
  File "/bootstrap-pytorch-torchvision-fasterrcnn/references/detection/engine.py", line 43, in train_one_epoch
    losses.backward()
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 392.00 MiB (GPU 0; 5.94 GiB total capacity; 4.75 GiB already allocated; 295.00 MiB free; 5.16 GiB reserved in total by PyTorch) (malloc at /opt/conda/conda-bld/pytorch_1587428266983/work/c10/cuda/CUDACachingAllocator.cpp:289)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x4e (0x7f3c74355b5e in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1f39d (0x7f3c7411739d in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x2058e (0x7f3c7411858e in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #3: THCStorage_resize + 0x96 (0x7f3c754496a6 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #4: at::native::(anonymous namespace)::resize_cuda_(at::Tensor&, c10::ArrayRef<long>, c10::optional<c10::MemoryFormat>) + 0x799 (0x7f3c76f7fb59 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0x29fe533 (0x7f3c76f80533 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xd742b2 (0x7f3c752f62b2 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0xea5eb6 (0x7f3c75427eb6 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0xdc92e8 (0x7f3c7534b2e8 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #9: <unknown function> + 0xe224d0 (0x7f3c9ff694d0 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x29f9d0e (0x7f3ca1b40d0e in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0xe224d0 (0x7f3c9ff694d0 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #12: at::Tensor::mm(at::Tensor const&) const + 0xf0 (0x7f3c9fb2d180 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x264517c (0x7f3ca178c17c in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::generated::AddmmBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1b0 (0x7f3ca178c690 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x2ae8215 (0x7f3ca1c2f215 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7f3ca1c2c513 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7f3ca1c2d2f2 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::Engine::thread_init(int) + 0x39 (0x7f3ca1c25969 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7f3ca4f6c558 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #20: <unknown function> + 0xc819d (0x7f3ca79d719d in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/../../../.././libstdc++.so.6)
frame #21: <unknown function> + 0x9609 (0x7f3cc1c9c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #22: clone + 0x43 (0x7f3cc1bc3103 in /lib/x86_64-linux-gnu/libc.so.6)
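If I'm reading the OOM line right: PyTorch had reserved 5.16 GiB of the 5.94 GiB card, 4.75 GiB of that was held by live tensors, and a 392 MiB request failed even though reserved minus allocated is about 420 MiB — I assume that means the free space inside the reservation was fragmented. Also, the log's max mem sits at 5263 MiB from iteration 100 onward, so the failing step needed more than the steady state. A quick check of the arithmetic:

total, reserved, allocated = 5.94, 5.16, 4.75   # GiB, from the error message
request = 392 / 1024                            # the failed allocation, in GiB

print((reserved - allocated) * 1024)  # ~420 MiB inside the reservation, barely above 392
print((total - reserved) * 1024)      # ~799 MiB outside it, yet the driver reported only
                                      # 295 MiB free -- presumably the CUDA context and
                                      # the desktop occupy the rest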
import math
import sys

import utils  # torchvision references/detection/utils.py


def train_one_epoch(model_conf, model, optimizer, data_loader, device, epoch, tfb_logger):
    print_freq = model_conf["hyperParameters"]["display_interval"]
    # len() of a DataLoader already counts batches, so no division by batch_size is needed
    iterations_per_epoch = len(data_loader)
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    iterations = 0

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # in training mode the torchvision detection models return a dict of losses
        loss_dict = model(images, targets)
        iterations += 1
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if tfb_logger is not None:
            # .item() detaches the scalars so the logger does not keep the
            # autograd graph alive between iterations
            info = {
                'loss': loss_value,
                'loss_box_reg': loss_dict["loss_box_reg"].item(),
                'loss_classifier': loss_dict["loss_classifier"].item(),
                'loss_objectness': loss_dict["loss_objectness"].item(),
                'loss_rpn_box_reg': loss_dict["loss_rpn_box_reg"].item()
            }
            tfb_logger.add_scalars(main_tag='logs_s_{}/losses'.format("1"),
                                   tag_scalar_dict=info,
                                   global_step=(epoch * iterations_per_epoch) + iterations)

    return metric_logger
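One thing I've considered is wrapping the forward/backward in a guard that skips a batch on OOM, though that feels like treating the symptom rather than the cause. A minimal sketch of that pattern, assuming a generic step(images, targets) closure (hypothetical name) that does the forward + backward + optimizer.step():

import torch

def safe_step(step, images, targets):
    """Run one training step; on CUDA OOM, free what we can and skip the batch."""
    try:
        return step(images, targets)
    except RuntimeError as e:
        # Only swallow out-of-memory errors; re-raise anything else
        if "out of memory" not in str(e):
            raise
        print("| WARNING: ran out of memory, skipping batch")
        # Drop the input references, then return cached blocks to the driver
        del images, targets
        torch.cuda.empty_cache()
        return None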
And here is the main training loop:
for epoch in range(model_conf["hyperParameters"]["epoch_start"], model_conf["hyperParameters"]["epoch_max"] + 1):
    # train for one epoch, printing every `display_interval` iterations
    train_one_epoch(model_conf=model_conf,
                    model=model,
                    optimizer=optimizer,
                    data_loader=training_data_loader,
                    device=device,
                    epoch=epoch,
                    tfb_logger=logger)

    # update the learning rate
    lr_scheduler.step()

    if model_conf["pytorch_engine"]["enable_tfb"]:
        # get_last_lr() returns a list (one entry per param group)
        logger.add_scalars(main_tag='logs_s_{}/lr'.format("1"),
                           tag_scalar_dict={"lr": lr_scheduler.get_last_lr()[0]},
                           global_step=epoch)

    if epoch % 5 == 0 or epoch == model_conf["hyperParameters"]["epoch_max"]:
        save_name = os.path.join(output_dir,
                                 'faster_rcnn_{}_{}.pth'.format(model_conf["pytorch_engine"]["session"], epoch))
        torch.save({
            'session': model_conf["pytorch_engine"]["session"],
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'model_conf': model_conf,
            'data_conf': data_conf,
        }, save_name)
        print('save model: {}'.format(save_name))
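For completeness, restoring from one of these checkpoints would look something like this — a sketch, assuming model, optimizer, and lr_scheduler are constructed exactly as in training, and with a hypothetical filename:

import torch

checkpoint = torch.load("faster_rcnn_1_5.pth", map_location="cpu")  # hypothetical filename
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
start_epoch = checkpoint['epoch'] + 1  # resume from the next epoch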