Training crashes with CUDA out of memory only at the end of an epoch

I am following the torchvision object detection tutorial, but I am running into a problem.

I have a small GPU with 6 GB of VRAM, and I am seeing an interesting pattern: training works fine for a while, then toward the end of the epoch memory usage seems to surge momentarily and the run crashes with a CUDA out-of-memory error. Is there something I can do to pause or wait while the GPU frees its memory so the surge is smoothed out? (I sketched the kind of thing I mean right after the traceback below.)

/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/functional.py:2854: UserWarning: The default behavior for interpolate/upsample with float scale_factor will change in 1.6.0 to align with other frameworks/libraries, and use scale_factor directly, instead of relying on the computed output size. If you wish to keep the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details. 
  warnings.warn("The default behavior for interpolate/upsample with float scale_factor will change "
/opt/conda/conda-bld/pytorch_1587428266983/work/torch/csrc/utils/python_arg_parser.cpp:756: UserWarning: This overload of nonzero is deprecated:
	nonzero(Tensor input, *, Tensor out)
Consider using one of the following signatures instead:
	nonzero(Tensor input, *, bool as_tuple)
Epoch: [0] [  0/674] eta: 0:13:03 lr: 0.050000 loss: 20.7203 (20.7203) loss_classifier: 1.6230 (1.6230) loss_box_reg: 0.0011 (0.0011) loss_objectness: 0.6846 (0.6846) loss_rpn_box_reg: 18.4116 (18.4116) time: 1.1628 data: 0.3461 max mem: 4046
Epoch: [0] [100/674] eta: 0:06:04 lr: 0.050000 loss: 2.6173 (10.3594) loss_classifier: 0.6032 (2.4633) loss_box_reg: 0.0099 (0.9467) loss_objectness: 0.0117 (0.0408) loss_rpn_box_reg: 1.9455 (6.9086) time: 0.6297 data: 0.0130 max mem: 5263
Epoch: [0] [200/674] eta: 0:04:59 lr: 0.050000 loss: 2.8273 (7.9594) loss_classifier: 0.3240 (1.5040) loss_box_reg: 0.0056 (0.5082) loss_objectness: 0.0048 (0.0250) loss_rpn_box_reg: 2.3093 (5.9221) time: 0.6287 data: 0.0132 max mem: 5263
Epoch: [0] [300/674] eta: 0:03:55 lr: 0.050000 loss: 3.3294 (6.6886) loss_classifier: 0.1793 (1.1149) loss_box_reg: 0.0038 (0.3947) loss_objectness: 0.0064 (0.0186) loss_rpn_box_reg: 3.1317 (5.1605) time: 0.6309 data: 0.0134 max mem: 5263
Epoch: [0] [400/674] eta: 0:02:52 lr: 0.050000 loss: 3.8322 (6.3478) loss_classifier: 0.0586 (0.9138) loss_box_reg: 0.0054 (0.3241) loss_objectness: 0.0061 (0.0154) loss_rpn_box_reg: 3.8009 (5.0944) time: 0.6301 data: 0.0131 max mem: 5263
Traceback (most recent call last):
  File "training.py", line 147, in <module>
    train(data_conf=config_json, model_conf=model_conf)
  File "training.py", line 104, in train
    tfb_logger=logger)
  File "/bootstrap-pytorch-torchvision-fasterrcnn/references/detection/engine.py", line 43, in train_one_epoch
    losses.backward()
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 392.00 MiB (GPU 0; 5.94 GiB total capacity; 4.75 GiB already allocated; 295.00 MiB free; 5.16 GiB reserved in total by PyTorch) (malloc at /opt/conda/conda-bld/pytorch_1587428266983/work/c10/cuda/CUDACachingAllocator.cpp:289)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x4e (0x7f3c74355b5e in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1f39d (0x7f3c7411739d in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x2058e (0x7f3c7411858e in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #3: THCStorage_resize + 0x96 (0x7f3c754496a6 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #4: at::native::(anonymous namespace)::resize_cuda_(at::Tensor&, c10::ArrayRef<long>, c10::optional<c10::MemoryFormat>) + 0x799 (0x7f3c76f7fb59 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0x29fe533 (0x7f3c76f80533 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xd742b2 (0x7f3c752f62b2 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0xea5eb6 (0x7f3c75427eb6 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #8: <unknown function> + 0xdc92e8 (0x7f3c7534b2e8 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #9: <unknown function> + 0xe224d0 (0x7f3c9ff694d0 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x29f9d0e (0x7f3ca1b40d0e in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0xe224d0 (0x7f3c9ff694d0 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #12: at::Tensor::mm(at::Tensor const&) const + 0xf0 (0x7f3c9fb2d180 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #13: <unknown function> + 0x264517c (0x7f3ca178c17c in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::generated::AddmmBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1b0 (0x7f3ca178c690 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0x2ae8215 (0x7f3ca1c2f215 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7f3ca1c2c513 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7f3ca1c2d2f2 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::Engine::thread_init(int) + 0x39 (0x7f3ca1c25969 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #19: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7f3ca4f6c558 in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #20: <unknown function> + 0xc819d (0x7f3ca79d719d in /home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/lib/../../../.././libstdc++.so.6)
frame #21: <unknown function> + 0x9609 (0x7f3cc1c9c609 in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #22: clone + 0x43 (0x7f3cc1bc3103 in /lib/x86_64-linux-gnu/libc.so.6)
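
Concretely, this is the kind of end-of-epoch pause/flush I had in mind. It is only a sketch, flush_gpu is my own hypothetical helper, and I have not verified that it prevents the surge:

import torch

# Untested sketch of an explicit end-of-epoch "flush": wait for all queued
# GPU work to finish, then return cached-but-unused blocks to the driver so
# the next epoch starts from a smaller reserved pool.
def flush_gpu(device):
    torch.cuda.synchronize(device)  # block until all pending kernels complete
    torch.cuda.empty_cache()        # release cached memory that is not currently in use

For context, here is my train_one_epoch (from references/detection/engine.py in my project, adapted from the torchvision reference):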
def train_one_epoch(model_conf, model, optimizer, data_loader, device, epoch, tfb_logger):
    print_freq = model_conf["hyperParameters"]["display_interval"]
    iterations_per_epoch = len(data_loader)  # len(DataLoader) is already the number of batches per epoch

    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    iterations = 0

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        iterations += 1
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if tfb_logger is not None:
            info = {
                'loss': losses_reduced,
                'loss_box_reg': loss_dict["loss_box_reg"],
                'loss_classifier': loss_dict["loss_classifier"],
                'loss_objectness': loss_dict["loss_objectness"],
                'loss_rpn_box_reg': loss_dict["loss_rpn_box_reg"]
            }

            tfb_logger.add_scalars(main_tag='logs_s_{}/losses'.format("1"),
                                   tag_scalar_dict=info,
                                   global_step=(epoch * len(data_loader)) + iterations)

    return metric_logger
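
One small change I am considering inside that function, though I am not sure it matters for the OOM: passing plain Python floats to the TensorBoard logger instead of the loss tensors themselves. A sketch of the variant, using the same names as the block above:

# Untested variant of the tfb_logger block above: .item() converts each loss
# to a plain float, so the logger is never handed tensors that still
# reference the autograd graph.
if tfb_logger is not None:
    info = {
        'loss': losses_reduced.item(),
        'loss_box_reg': loss_dict["loss_box_reg"].item(),
        'loss_classifier': loss_dict["loss_classifier"].item(),
        'loss_objectness': loss_dict["loss_objectness"].item(),
        'loss_rpn_box_reg': loss_dict["loss_rpn_box_reg"].item()
    }

    tfb_logger.add_scalars(main_tag='logs_s_{}/losses'.format("1"),
                           tag_scalar_dict=info,
                           global_step=(epoch * len(data_loader)) + iterations)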

And this is the main training loop that calls it:

    for epoch in range(model_conf["hyperParameters"]["epoch_start"], model_conf["hyperParameters"]["epoch_max"] + 1):

        # train for one epoch, printing every 10 iterations
        train_one_epoch(model_conf=model_conf,
                        model=model,
                        optimizer=optimizer,
                        data_loader=training_data_loader,
                        device=device,
                        epoch=epoch,
                        tfb_logger=logger)

        # update the learning rate
        lr_scheduler.step()

        if model_conf["pytorch_engine"]["enable_tfb"]:
            logger.add_scalars(main_tag='logs_s_{}/lr'.format("1"),
                               tag_scalar_dict={"lr": lr_scheduler.get_last_lr()[0]},  # get_last_lr() returns a list
                               global_step=epoch)

        if epoch % 5 == 0 or epoch == model_conf["hyperParameters"]["epoch_max"]:
            save_name = os.path.join(output_dir,
                                     'faster_rcnn_{}_{}.pth'.format(model_conf["pytorch_engine"]["session"], epoch))
            torch.save({
                'session': model_conf["pytorch_engine"]["session"],
                'epoch': epoch,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'model_conf': model_conf,
                'data_conf': data_conf,
            }, save_name)
            print('save model: {}'.format(save_name))
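
To pinpoint where the surge actually happens, I am also thinking of logging the allocator's peak numbers at every epoch boundary and resetting them. A rough sketch only; log_epoch_gpu_peak is a hypothetical helper I would call right after lr_scheduler.step():

import torch

# Untested sketch: report the peak GPU usage of the epoch that just finished,
# then reset the counter and shrink the cache before the next epoch begins.
def log_epoch_gpu_peak(epoch, device=0):
    peak_alloc_mib = torch.cuda.max_memory_allocated(device) / 2**20
    reserved_mib = torch.cuda.memory_reserved(device) / 2**20
    print('epoch {}: peak allocated {:.0f} MiB, reserved {:.0f} MiB'.format(
        epoch, peak_alloc_mib, reserved_mib))
    torch.cuda.reset_max_memory_allocated(device)  # start the peak counter fresh
    torch.cuda.empty_cache()                       # optionally shrink the cached pool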

I am not sure this addresses the actual cause, but one workaround is to move the model to the CPU at the end of each epoch with model.to('cpu'); that frees the GPU memory before it gets allocated again for the next epoch. It is just a bypass shortcut.
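
For example, something like this (untested sketch of that idea; model and device are whatever your training script already has):

import torch

def park_model_between_epochs(model, device):
    # Rough sketch of the suggestion above: move the weights to the CPU so the
    # caching allocator can give their GPU blocks back, then move them back to
    # the GPU before the next epoch starts.
    model.to('cpu')
    torch.cuda.empty_cache()
    model.to(device)
    return model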
