Hello, I am running into a CUDA out-of-memory error while using a loaded DeepFill model to inpaint images. What puzzles me is this: the model is loaded once, my batch size is 1 (I only run inference on one image at a time), and the forward pass runs under torch.no_grad(). Yet, as the two error messages below show, the model tried to allocate nearly 7 GiB and 12 GiB at the 6th iteration. Why does the forward function allocate such a huge amount of GPU memory mid-loop? I thought only the model weights and the single image being processed occupy GPU memory, so shouldn't consumption stay stable throughout inference? Thanks for your help in advance. Here is my inference loop:
for i, imgid in tqdm(enumerate(img_map.keys())):  # iterate over an image list
    # get a masked image and its corresponding mask
    masked_img_path = img_map[imgid]
    mask_path = mask_map[imgid]
    img_name = os.path.basename(masked_img_path)
    # use the DeepFill model to inpaint this image;
    # inpainting_inference is from mmedit.apis
    result = inpainting_inference(model, masked_img_path, mask_path)
    result = tensor2img(result, min_max=(-1, 1))[..., ::-1]
    # crop the padded output back to the original image size
    h, w = mmcv.imread(masked_img_path).shape[:2]
    result = result[:h, :w, ...]
    save_path = os.path.join(args.clean_save_dir, img_name)
    mmcv.imwrite(result, save_path)
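For what it's worth, here is a minimal sketch of how per-iteration memory could be logged with the standard torch.cuda counters, to confirm where the growth happens (log_gpu_mem is a hypothetical helper, not part of my actual script):

import torch

def log_gpu_mem(tag):
    # both counters report bytes, so convert to GiB
    alloc = torch.cuda.memory_allocated() / 1024**3
    peak = torch.cuda.max_memory_allocated() / 1024**3
    print(f'{tag}: allocated={alloc:.2f} GiB, peak={peak:.2f} GiB')

# inside the loop, right after inpainting_inference(...):
log_gpu_mem(f'image {imgid}')
torch.cuda.reset_peak_memory_stats()  # make 'peak' per-image, not global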
Inside the inpainting_inference function (from mmedit/apis/inpainting_inference.py):
def inpainting_inference(model, masked_img, mask):
    """Inference image with the model.

    Args:
        model (nn.Module): The loaded model.
        masked_img (str): File path of image with mask.
        mask (str): Mask file path.

    Returns:
        Tensor: The predicted inpainting result.
    """
    device = next(model.parameters()).device  # model device

    infer_pipeline = [
        dict(type='LoadImageFromFile', key='masked_img'),
        dict(type='LoadMask', mask_mode='file', mask_config=dict()),
        dict(type='Pad', keys=['masked_img', 'mask'], mode='reflect'),
        dict(
            type='Normalize',
            keys=['masked_img'],
            mean=[127.5] * 3,
            std=[127.5] * 3,
            to_rgb=False),
        dict(type='GetMaskedImage', img_name='masked_img'),
        dict(
            type='Collect',
            keys=['masked_img', 'mask'],
            meta_keys=['masked_img_path']),
        dict(type='ImageToTensor', keys=['masked_img', 'mask'])
    ]

    # build the data pipeline
    test_pipeline = Compose(infer_pipeline)
    # prepare data
    data = dict(masked_img_path=masked_img, mask_path=mask)
    data = test_pipeline(data)
    data = collate([data], samples_per_gpu=1)
    if 'cuda' in str(device):
        data = scatter(data, [device])[0]
    else:
        data.pop('meta')
    # forward the model
    with torch.no_grad():
        result = model(test_mode=True, **data)

    return result['fake_img']
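One workaround I am considering is dropping the GPU copy of the result and releasing PyTorch's cached blocks after each image (a sketch only; torch.cuda.empty_cache() is a standard call, but I have not verified that this fixes the OOM):

# right after inpainting_inference(...) in the loop above
result = inpainting_inference(model, masked_img_path, mask_path)
result = result.cpu()      # move the output off the GPU; the CUDA copy is freed
torch.cuda.empty_cache()   # hand cached, unused blocks back to the driver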
Here is the first error:

0it [00:00, ?it/s]
1it [00:00, 6.32it/s]
3it [00:00, 8.10it/s]
4it [00:01, 3.10it/s]
5it [00:01, 3.81it/s]
6it [00:02, 2.52it/s]
Traceback (most recent call last):
File "/app/inference.py", line 138, in <module>
inpainting(args)
File "/app/preprocess.py", line 63, in inpainting
result = inpainting_inference(model, masked_img_path, mask_path)
File "/usr/local/lib/python3.9/site-packages/mmedit/apis/inpainting_inference.py", line 51, in inpainting_inference
result = model(test_mode=True, **data)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 119, in new_func
return old_func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/inpaintors/one_stage.py", line 135, in forward
return self.forward_test(masked_img, mask, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/inpaintors/two_stage.py", line 78, in forward_test
stage1_fake_res, stage2_fake_res = self.generator(input_x)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 119, in new_func
return old_func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/backbones/encoder_decoders/two_stage_encoder_decoder.py", line 70, in forward
stage2_res, offset = self.stage2(stage2_input, mask)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/components/refiners/deepfill_refiner.py", line 68, in forward
attention_x, offset = self.contextual_attention_neck(
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/backbones/encoder_decoders/necks/contextual_attention_neck.py", line 70, in forward
x, offset = self.contextual_attention(x, x, mask)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/common/contextual_attention.py", line 124, in forward
correlation_map = self.fuse_correlation_map(
File "/usr/local/lib/python3.9/site-packages/mmedit/models/common/contextual_attention.py", line 219, in fuse_correlation_map
map_ = map_.permute(0, 3, 1, 2).contiguous()
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.94 GiB (GPU 0; 15.77 GiB total capacity; 7.89 GiB already allocated; 6.66 GiB free; 8.50 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
And the second error:

0it [00:00, ?it/s]
1it [00:00, 1.09it/s]
2it [00:01, 2.21it/s]
3it [00:01, 2.54it/s]
4it [00:02, 1.41it/s]
5it [00:02, 1.91it/s]
6it [00:02, 2.50it/s]
6it [00:03, 1.58it/s]
Traceback (most recent call last):
File "/app/inference.py", line 126, in <module>
inpainting(args)
File "/app/preprocess.py", line 61, in inpainting
result = inpainting_inference(model, masked_img_path, mask_path)
File "/usr/local/lib/python3.9/site-packages/mmedit/apis/inpainting_inference.py", line 51, in inpainting_inference
result = model(test_mode=True, **data)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 119, in new_func
return old_func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/inpaintors/one_stage.py", line 135, in forward
return self.forward_test(masked_img, mask, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/inpaintors/two_stage.py", line 78, in forward_test
stage1_fake_res, stage2_fake_res = self.generator(input_x)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmcv/runner/fp16_utils.py", line 119, in new_func
return old_func(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/backbones/encoder_decoders/two_stage_encoder_decoder.py", line 70, in forward
stage2_res, offset = self.stage2(stage2_input, mask)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/components/refiners/deepfill_refiner.py", line 68, in forward
attention_x, offset = self.contextual_attention_neck(
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/backbones/encoder_decoders/necks/contextual_attention_neck.py", line 70, in forward
x, offset = self.contextual_attention(x, x, mask)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/common/contextual_attention.py", line 121, in forward
correlation_map = self.patch_correlation(x, context_cols)
File "/usr/local/lib/python3.9/site-packages/mmedit/models/common/contextual_attention.py", line 158, in patch_correlation
patch_corr = F.conv2d(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 12.36 GiB (GPU 0; 15.77 GiB total capacity; 1.53 GiB already allocated; 11.70 GiB free; 3.68 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
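Finally, following the hint at the end of both error messages, I could configure the caching allocator before any CUDA allocation happens (this only mitigates fragmentation; it cannot shrink the single 12.36 GiB request itself):

import os
# must run before the first CUDA tensor is created,
# e.g. at the very top of inference.py
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'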