I trained an HRFormer model on a GPU without any errors. However, running inference with this model produced the following error message:
Traceback (most recent call last):
File "demo/image_demo.py", line 105, in <module>
main()
File "demo/image_demo.py", line 85, in main
batch_results = inference_topdown(model, args.img)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/apis/inference.py", line 192, in inference_topdown
results = model.test_step(batch)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 145, in test_step
return self._run_forward(data, mode='predict') # type: ignore
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 326, in _run_forward
results = self(**data, mode=mode)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/pose_estimators/base.py", line 142, in forward
return self.predict(inputs, data_samples)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/pose_estimators/topdown.py", line 103, in predict
_feats = self.extract_feat(inputs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/pose_estimators/base.py", line 188, in extract_feat
x = self.backbone(inputs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrnet.py", line 583, in forward
y_list = self.stage2(x_list)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/container.py", line 227, in forward
input = module(input)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrnet.py", line 200, in forward
x[i] = self.branches[i](x[i])
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/container.py", line 227, in forward
input = module(input)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrformer.py", line 387, in forward
x = x + self.drop_path(self.ffn(self.norm2(x), H, W))
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrformer.py", line 313, in forward
x = layer(x)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/batchnorm.py", line 741, in forward
return F.batch_norm(
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/functional.py", line 2471, in batch_norm
return torch.batch_norm(
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument running_var in method wrapper_CUDA__cudnn_batch_norm)
The error says that the tensors passed to the function batch_norm
in torch.nn.functional
are not all on the same device. I inspected these tensors by adding the following to the batch_norm
function:
....
def batch_norm(
    input: Tensor,
    running_mean: Optional[Tensor],
    running_var: Optional[Tensor],
    weight: Optional[Tensor] = None,
    bias: Optional[Tensor] = None,
    training: bool = False,
    momentum: float = 0.1,
    eps: float = 1e-5,
) -> Tensor:
    r"""Applies Batch Normalization for each channel across a batch of data.

    See :class:`~torch.nn.BatchNorm1d`, :class:`~torch.nn.BatchNorm2d`,
    :class:`~torch.nn.BatchNorm3d` for details.

    This copy carries temporary debug instrumentation: before dispatching to
    ``torch.batch_norm`` it prints the device of every argument that is a
    tensor, including ``running_var``, so a device mismatch between the
    input, the affine parameters and the running statistics is visible.

    Args:
        input: Tensor to normalize.
        running_mean: Running mean, or ``None``.
        running_var: Running variance, or ``None``.
        weight: Optional scale.
        bias: Optional shift.
        training: Forwarded to ``torch.batch_norm``; when true the batch size
            is validated first.
        momentum: Forwarded to ``torch.batch_norm``.
        eps: Forwarded to ``torch.batch_norm``.

    Returns:
        The result of ``torch.batch_norm`` for the given arguments.
    """
    if has_torch_function_variadic(input, running_mean, running_var, weight, bias):
        return handle_torch_function(
            batch_norm,
            (input, running_mean, running_var, weight, bias),
            input,
            running_mean,
            running_var,
            weight=weight,
            bias=bias,
            training=training,
            momentum=momentum,
            eps=eps,
        )
    if training:
        _verify_batch_size(input.size())

    # --- Debug instrumentation added by the poster -------------------------
    # running_var must be inspected as well: it is the argument named in the
    # "Expected all tensors to be on the same device" error, and checking
    # running_mean alone cannot reveal it.
    # The poster observed running_mean reported on the CPU here.
    inspected = (
        ('input', input),
        ('weight', weight),
        ('bias', bias),
        ('running_mean', running_mean),
        ('running_var', running_var),
        # Annotated as bool/float in the signature; kept so that a tensor
        # passed for any of them is still reported.
        ('training', training),
        ('momentum', momentum),
        ('eps', eps),
    )
    for label, value in inspected:
        if torch.is_tensor(value):
            print(f'{label} is on ', value.device)

    return torch.batch_norm(
        input, weight, bias, running_mean, running_var, training, momentum, eps, torch.backends.cudnn.enabled
    )
and found that the variable running_mean
is on the CPU for some reason. I tried to move the variable to CUDA by adding the code below to batch_norm:
if not running_mean.is_cuda:
running_mean = running_mean.to('cuda')
Now all the tensors I printed are on cuda:0, but the error still remains:
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
input is on cuda:0
weight is on cuda:0
bias is on cuda:0
running_mean is on cuda:0
Traceback (most recent call last):
File "demo/image_demo.py", line 105, in <module>
main()
File "demo/image_demo.py", line 85, in main
batch_results = inference_topdown(model, args.img)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/apis/inference.py", line 192, in inference_topdown
results = model.test_step(batch)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 145, in test_step
return self._run_forward(data, mode='predict') # type: ignore
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/mmengine/model/base_model/base_model.py", line 326, in _run_forward
results = self(**data, mode=mode)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/pose_estimators/base.py", line 142, in forward
return self.predict(inputs, data_samples)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/pose_estimators/topdown.py", line 103, in predict
_feats = self.extract_feat(inputs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/pose_estimators/base.py", line 188, in extract_feat
x = self.backbone(inputs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrnet.py", line 583, in forward
y_list = self.stage2(x_list)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/container.py", line 227, in forward
input = module(input)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrnet.py", line 200, in forward
x[i] = self.branches[i](x[i])
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/container.py", line 227, in forward
input = module(input)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrformer.py", line 387, in forward
x = x + self.drop_path(self.ffn(self.norm2(x), H, W))
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/dkobayas/Pose-Estimation/mmpose/mmpose/models/backbones/hrformer.py", line 313, in forward
x = layer(x)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/modules/batchnorm.py", line 741, in forward
return F.batch_norm(
File "/depot/cfrueh/apps/env_mmpose_nospyder/lib/python3.8/site-packages/torch/nn/functional.py", line 2471, in batch_norm
return torch.batch_norm(
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument running_var in method wrapper_CUDA__cudnn_batch_norm)
I have no idea how to fix this issue. Any advice would be helpful!