I'm currently trying to use PyTorch `DataParallel` to increase the batch size, as shown in the code below, but I get 1) `RuntimeError: Caught RuntimeError in replica 1 on device 5.` and 2) `RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED`.
**Has anyone worked with DataParallel before and run into similar issues?**
Code:
```python
import torch
import torch.nn as nn
from torch.cuda.amp import autocast

# YoloV4_EfficentNet, class_accuracy, arguments, etc. come from my own modules
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device_ids = [1, 5]
model = YoloV4_EfficentNet(nclasses=arguments['nclasses']).to(device)
model = nn.DataParallel(model, device_ids=device_ids)

def trainyolov4(arguments, train_loader, model, optimizer, scheduler, loss_f,
                scaled_anchors, scaler, mode='iou'):
    model.train()
    for batch_idx, (x, y) in enumerate(train_loader):
        x = x.permute(0, 3, 1, 2)  # .to(device)
        y0, y1, y2 = (y[0], y[1], y[2])  # .to(device)
        # x shape :-: (batchsize, channels, height, width)
        with autocast():
            preds = model(x)
            loss_val = (
                loss_f(preds[0], y0, scaled_anchors[0], mode=mode)
                + loss_f(preds[1], y1, scaled_anchors[1], mode=mode)
                + loss_f(preds[2], y2, scaled_anchors[2], mode=mode))
        class_acc, noobj_acc, obj_acc = class_accuracy(preds, y, arguments["conf_thresh"])
        optimizer.zero_grad()
        scaler.scale(loss_val).backward()
        scaler.step(optimizer)
        scaler.update()
        if arguments["one_cycle"]:
            scheduler.step()
    return (float(loss_val.item()), float(class_acc), float(noobj_acc), float(obj_acc))
```
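For completeness, this is roughly the variant where I do move the batch to the source device before the forward pass (point 3 in the "Tried" list at the bottom). A minimal sketch, assuming `DataParallel` scatters from `device_ids[0]` (i.e. `cuda:1` here):

```python
# Sketch of the "move inputs first" variant I tried (point 3 below).
# Assumption: nn.DataParallel scatters the batch from device_ids[0] (cuda:1),
# so x and the targets are placed there before calling the wrapped model.
x = x.permute(0, 3, 1, 2).to(device)  # device == torch.device('cuda:1')
y0, y1, y2 = (t.to(device) for t in (y[0], y[1], y[2]))
with autocast():
    preds = model(x)
```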
Error:
- `RuntimeError: Caught RuntimeError in replica 1 on device 5.`
- `RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED`
```
Traceback (most recent call last):
  File "main.py", line 54, in <module>
    train_model_with_args()
  File "main.py", line 52, in train_model_with_args
    initialize_with_args(arguments)
  File "/home/thesis/train/train.py", line 214, in initialize_with_args
    main(arguments)
  File "/home/thesis/train/train.py", line 695, in main
    train_loss_val, train_class_acc, train_noobj_acc, train_obj_acc = trainyolov4(arguments, train_loader, model, optimizer, scheduler, loss_f, scaled_anchors, scaler, mode = 'ciou')
  File "/home/thesis/train/train.py", line 376, in trainyolov4
    preds = model(x)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/home/.local/lib/python3.8/site-packages/torch/_utils.py", line 543, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 5.
Original Traceback (most recent call last):
  File "/home/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/thesis/models/yolov4.py", line 206, in forward
    sclaed_pred2 = self.yolov4head[1](panet_scale2)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/thesis/models/yolov4.py", line 64, in forward
    out = self.scaled_pred(x)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/container.py", line 204, in forward
    input = module(input)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/thesis/models/yolov4.py", line 19, in forward
    out = self.bn(out)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/batchnorm.py", line 171, in forward
    return F.batch_norm(
  File "/home/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2450, in batch_norm
    return torch.batch_norm(
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
```
Tried:
1. Explicitly setting `CUDA_VISIBLE_DEVICES`.
2. Setting `CUDA_LAUNCH_BLOCKING=1` for debugging.
3. Moving `x` and `y` to the device vs. not doing so, since DataParallel should handle this for us.
4. Making sure the CUDA 11.6 installation on the lab servers matches the PyTorch build.
5. Checking that the GPUs are recognized (a sketch of how I did 1), 2) and 5) follows this list).
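In case it matters, this is a minimal sketch of how I set the environment variables and checked the GPUs; the exact values are assumptions on my side (e.g. which physical GPUs I exposed):

```python
import os

# Must be set before torch initializes CUDA to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,5"   # assumption: exposing physical GPUs 1 and 5
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"     # synchronous kernel launches for debugging

import torch

# Note: with CUDA_VISIBLE_DEVICES="1,5" the visible devices are renumbered,
# so inside the process they appear as cuda:0 and cuda:1.
print(torch.cuda.device_count())             # expect 2
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))  # check the GPUs are recognized
```

One thing I wasn't sure about: with that renumbering, `device_ids = [1, 5]` would no longer refer to the same physical GPUs as without `CUDA_VISIBLE_DEVICES` set, which is why I also tried runs without it.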