I have four GPUs and I want to use all of them for training, but when I call `train_one_epoch`
an unusual error comes up. Note that running the script on only one GPU works fine (although it’s really slow).
The setup:
pytorch 1.3.1
torchvision 0.4.2
4X1080ti
CUDA 10.2
NVIDIA Driver 440.33.01
This is part of my code; if a more detailed version is needed, I will provide it.
# define training and validation data loaders
train_loader = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=12, collate_fn=utils.collate_fn)
dev_loader = DataLoader(dataset_validation, batch_size=4, shuffle=True, num_workers=12, collate_fn=utils.collate_fn)
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# get the model using our helper function
model: torch.nn.Module = get_fast_rcnn_for_fine_tune(num_classes)
# duplicate the model to all available gpus
model = torch.nn.DataParallel(model)
# move model to the right device
model.to(device)
And this is the error:
Traceback (most recent call last):
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 28, in scatter
res = scatter_map(inputs)
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 15, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 17, in scatter_map
return list(map(list, zip(*map(scatter_map, obj))))
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 19, in scatter_map
return list(map(type(obj), zip(*map(scatter_map, obj.items()))))
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 15, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 13, in scatter_map
return Scatter.apply(target_gpus, None, dim, obj)
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 89, in forward
outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
File "/home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/cuda/comm.py", line 147, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: chunk expects at least a 1-dimensional tensor (chunk at /pytorch/aten/src/ATen/native/TensorShape.cpp:222)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x33 (0x7f14b1918813 in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: at::native::chunk(at::Tensor const&, long, long) + 0x459 (0x7f14b37177b9 in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #2: <unknown function> + 0x1ec6de9 (0x7f14b3a03de9 in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #3: <unknown function> + 0x3a57ecb (0x7f14b5594ecb in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #4: <unknown function> + 0x1f0d9f3 (0x7f14b3a4a9f3 in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #5: at::Tensor::chunk(long, long) const + 0xe9 (0x7f14b3650e69 in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #6: torch::cuda::scatter(at::Tensor const&, c10::ArrayRef<long>, c10::optional<std::vector<long, std::allocator<long> > > const&, long, c10::optional<std::vector<c10::optional<c10::cuda::CUDAStream>, std::allocator<c10::optional<c10::cuda::CUDAStream> > > > const&) + 0x29c (0x7f14b60c90bc in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #7: <unknown function> + 0x77fecf (0x7f14f9b3decf in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #8: <unknown function> + 0x2110f4 (0x7f14f95cf0f4 in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #9: _PyCFunction_FastCallDict + 0x154 (0x555f36d45744 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #10: <unknown function> + 0x198610 (0x555f36dcc610 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #11: _PyEval_EvalFrameDefault + 0x30a (0x555f36df138a in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #12: <unknown function> + 0x1918e4 (0x555f36dc58e4 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #13: <unknown function> + 0x192771 (0x555f36dc6771 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #14: <unknown function> + 0x198505 (0x555f36dcc505 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #15: _PyEval_EvalFrameDefault + 0x30a (0x555f36df138a in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #16: PyEval_EvalCodeEx + 0x329 (0x555f36dc7289 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #17: <unknown function> + 0x194094 (0x555f36dc8094 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #18: PyObject_Call + 0x3e (0x555f36d4554e in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #19: THPFunction_apply(_object*, _object*) + 0xa4f (0x7f14f985d5ef in /home/lab/avivshamsian/.local/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #20: _PyCFunction_FastCallDict + 0x91 (0x555f36d45681 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #21: <unknown function> + 0x198610 (0x555f36dcc610 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x30a (0x555f36df138a in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #23: <unknown function> + 0x191bfe (0x555f36dc5bfe in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #24: _PyFunction_FastCallDict + 0x1bc (0x555f36dc6c4c in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #25: _PyObject_FastCallDict + 0x26f (0x555f36d45b0f in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #26: <unknown function> + 0x12c272 (0x555f36d60272 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #27: PyIter_Next + 0xe (0x555f36d87e3e in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #28: PySequence_Tuple + 0x1f5 (0x555f36d8cd95 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #29: _PyEval_EvalFrameDefault + 0x53c0 (0x555f36df6440 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #30: <unknown function> + 0x191bfe (0x555f36dc5bfe in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #31: _PyFunction_FastCallDict + 0x1bc (0x555f36dc6c4c in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #32: _PyObject_FastCallDict + 0x26f (0x555f36d45b0f in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #33: <unknown function> + 0x12c272 (0x555f36d60272 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #34: PyIter_Next + 0xe (0x555f36d87e3e in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #35: PySequence_Tuple + 0x215 (0x555f36d8cdb5 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #36: _PyEval_EvalFrameDefault + 0x53c0 (0x555f36df6440 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #37: <unknown function> + 0x191bfe (0x555f36dc5bfe in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #38: _PyFunction_FastCallDict + 0x1bc (0x555f36dc6c4c in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #39: _PyObject_FastCallDict + 0x26f (0x555f36d45b0f in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #40: <unknown function> + 0x12c272 (0x555f36d60272 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #41: PyIter_Next + 0xe (0x555f36d87e3e in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #42: PySequence_Tuple + 0xf9 (0x555f36d8cc99 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #43: _PyEval_EvalFrameDefault + 0x53c0 (0x555f36df6440 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #44: <unknown function> + 0x191bfe (0x555f36dc5bfe in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #45: _PyFunction_FastCallDict + 0x1bc (0x555f36dc6c4c in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #46: _PyObject_FastCallDict + 0x26f (0x555f36d45b0f in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #47: <unknown function> + 0x12c272 (0x555f36d60272 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #48: PyIter_Next + 0xe (0x555f36d87e3e in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #49: PySequence_Tuple + 0x1f5 (0x555f36d8cd95 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #50: _PyEval_EvalFrameDefault + 0x53c0 (0x555f36df6440 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #51: <unknown function> + 0x191bfe (0x555f36dc5bfe in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #52: <unknown function> + 0x192771 (0x555f36dc6771 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #53: <unknown function> + 0x198505 (0x555f36dcc505 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #54: _PyEval_EvalFrameDefault + 0x30a (0x555f36df138a in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #55: <unknown function> + 0x191bfe (0x555f36dc5bfe in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #56: <unknown function> + 0x192771 (0x555f36dc6771 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #57: <unknown function> + 0x198505 (0x555f36dcc505 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #58: _PyEval_EvalFrameDefault + 0x30a (0x555f36df138a in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #59: <unknown function> + 0x1918e4 (0x555f36dc58e4 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #60: <unknown function> + 0x192771 (0x555f36dc6771 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #61: <unknown function> + 0x198505 (0x555f36dcc505 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #62: _PyEval_EvalFrameDefault + 0x10c7 (0x555f36df2147 in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
frame #63: <unknown function> + 0x19253b (0x555f36dc653b in /home/lab/avivshamsian/miniconda3/envs/train_od_model/bin/python)
terminate called without an active exception
Process finished with exit code 134 (interrupted by signal 6: SIGABRT)