Hi, all. I am getting a weird illegal memory access error whenever I try to train a Faster R-CNN model with an image size of (1280, 840, 3) and a batch size of 3. The GPU is a Tesla K80 with CUDA 10.1 on Ubuntu, and I am using PyTorch 1.5 and torchvision 0.6. Given below is the code snippet.
def from_numpy_to_tensor(images, labels_list):
    # Move the images and every target tensor (boxes, labels) to the GPU.
    images = torch.from_numpy(images).cuda()
    for label in labels_list:
        label["boxes"] = torch.from_numpy(label["boxes"]).cuda()
        label["labels"] = torch.from_numpy(label["labels"]).cuda()
    return images, labels_list
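For context, this is the kind of input layout I am feeding in (placeholder shapes and values below, not my real data); as far as I understand, the torchvision detection models want each image as a float (C, H, W) tensor, boxes as float32 (x1, y1, x2, y2), and class labels as int64:

# Placeholder example of the assumed input layout (dummy data only).
import numpy as np

images = np.random.rand(3, 3, 840, 1280).astype(np.float32)  # (N, C, H, W), values in [0, 1]
labels = [
    {
        "boxes": np.array([[10.0, 20.0, 200.0, 300.0]], dtype=np.float32),  # (x1, y1, x2, y2)
        "labels": np.array([1], dtype=np.int64),                            # class indices
    }
    for _ in range(3)
]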
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, xtr, ytr):
        self.xtr = xtr
        self.ytr = ytr

    def __getitem__(self, idx):
        img = self.xtr[idx]
        tar = self.ytr[idx]
        return img, tar

    def __len__(self):
        return len(self.xtr)
def collate_fn(batch):
    # Regroup a list of (image, target) pairs into (images, targets).
    return list(zip(*batch))
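In case it matters, collate_fn only regroups the batch into a tuple of images and a tuple of targets, e.g. (toy values just to show the shape of the output):

# Hypothetical mini-batch of two samples to illustrate what collate_fn returns.
batch = [("img0", {"boxes": "b0"}), ("img1", {"boxes": "b1"})]
images_batch, targets_batch = collate_fn(batch)
# images_batch  -> ("img0", "img1")
# targets_batch -> ({"boxes": "b0"}, {"boxes": "b1"})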
device = torch.device("cuda:0")
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False, num_classes=4)
#model = nn.DataParallel(model, device_ids=[0, 1, 2, 3])
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.000001)

x_train, y_train = from_numpy_to_tensor(images, labels)
dataset = CustomDataset(x_train, y_train)
dataloader = DataLoader(dataset, batch_size=3, collate_fn=collate_fn)

model.train()
for i in range(epochs):
    print("Iter:", i)
    logs = train_one_epoch(model, optimizer, dataloader, device, i, 10)
    print(logs)
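For reference, the part of train_one_epoch that the traceback points to boils down to roughly this (simplified from the torchvision detection reference engine; logging and warmup omitted):

# Simplified sketch of the training step inside train_one_epoch (not the exact code).
for images, targets in dataloader:
    images = [img.to(device) for img in images]
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

    loss_dict = model(images, targets)            # dict of individual losses
    losses = sum(loss for loss in loss_dict.values())

    optimizer.zero_grad()
    losses.backward()                             # <- where the CUDA error is raised
    optimizer.step()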
The error I get is the following.
Traceback (most recent call last):
File "multi_gpu_FRCNN.py", line 125, in <module>
logs = train_one_epoch(model,optimizer,dataloader,device,i,10)
File "/workspace/Pytorch tutorials/engine.py", line 46, in train_one_epoch
losses.backward()
File "/opt/conda/lib/python3.7/site-packages/torch/tensor.py", line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py", line 100, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA error: an illegal memory access was encountered (launch_kernel at /opt/conda/conda-bld/pytorch_1587428398394/work/aten/src/ATen/native/cuda/CUDALoops.cuh:112)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x4e (0x7f685af93b5e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: void at::native::gpu_index_kernel<__nv_dl_wrapper_t<__nv_dl_tag<void (*)(at::TensorIterator&, c10::ArrayRef<long>, c10::ArrayRef<long>), &(void at::native::index_put_kernel_impl<at::native::OpaqueType<4> >(at::TensorIterator&, c10::ArrayRef<long>, c10::ArrayRef<long>)), 1u>> >(at::TensorIterator&, c10::ArrayRef<long>, c10::ArrayRef<long>, __nv_dl_wrapper_t<__nv_dl_tag<void (*)(at::TensorIterator&, c10::ArrayRef<long>, c10::ArrayRef<long>), &(void at::native::index_put_kernel_impl<at::native::OpaqueType<4> >(at::TensorIterator&, c10::ArrayRef<long>, c10::ArrayRef<long>)), 1u>> const&) + 0x797 (0x7f685d77d227 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x25b9a64 (0x7f685d779a64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xb610cf (0x7f6882cd60cf in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::native::_index_put_impl_(at::Tensor&, c10::ArrayRef<at::Tensor>, at::Tensor const&, bool, bool) + 0x491 (0x7f6882cd3901 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0xee23de (0x7f68830573de in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #6: at::native::index_put_(at::Tensor&, c10::ArrayRef<at::Tensor>, at::Tensor const&, bool) + 0x135 (0x7f6882cc3255 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xee210e (0x7f688305710e in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x288fa88 (0x7f6884a04a88 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #9: torch::autograd::generated::IndexPutBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x251 (0x7f68847cf201 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x2ae8215 (0x7f6884c5d215 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0x16f3 (0x7f6884c5a513 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x7f6884c5b2f2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int) + 0x39 (0x7f6884c53969 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x7f6887f9a558 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xc819d (0x7f688a9fd19d in /opt/conda/lib/python3.7/site-packages/torch/lib/../../../.././libstdc++.so.6)
frame #16: <unknown function> + 0x76db (0x7f68a2f046db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #17: clone + 0x3f (0x7f68a2c2d88f in /lib/x86_64-linux-gnu/libc.so.6)
The train_one_epoch function is taken from here. I have already tried setting os.environ["CUDA_LAUNCH_BLOCKING"] = "1", but it hasn't made any difference. Is there something wrong with this code?
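For completeness, this is how I set CUDA_LAUNCH_BLOCKING; as far as I know it only takes effect if it is set before the CUDA context is created, so I set it at the very top of the script (and also tried exporting it from the shell):

# Must be set before the first CUDA call, otherwise it has no effect.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # CUDA context is created later, so the flag is picked up

# Shell equivalent:
#   CUDA_LAUNCH_BLOCKING=1 python multi_gpu_FRCNN.py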