I am getting the following RuntimeError when running the learning loop for a CNN image classification task.
RuntimeError: Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop samples = collate_fn([dataset[i] for i in batch_indices]) File "", line 29, in __getitem__ img = torch.from_numpy(np.asarray(img, dtype=np.double)).to(device=self.device) RuntimeError: cuda runtime error (3) : initialization error at /opt/conda/conda-bld/pytorch_1524577523076/work/aten/src/THC/generic/THCStorage.c:60
When my device is cpu
, everything is fine. I get no errors and can even go for as many epochs as I want. However, when I switch to cuda
device, then that RuntimeError is thrown. The stacktrace points to when I enumerate my DataLoader
.
My Dataset
looks like the following.
class ImageData(Dataset):
def __init__(self, device, width=256, height=256, transform=None):
self.device = device
self.width = width
self.height = height
self.transform = transform
_, _, y, _, x = self.__get_images_profile__()
self.y = y
self.x = x
def __getitem__(self, index):
img = Image.open(self.x[index])
img = img.resize((self.width, self.height))
img = img.convert('RGB')
if self.transform is not None:
img = self.transform(img)
img = np.asarray(img).transpose(-1, 0, 1)
img = img / 255.0
img = torch.from_numpy(np.asarray(img, dtype=np.double)).to(device=self.device)
label = torch.tensor(self.y[index], dtype=torch.long, device=self.device)
return img, label
def __len__(self):
return len(self.x)
def __get_images_profile__(self):
# code omitted
I then create my DataLoader
as follows.
device = torch.device('cpu')
if torch.cuda.is_available() is True:
device = torch.device('cuda')
dataset = ImageData(device=device, width=32, height=32)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)
My CNN is defined as follows.
class Net(torch.nn.Module):
#Our batch shape for input x is (3, 32, 32)
def __init__(self):
super(Net, self).__init__()
#Input channels = 3, output channels = 18
self.conv1 = torch.nn.Conv2d(3, 18, kernel_size=3, stride=1, padding=1)
self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
#4608 input features, 64 output features (see sizing flow below)
self.fc1 = torch.nn.Linear(18 * 16 * 16, 64)
#64 input features, 10 output features for our 10 defined classes
self.fc2 = torch.nn.Linear(64, 14942)
def forward(self, x):
x = x.float()
#Computes the activation of the first convolution
#Size changes from (3, 32, 32) to (18, 32, 32)
x = F.relu(self.conv1(x))
#Size changes from (18, 32, 32) to (18, 16, 16)
x = self.pool(x)
#Reshape data to input to the input layer of the neural net
#Size changes from (18, 16, 16) to (1, 4608)
#Recall that the -1 infers this dimension from the other given dimension
x = x.view(-1, 18 * 16 *16)
#Computes the activation of the first fully connected layer
#Size changes from (1, 4608) to (1, 64)
x = F.relu(self.fc1(x))
#Computes the second fully connected layer (activation applied later)
#Size changes from (1, 64) to (1, 10)
x = self.fc2(x)
return(x)
net = Net().to(device=device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
And finally, my learning loop is as follows.
for epoch in range(1): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(dataloader, 0): # The RuntimeError is thrown here
# get the inputs
inputs, labels = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 5000 == 0: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 5000))
running_loss = 0.0
print('Finished Training')
Any ideas on what is going on? I’ve tested on both PyTorch with Python 2.7 and 3.6 without success and the exact same RuntimeError is thrown.
Here’s the environment.
- AWS p2.8xlarge
- AWS Deep Learning AMI
- Python 2.7 and 3.6 with Conda 4.4.10
-
nvcc --version
shows Cuda compilation tools, release 9.0, V9.0.176 -
pip list | grep torch
shows torch 0.4.0 and torchvision 0.2.1
For completeness, here’s the stacktrace as shown in Jupyter.
RuntimeError Traceback (most recent call last)
in ()
2
3 running_loss = 0.0
----> 4 for i, data in enumerate(dataloader, 0):
5 # get the inputs
6 inputs, labels = data
/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in next(self)
284 self.reorder_dict[idx] = batch
285 continue
–> 286 return self._process_next_batch(batch)
287
288 next = next # Python 2 compatibility
/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in _process_next_batch(self, batch)
305 self._put_indices()
306 if isinstance(batch, ExceptionWrapper):
–> 307 raise batch.exc_type(batch.exc_msg)
308 return batch
309
RuntimeError: Traceback (most recent call last):
File “/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.py”, line 57, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File “”, line 29, in getitem
img = torch.from_numpy(np.asarray(img, dtype=np.double)).to(device=self.device)
RuntimeError: cuda runtime error (3) : initialization error at /opt/conda/conda-bld/pytorch_1524577523076/work/aten/src/THC/generic/THCStorage.c:60