RuntimeError with initialization at THCStorage.c

I am getting the following RuntimeError when running the learning loop for a CNN image classification task.

RuntimeError: Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "", line 29, in __getitem__
    img = torch.from_numpy(np.asarray(img, dtype=np.double)).to(device=self.device)
RuntimeError: cuda runtime error (3) : initialization error at /opt/conda/conda-bld/pytorch_1524577523076/work/aten/src/THC/generic/THCStorage.c:60

When my device is cpu, everything is fine. I get no errors and can even go for as many epochs as I want. However, when I switch to cuda device, then that RuntimeError is thrown. The stacktrace points to when I enumerate my DataLoader.

My Dataset looks like the following.

class ImageData(Dataset):
    def __init__(self, device, width=256, height=256, transform=None):
        self.device = device
        self.width = width
        self.height = height
        self.transform = transform
        _, _, y, _, x = self.__get_images_profile__()
        self.y = y
        self.x = x
        
    def __getitem__(self, index):
        img = Image.open(self.x[index])
        img = img.resize((self.width, self.height))
        img = img.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        
        img = np.asarray(img).transpose(-1, 0, 1)
        img = img / 255.0
        img = torch.from_numpy(np.asarray(img, dtype=np.double)).to(device=self.device)
        label = torch.tensor(self.y[index], dtype=torch.long, device=self.device) 
        return img, label
    
    def __len__(self):
        return len(self.x)
    
    def __get_images_profile__(self):
        # code omitted

I then create my DataLoader as follows.

device = torch.device('cpu')
if torch.cuda.is_available() is True:
    device = torch.device('cuda')

dataset = ImageData(device=device, width=32, height=32)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True, num_workers=1)

My CNN is defined as follows.

class Net(torch.nn.Module):
    
    #Our batch shape for input x is (3, 32, 32)
    
    def __init__(self):
        super(Net, self).__init__()
        
        #Input channels = 3, output channels = 18
        self.conv1 = torch.nn.Conv2d(3, 18, kernel_size=3, stride=1, padding=1)
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        #4608 input features, 64 output features (see sizing flow below)
        self.fc1 = torch.nn.Linear(18 * 16 * 16, 64)
        
        #64 input features, 10 output features for our 10 defined classes
        self.fc2 = torch.nn.Linear(64, 14942)
        
    def forward(self, x):
        x = x.float()
        #Computes the activation of the first convolution
        #Size changes from (3, 32, 32) to (18, 32, 32)
        x = F.relu(self.conv1(x))
        
        #Size changes from (18, 32, 32) to (18, 16, 16)
        x = self.pool(x)
        
        #Reshape data to input to the input layer of the neural net
        #Size changes from (18, 16, 16) to (1, 4608)
        #Recall that the -1 infers this dimension from the other given dimension
        x = x.view(-1, 18 * 16 *16)
        
        #Computes the activation of the first fully connected layer
        #Size changes from (1, 4608) to (1, 64)
        x = F.relu(self.fc1(x))
        
        #Computes the second fully connected layer (activation applied later)
        #Size changes from (1, 64) to (1, 10)
        x = self.fc2(x)
        return(x)

net = Net().to(device=device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

And finally, my learning loop is as follows.

for epoch in range(1):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(dataloader, 0): # The RuntimeError is thrown here
        # get the inputs
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 5000 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 5000))
            running_loss = 0.0

print('Finished Training')

Any ideas on what is going on? I’ve tested on both PyTorch with Python 2.7 and 3.6 without success and the exact same RuntimeError is thrown.

Here’s the environment.

  • AWS p2.8xlarge
  • AWS Deep Learning AMI
  • Python 2.7 and 3.6 with Conda 4.4.10
  • nvcc --version shows Cuda compilation tools, release 9.0, V9.0.176
  • pip list | grep torch shows torch 0.4.0 and torchvision 0.2.1

For completeness, here’s the stacktrace as shown in Jupyter.

RuntimeError Traceback (most recent call last)
in ()
2
3 running_loss = 0.0
----> 4 for i, data in enumerate(dataloader, 0):
5 # get the inputs
6 inputs, labels = data

/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in next(self)
284 self.reorder_dict[idx] = batch
285 continue
–> 286 return self._process_next_batch(batch)
287
288 next = next # Python 2 compatibility

/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc in _process_next_batch(self, batch)
305 self._put_indices()
306 if isinstance(batch, ExceptionWrapper):
–> 307 raise batch.exc_type(batch.exc_msg)
308 return batch
309

RuntimeError: Traceback (most recent call last):
File “/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/utils/data/dataloader.py”, line 57, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File “”, line 29, in getitem
img = torch.from_numpy(np.asarray(img, dtype=np.double)).to(device=self.device)
RuntimeError: cuda runtime error (3) : initialization error at /opt/conda/conda-bld/pytorch_1524577523076/work/aten/src/THC/generic/THCStorage.c:60

Hi,

Were you able to solve this?