Hello,
I’m trying to run a CNN on a gpu through the command prompt (instead of jupyter). I keep getting the following error:
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THC/THCTensorIndex.cu:307: block: [0,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
C:/w/1/s/tmp_conda_3.7_100118/conda/conda-bld/pytorch_1579082551706/work/aten/src/THC/THCTensorIndex.cu:307: block: [0,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
THCudaCheck FAIL file=..\aten\src\THC\THCCachingHostAllocator.cpp line=278 error=710 : device-side assert triggered
Traceback (most recent call last):
File "inspection_model.py", line 544, in <module>
train_roof_df, test_roof_df,train_losses, test_losses = roof_run()
File "inspection_model.py", line 498, in roof_run
outputs = roof_model(image, numerical_data, categorical_data)
File "C:\Users\JORDAN.HOWELL.GITDIR\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\torch\nn\modules\module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "inspection_model.py", line 450, in forward
cat_embedd = torch.cat(embeddings, 1)
RuntimeError: cuda runtime error (710) : device-side assert triggered at ..\aten\src\THC\THCCachingHostAllocator.cpp:278
This does run on the CPU but I have over 100,000 photos so I want my GPUs help.
Here is my model:
class Image_Model(nn.Module):
def __init__(self, embedding_size):
super().__init__()
self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
self.embedding_dropout = nn.Dropout(p = 0.04)
self.cnn = models.vgg19(pretrained=True)
for param in self.cnn.parameters():
param_requires_grad = False
n_features = self.cnn.classifier[6].out_features
self.fc2 = nn.Sequential(nn.Linear(n_features, 1049))
self.fc3 = nn.Sequential(nn.Linear(1049, 256))
self.fc5 = nn.Dropout(p = 0.04)
self.fc9 = nn.Sequential(nn.Linear(256, 2))
def forward(self, image, numerical_columns, cat_columns):
embeddings = []
for i, e in enumerate(self.all_embeddings):
embeddings.append(e(cat_columns[:,i]))
cat_embedd = torch.cat(embeddings, 1)
x = self.cnn(image)
x = torch.cat((x, numerical_columns), dim = 1)
x = torch.cat((x, cat_embedd), dim = 1)
x = F.relu(self.fc2(x))
x = self.fc3(x)
x = self.fc5(x)
x = F.relu(self.fc9(x))
x = F.log_softmax(x)
return x
torch.manual_seed(1010)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
roof_model = Image_Model(embedding_size=embeddings).to(device)
criterion = torch.nn.NLLLoss().to(device)
optimizer = torch.optim.Adam(roof_model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience = 2, verbose = True, min_lr = .00000001)
epochs = 2
def roof_run():
train_losses = np.zeros(epochs)
test_losses = np.zeros(epochs)
t0 = datetime.now()
train_roof_predictions = []
test_roof_predictions = []
for epoch in range(epochs):
print(f"epoch, {epoch}")
train_loss = []
for image, label, policy, categorical_data, numerical_data in train_loader_roof:
image = image.to(device)
label = label.long()
label = label.to(device)
numerical_data = numerical_data.to(device)
categorical_data = categorical_data.to(device)
optimizer.zero_grad()
outputs = roof_model(image, numerical_data, categorical_data)
policy_tuple = (policy.cpu().data.numpy().item(), np.exp(outputs[0][0].cpu().data.numpy()))
train_roof_predictions.append(policy_tuple)
loss = criterion(outputs, label)
train_loss.append(loss.item())
loss.backward()
optimizer.step()
train_loss = np.mean(train_loss)
test_loss = []
print(f"testing for epoch {epoch}")
for image, label, policy, categorical_data, numerical_data in test_loader_roof:
# move data to GPU
image = image.to(device)
label = label.to(device)
policy = policy.to(device)
categorical_data = categorical_data.to(device)
numerical_data = numerical_data.to(device)
outputs = roof_model(image, numerical_data, categorical_data)
policy_test_tuple = (policy.cpu().data.numpy().item(), np.exp(outputs[0][0].cpu().data.numpy()))
test_roof_predictions.append(policy_test_tuple)
loss = criterion(outputs, label)
test_loss.append(loss.item())
test_loss = np.mean(test_loss)
scheduler.step(test_loss)
#save the losses
train_losses[epoch] = train_loss
test_losses[epoch] = test_loss
if epoch % 2 == 0:
print(f"train loss: {train_losses[epoch]}, test loss: {test_losses[epoch]}")
if es.step(test_loss):
break
dt = datetime.now() - t0
print('Duration:', dt)
return train_roof_df, test_roof_df, train_losses, test_losses
train_roof_df, test_roof_df,train_losses, test_losses = roof_run()
I’m not sure what to do from here. The error seems to hit at the following place in the model:
File "inspection_model.py", line 452, in forward cat_embedd = torch.cat(embeddings, 1)
I’m not sure what is wrong at that particular line. It will run on the GPU when I don’t use categorical embeddings but I need those for the predictive power.