Hello,
I’m working with 4 GPUs and getting an error (below):
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [6,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [7,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [8,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [9,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1579022060824/work/aten/src/THC/THCTensorIndex.cu:307: void indexSelectSmallIndex(TensorInfo<T, IndexType>, TensorInfo<T, IndexType>, TensorInfo<long, IndexType>, int, int, IndexType, long) [with T = float, IndexType = unsigned int, DstDim = -1, SrcDim = -1, IdxDim = -1]: block: [0,0,0], thread: [10,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
Traceback (most recent call last):
File "model.py", line 306, in <module>
main()
File "model.py", line 236, in main
mp.spawn(train, nprocs=args.gpus, args=(args,))
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 171, in spawn
while not spawn_context.join():
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/home/scott.farmers/model.py", line 278, in train
y_pred = model(image, numerical_data, categorical_data)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 447, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/scott.farmers/model.py", line 212, in forward
x1 = self.cnn(image)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torchvision/models/resnet.py", line 216, in forward
return self._forward_impl(x)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torchvision/models/resnet.py", line 199, in _forward_impl
x = self.conv1(x)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 345, in forward
return self.conv2d_forward(input, self.weight)
File "/home/scott.farmers/miniconda3/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 342, in conv2d_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
Here are my dataset class and model:
class image_Dataset(Dataset):
    '''
    image class data set
    '''
    def __init__(self, data, transform = None):
        '''
        Args:
        ------------------------------------------------------------
        data = dataframe
        image = column in dataframe with absolute path to the image
        label = column in dataframe that is the target classification variable
        numerical_columns = numerical columns from data
        categorical_columns = categorical columns from data
        policy = ID variable
        '''
        self.image_frame = data
        self.transform = transform

    def __len__(self):
        return len(self.image_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        label = self.image_frame.loc[idx, 'target']
        pic = Path(self.image_frame.loc[idx, 'location_y'])
        img = Image.open(pic)
        policy = self.image_frame.loc[idx, 'policy']
        sample = {'image': img, 'policy': policy, 'label': label}

        numerical_data = self.image_frame.loc[idx, numerical_columns]
        numerical_data = torch.tensor(numerical_data, dtype = torch.float)

        if self.transform:
            image = self.transform(img)

        for category in categorical_columns:
            self.image_frame[category] = self.image_frame[category].astype('category')
            self.image_frame[category] = self.image_frame[category].astype('category').cat.codes.values

        # categorical_column_sizes = [len(self.image_frame[column].astype('category')) for column in categorical_columns]
        # categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_column_sizes]

        categorical_data = self.image_frame.loc[idx, categorical_columns]
        categorical_data = torch.tensor(categorical_data, dtype = torch.int64)

        return image, label, policy, numerical_data, categorical_data
data_train_loader = image_Dataset(data = df_train1, transform = train_transform)
train_loader = DataLoader(data_train_loader, batch_size=10, shuffle = True)
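
One way to inspect the tensors the model will receive is to pull a single batch from this loader (this snippet is not part of the training script, just a sanity check using the same objects defined above):

# hypothetical sanity check: grab one batch and print the shapes the model sees
image, label, policy, numerical_data, categorical_data = next(iter(train_loader))
print(image.shape, numerical_data.shape, categorical_data.shape)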
class Image_Embedd(nn.Module):

    def __init__(self, embedding_size):
        '''
        Args
        ---------------------------
        embedding_size: Contains the embedding size for the categorical columns
        num_numerical_cols: Stores the total number of numerical columns
        output_size: The size of the output layer or the number of possible outputs.
        layers: List which contains number of neurons for all the layers.
        p: Dropout with the default value of 0.5
        '''
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p = .04)

        self.cnn = models.resnet50(pretrained=True)
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 1000)

        self.fc1 = nn.Linear(1000, 1071)
        self.fc2 = nn.Linear(1071, 128)
        self.fc3 = nn.Linear(128, 2)

    # define the forward method
    def forward(self, image, x_numerical, x_categorical):
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:, i]))

        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)

        x1 = self.cnn(image)
        x2 = x_numerical
        x3 = torch.cat((x1, x2), dim = 1)
        x4 = torch.cat((x, x3), dim = 1)
        x4 = F.relu(self.fc2(x4))
        x4 = self.fc3(x4)
        x4 = F.log_softmax(x4, dim = 1)
        return x4
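
For reference, the shape bookkeeping I expect through forward (the exact numbers depend on my columns, so these are placeholders):

# rough shape bookkeeping for forward (placeholder dims):
#   x  -> (batch, sum of embedding dims)
#   x1 -> (batch, 1000)   resnet50 with its fc replaced
#   x2 -> (batch, number of numerical columns)
#   x4 -> (batch, sum of embedding dims + 1000 + number of numerical columns), which has to equal fc2's 1071 inputs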
Here are my training function and launcher:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default = 1, type = int, metavar='N',
                        help = 'number of nodes (default = 1)')
    parser.add_argument('-g', '--gpus', default = 1, type = int,
                        help = 'number of gpus per node')
    parser.add_argument('-nr', '--nr', default = 0, type = int,
                        help = 'ranking within the nodes')
    parser.add_argument('--epochs', default = 2, type = int, metavar='N',
                        help = 'number of total epochs to run')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    mp.spawn(train, nprocs=args.gpus, args=(args,))


def train(gpu, args):
    epochs = 500
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend = 'gloo', init_method = 'env://', world_size = args.world_size, rank = rank)
    torch.manual_seed(0)

    model = Image_Embedd(embedding_size=train_categorical_embedding_sizes)
    model.cuda(gpu)
    # torch.cuda.set_device(gpu)

    max_trn_batch = 11053

    criterion = torch.nn.NLLLoss().cuda(gpu)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    model.cuda(gpu)

    start = datetime.now()
    total_step = len(train_loader)

    for i in range(epochs):
        for b, (image, label, policy, numerical_data, categorical_data) in enumerate(train_loader):
            image = image.cuda(non_blocking=True)
            label = label.cuda(non_blocking=True)
            numerical_data = numerical_data.cuda(non_blocking=True)
            categorical_data = categorical_data.cuda(non_blocking=True)

            optimizer.zero_grad()

            # count batches
            b += 1

            # throttle the batches
            if b == max_trn_batch:
                break

            y_pred = model(image, numerical_data, categorical_data)
            single_loss = criterion(y_pred, label)

            # statistics
            print("working")

            single_loss.backward()
            optimizer.step()

            if (b + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    i + 1,
                    args.epochs,
                    b + 1,
                    total_step,
                    single_loss.item())
                )

        # aggregated_losses.append(single_loss.cpu().data.numpy())
        # scheduler.step(single_loss)

    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))


if __name__ == '__main__':
    main()
This is my first time working with multiple GPUs. I tried switching the backend from gloo to nccl with no luck (the exact change I tried is shown below). Any help would be greatly appreciated.
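
For reference, the nccl attempt was just the backend string in the same init_process_group call from train(), everything else unchanged:

# identical to the call in train() above, only 'gloo' swapped for 'nccl'
dist.init_process_group(backend = 'nccl', init_method = 'env://', world_size = args.world_size, rank = rank)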