Hello everyone, I’m trying to train a simple architecture on ImageNet on multiple GPUs.
It works fine with one GPU; however, when I try to use multiple GPUs, it crashes with this error:
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)
This is the architecture:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
class simpnet_imgnet_drpall(nn.Module):
    """SimpNet variant for ImageNet classification.

    Args:
        classes: number of output classes (default 1000).
        scale: width multiplier applied to every conv layer's filter count.
        network_idx: base configuration selector, 0 -> simpnet5m, 1 -> simpnet8m.
        mode: stride-schedule selector (1..5); see ``self.strides``.
        simpnet_name: informational name; not used internally.
    """

    def __init__(self, classes=1000, scale=1.0, network_idx=0, mode=1, simpnet_name='simpnet_imgnet_drpall'):
        super(simpnet_imgnet_drpall, self).__init__()
        # ['C', n] -> 3x3 conv with round(n * scale) filters; ['P'] -> an extra max-pool stage.
        self.cfg = {
            'simpnet5m': [['C', 66], ['C', 128], ['C', 128], ['C', 128], ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 288], ['P'], ['C', 288], ['C', 355], ['C', 432]],
            'simpnet8m': [['C', 128], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 430], ['P'], ['C', 430], ['C', 455], ['C', 600]]}
        self.scale = scale
        self.networks = ['simpnet5m', 'simpnet8m']
        self.network_idx = network_idx
        self.mode = mode
        # Per-mode conv strides for the first len(schedule) conv layers; a max-pool
        # is inserted once the schedule is exhausted (see _make_layers).
        self.strides = {1: [2, 2, 2, 1, 1],        # s1
                        2: [2, 2, 1, 2, 1, 1],     # s4
                        3: [2, 2, 1, 1, 2, 1],     # s3
                        4: [2, 1, 2, 1, 2, 1],     # s5
                        5: [2, 1, 2, 1, 2, 1, 1]}  # s6
        self.features = self._make_layers(scale)
        self.classifier = nn.Linear(round(self.cfg[self.networks[network_idx]][-1][1] * scale), classes)

    def load_my_state_dict(self, state_dict):
        """Copy matching entries of ``state_dict`` into this model.

        Keys missing from the model are skipped; size mismatches are reported
        and the model keeps its initial parameters for those entries.
        """
        own_state = self.state_dict()
        for name, param in state_dict.items():
            # Strip the 'module.' prefix that nn.DataParallel adds to checkpoint keys.
            name = name.replace('module.', '')
            if name not in own_state:
                continue
            # BUGFIX: `Parameter` was an undefined name here (NameError at runtime);
            # it must be referenced through the imported `nn` namespace.
            if isinstance(param, nn.Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            print("STATE_DICT: {}".format(name))
            try:
                own_state[name].copy_(param)
            except RuntimeError:  # BUGFIX: narrowed from a bare `except:` (size mismatch raises RuntimeError)
                print('While copying the parameter named {}, whose dimensions in the model are'
                      ' {} and whose dimensions in the checkpoint are {}, ... Using Initial Params'.format(
                          name, own_state[name].size(), param.size()))

    def forward(self, x):
        out = self.features(x)
        # Global max pooling over the remaining spatial dimensions.
        out = F.max_pool2d(out, kernel_size=out.size()[2:])
        # NOTE(review): with training=False this dropout is a no-op; kept as-is to
        # preserve the original behavior -- confirm whether it was intended.
        out = F.dropout2d(out, 0.01, training=False)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, scale):
        """Build the feature extractor from ``self.cfg`` and the stride schedule."""
        layers = []
        input_channel = 3
        idx = 0  # counts conv ('C') entries only; 'P' entries do not advance it
        for x in self.cfg[self.networks[self.network_idx]]:
            # Insert a pooling stage when the stride schedule is exhausted, or
            # when the config explicitly requests one ('P').
            if idx == len(self.strides[self.mode]) or x[0] == 'P':
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False),
                           nn.Dropout2d(p=0.00)]
            if x[0] != 'C':
                continue
            filters = round(x[1] * scale)
            if idx < len(self.strides[self.mode]):
                stride = self.strides[self.mode][idx]
            else:
                stride = 1
            if idx in (len(self.strides[self.mode]) - 1, 9, 12):
                # These conv stages omit the trailing dropout.
                layers += [nn.Conv2d(input_channel, filters, kernel_size=(3, 3), stride=(stride, stride), padding=(1, 1)),
                           nn.BatchNorm2d(filters, eps=1e-05, momentum=0.05, affine=True),
                           nn.ReLU(inplace=True)]
            else:
                layers += [nn.Conv2d(input_channel, filters, kernel_size=(3, 3), stride=(stride, stride), padding=(1, 1)),
                           nn.BatchNorm2d(filters, eps=1e-05, momentum=0.05, affine=True),
                           nn.ReLU(inplace=True),
                           nn.Dropout2d(p=0.000)]
            input_channel = filters
            idx += 1
        model = nn.Sequential(*layers)
        print(model)
        # BUGFIX: iterate over the freshly built `model`, not `self.modules()`.
        # This method runs from __init__ *before* `self.features` is assigned, so
        # the original loop saw no Conv2d modules and never initialized anything.
        for m in model.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
        return model
And in case it helps, here is the same network written out layer by layer:
# def _make_layers(self, scale=1):
# model = nn.Sequential(
# nn.Conv2d(3, round(66*scale), kernel_size=[3, 3], stride=(2, 2), padding=(1, 1)),
# nn.BatchNorm2d(round(66*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.000),
# nn.Conv2d(round(66*scale), round(128*scale), kernel_size=[3, 3], stride=(2, 2), padding=(1, 1)),
# nn.BatchNorm2d(round(128*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.000),
# nn.Conv2d(round(128*scale), round(128*scale), kernel_size=[3, 3], stride=(2, 2), padding=(1, 1)),
# nn.BatchNorm2d(round(128*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.000),
# nn.Conv2d(round(128*scale), round(128*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(128*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.000),
# nn.Conv2d(round(128*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False),
# nn.Dropout2d(p=0.00),
# nn.Conv2d(round(192*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.0),
# nn.Conv2d(round(192*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.0),
# nn.Conv2d(round(192*scale),round( 192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.0),
# nn.Conv2d(round(192*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.0),
# nn.Conv2d(round(192*scale), round(288*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(288*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False),
# nn.Dropout2d(p=0.00),
# nn.Conv2d(round(288*scale), round(288*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(288*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.01),
# nn.Conv2d(round(288*scale), round(355*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(355*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# nn.Dropout2d(p=0.01),
# nn.Conv2d(round(355*scale), round(432*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
# nn.BatchNorm2d(round(432*scale), eps=1e-05, momentum=0.05, affine=True),
# nn.ReLU(inplace=True),
# )
# for m in self.modules():
# if isinstance(m, nn.Conv2d):
# nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
# return model
This is the gpustat output on my system:
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,
GPU_IDs=0,1,2,3,4,5
(pytorch3.5) shisho@sama:~/MyProjects/shishosama/SimpleNetV2_Pytorch$ gpustat
sama Sat Dec 22 14:37:35 2018
[0] TITAN Xp | 36'C, 0 % | 0 / 12196 MB |
[1] TITAN Xp | 29'C, 0 % | 0 / 12196 MB |
[2] TITAN Xp | 33'C, 0 % | 0 / 12196 MB |
[3] TITAN Xp | 27'C, 0 % | 0 / 12196 MB |
[4] TITAN Xp | 21'C, 0 % | 0 / 12196 MB |
[5] TITAN Xp | 24'C, 0 % | 0 / 12196 MB |
[6] Quadro P6000 | 22'C, 0 % | 0 / 24449 MB |
[7] Quadro P6000 | 20'C, 0 % | 0 / 24449 MB |
And this is the full error log:
(pytorch3.5) shisho@sama:~/MyProjects/shishosama/SimpleNetV2_Pytorch$ ./training_sequence.sh
device is cuda:0
=> creating model 'simpnet_imgnet_drpall_s2_1.0'
Sequential(
(0): Conv2d(3, 66, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
(1): BatchNorm2d(66, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(2): ReLU(inplace)
(3): Dropout2d(p=0.0)
(4): Conv2d(66, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
(5): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(6): ReLU(inplace)
(7): Dropout2d(p=0.0)
(8): Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(9): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(10): ReLU(inplace)
(11): Dropout2d(p=0.0)
(12): Conv2d(128, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
(13): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(14): ReLU(inplace)
(15): Dropout2d(p=0.0)
(16): Conv2d(128, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(17): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(18): ReLU(inplace)
(19): Dropout2d(p=0.0)
(20): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(21): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(22): ReLU(inplace)
(23): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
(24): Dropout2d(p=0.0)
(25): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(26): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(27): ReLU(inplace)
(28): Dropout2d(p=0.0)
(29): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(30): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(31): ReLU(inplace)
(32): Dropout2d(p=0.0)
(33): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(34): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(35): ReLU(inplace)
(36): Dropout2d(p=0.0)
(37): Conv2d(192, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(38): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(39): ReLU(inplace)
(40): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
(41): Dropout2d(p=0.0)
(42): Conv2d(288, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(43): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(44): ReLU(inplace)
(45): Dropout2d(p=0.0)
(46): Conv2d(288, 355, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(47): BatchNorm2d(355, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(48): ReLU(inplace)
(49): Dropout2d(p=0.0)
(50): Conv2d(355, 432, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(51): BatchNorm2d(432, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(52): ReLU(inplace)
)
=> Model : simpnet_imgnet_drpall(
(features): Sequential(
(0): Conv2d(3, 66, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
(1): BatchNorm2d(66, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(2): ReLU(inplace)
(3): Dropout2d(p=0.0)
(4): Conv2d(66, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
(5): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(6): ReLU(inplace)
(7): Dropout2d(p=0.0)
(8): Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(9): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(10): ReLU(inplace)
(11): Dropout2d(p=0.0)
(12): Conv2d(128, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
(13): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(14): ReLU(inplace)
(15): Dropout2d(p=0.0)
(16): Conv2d(128, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(17): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(18): ReLU(inplace)
(19): Dropout2d(p=0.0)
(20): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(21): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(22): ReLU(inplace)
(23): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
(24): Dropout2d(p=0.0)
(25): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(26): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(27): ReLU(inplace)
(28): Dropout2d(p=0.0)
(29): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(30): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(31): ReLU(inplace)
(32): Dropout2d(p=0.0)
(33): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(34): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(35): ReLU(inplace)
(36): Dropout2d(p=0.0)
(37): Conv2d(192, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(38): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(39): ReLU(inplace)
(40): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
(41): Dropout2d(p=0.0)
(42): Conv2d(288, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(43): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(44): ReLU(inplace)
(45): Dropout2d(p=0.0)
(46): Conv2d(288, 355, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(47): BatchNorm2d(355, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(48): ReLU(inplace)
(49): Dropout2d(p=0.0)
(50): Conv2d(355, 432, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
(51): BatchNorm2d(432, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
(52): ReLU(inplace)
)
(classifier): Linear(in_features=432, out_features=1000, bias=True)
)
=> parameter : Namespace(arch='simpnet_imgnet_drpall', batch_size=6, data='/home/shisho/DATASETS/imagenet', epochs=900, evaluate=False, gpus=[0, 1, 2, 3, 4, 5], lr=0.045, momentum=0.9, netidx=0, prefix='2018-12-22-909', print_freq=200, resume='', save_dir='./snapshots/imagenet/simplenetv2s/5mil', scale=1.0, smode=2, start_epoch=0, train_dir_name='train/', val_dir_name='val/', weight_decay=4e-05, workers=24)
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
excessive contents removed....
================================================================
Total params: 11,813,786
Trainable params: 11,813,786
Non-trainable params: 0
----------------------------------------------------------------
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
excessive contents removed....
================================================================
Total params: 11,813,786
Trainable params: 11,813,786
Non-trainable params: 0
----------------------------------------------------------------
=> optimizer : RMSprop (
Parameter Group 0
alpha: 0.9
centered: False
eps: 1
lr: 0.045
momentum: 0.9
weight_decay: 4e-05
)'
FLOPs: 1716.95M, Params: 5.91M
==>>[2018-12-22 14:37:59] [Epoch=000/900] [Need: 00:00:00] [learning_rate=0.045918] [Best : Accuracy(T1/T5)=0.00/0.00, Error=100.00/100.00]
Traceback (most recent call last):
File "imagenet_train.py", line 656, in <module>
main()
File "imagenet_train.py", line 287, in main
tr_prec1, tr_prec5, tr_loss = train(train_loader, model, criterion, optimizer, epoch, log)
File "imagenet_train.py", line 365, in train
output = model(input_var)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 143, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 153, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply
raise output
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker
output = module(*input, **kwargs)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/shishosama/MyProjects/SeyyedHossein/SimpleNetV2_Pytorch/models/simpnet_imgnet_drpall.py", line 67, in forward
out = self.features(x)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/container.py", line 92, in forward
input = module(input)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 320, in forward
self.padding, self.dilation, self.groups)
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)
The contents of training_sequence.sh are as follows:
#SimpleNetV2-5Mil
# NOTE(review): this script relies on IMAGENET_DIR, TRAINING_DIR, VAL_DIR,
# MODEL_NAME, WORKER, EPOCHS, BATCH_SIZE and GPU_IDs being exported by the
# calling environment -- confirm they are set before running.
NETWORK_IDX=0
# NOTE: SMODE=2 below is never used in this first section; the loop passes $i
# (2, then 1) to --smode instead.
SMODE=2
SCALE=1.0
SAVE_DIR=./snapshots/imagenet/simplenetv2s/5mil
CHECKPOINT=./snapshots/imagenet/simplenetv2s/5mil/chkpt_simplenet_imgnet_here!
# testing mode 1 and 2 for 5mil arch
for (( i=2; i >= 1; i-- ))
do
python imagenet_train.py $IMAGENET_DIR --train_dir_name $TRAINING_DIR --val_dir_name $VAL_DIR --arch $MODEL_NAME \
--save_dir $SAVE_DIR -j $WORKER --epochs $EPOCHS --batch-size $BATCH_SIZE --netidx $NETWORK_IDX --scale $SCALE --smode $i --gpus $GPU_IDs #--resume $CHECKPOINT
done
#SimpleNetV2-8Mil
# Single run of the 8M-parameter config with stride mode 1.
NETWORK_IDX=1
SMODE=1
SCALE=1.0
SAVE_DIR=./snapshots/imagenet/simplenetv2s/8mil
CHECKPOINT=./snapshots/imagenet/simplenetv2s/8mil/chkpt_simplenet_imgnet_here!
python imagenet_train.py $IMAGENET_DIR --train_dir_name $TRAINING_DIR --val_dir_name $VAL_DIR --arch $MODEL_NAME \
--save_dir $SAVE_DIR -j $WORKER --epochs $EPOCHS --batch-size $BATCH_SIZE --netidx $NETWORK_IDX --scale $SCALE --smode $SMODE --gpus $GPU_IDs #--resume $CHECKPOINT
And finally, this is the training script I'm using for this.
What am I missing here?
Any help is greatly appreciated.