Getting an error while trying to train a simple network on ImageNet using multiple GPUs

Hello everyone, I'm trying to train a simple architecture on ImageNet on multiple GPUs.
It works fine with one GPU; however, when I try to use multiple GPUs, it crashes with this error:

RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)

This is the architecture:

import torch
import torch.nn as nn
from torch.nn import Parameter  # used by the isinstance check in load_my_state_dict
from torch.autograd import Variable
import torch.nn.functional as F

class simpnet_imgnet_drpall(nn.Module):
    """
    args: classes 
    scale 
    network_idx (0,1):simpnet5m, simpnet8m
    mode : stride mode (1,2,3,4,5) 


    """
    def __init__(self, classes=1000, scale=1.0, network_idx=0, mode=1, simpnet_name='simpnet_imgnet_drpall'):
        super(simpnet_imgnet_drpall, self).__init__()
        self.cfg = {
        'simpnet5m': [['C', 66], ['C', 128], ['C', 128], ['C', 128], ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 192], ['C', 288], ['P'], ['C', 288], ['C', 355], ['C', 432]],
        'simpnet8m': [['C', 128], ['C', 182], ['C', 182], ['C', 182], ['C', 182],  ['C', 182], ['C', 182], ['C', 182], ['C', 182], ['C', 430], ['P'], ['C', 430], ['C', 455], ['C', 600]]}
        self.scale = scale
        self.networks = ['simpnet5m', 'simpnet8m']
        self.network_idx = network_idx
        self.mode = mode
        self.strides = {1: [2, 2, 2, 1, 1],      #s1
                        2: [2, 2, 1, 2, 1, 1],   #s4
                        3: [2, 2, 1, 1, 2, 1],   #s3
                        4: [2, 1, 2, 1, 2, 1],   #s5
                        5: [2, 1, 2, 1, 2, 1, 1]}#s6

        self.features = self._make_layers(scale)
        self.classifier = nn.Linear(round(self.cfg[self.networks[network_idx]][-1][1] * scale), classes)

    def load_my_state_dict(self, state_dict):

        own_state = self.state_dict()

        for name, param in state_dict.items():
            name = name.replace('module.', '')
            if name not in own_state:
                continue
            if isinstance(param, Parameter):
                # backwards compatibility for serialized parameters
                param = param.data
            print("STATE_DICT: {}".format(name))
            try:
                own_state[name].copy_(param)
            except Exception:
                print('While copying the parameter named {}, whose dimensions in the model are'
                      ' {} and whose dimensions in the checkpoint are {}, ... Using Initial Params'.format(
                    name, own_state[name].size(), param.size()))

    def forward(self, x):
        out = self.features(x)

        #Global Max Pooling
        out = F.max_pool2d(out, kernel_size=out.size()[2:]) 
        out = F.dropout2d(out, 0.01, training=False)

        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out


    def _make_layers(self, scale):
        layers = []
        input_channel = 3
        idx = 0
        
        for x in self.cfg[self.networks[self.network_idx]]:
            if idx == len(self.strides[self.mode]) or x[0] == 'P':
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False),
                           nn.Dropout2d(p=0.00)]
            if x[0] != 'C':
                continue
            filters = round(x[1] * scale)
            if idx < len(self.strides[self.mode]):
                stride = self.strides[self.mode][idx]
            else:
                stride = 1
            if idx in (len(self.strides[self.mode])-1, 9, 12):
                layers += [nn.Conv2d(input_channel, filters, kernel_size=[3, 3], stride=(stride, stride), padding=(1, 1)),
                           nn.BatchNorm2d(filters, eps=1e-05, momentum=0.05, affine=True),
                           nn.ReLU(inplace=True)]
            else:
                layers += [nn.Conv2d(input_channel, filters, kernel_size=[3, 3], stride=(stride, stride), padding=(1, 1)),
                           nn.BatchNorm2d(filters, eps=1e-05, momentum=0.05, affine=True),
                           nn.ReLU(inplace=True),
                           nn.Dropout2d(p=0.000)]
            input_channel = filters
            idx += 1

        model = nn.Sequential(*layers)
        print(model)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
        return model

And in case it helps, here is the same network written out explicitly (currently commented out):

    # def _make_layers(self, scale=1):

    #     model = nn.Sequential(
    #                          nn.Conv2d(3, round(66*scale), kernel_size=[3, 3], stride=(2, 2), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(66*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.000),

    #                          nn.Conv2d(round(66*scale), round(128*scale), kernel_size=[3, 3], stride=(2, 2), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(128*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.000),

    #                          nn.Conv2d(round(128*scale), round(128*scale), kernel_size=[3, 3], stride=(2, 2), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(128*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.000),

    #                          nn.Conv2d(round(128*scale), round(128*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(128*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.000),

    #                          nn.Conv2d(round(128*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),


    #                          nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False),
    #                          nn.Dropout2d(p=0.00),


    #                          nn.Conv2d(round(192*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.0),

    #                          nn.Conv2d(round(192*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.0),

    #                          nn.Conv2d(round(192*scale),round( 192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.0),

    #                          nn.Conv2d(round(192*scale), round(192*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(192*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.0),

    #                          nn.Conv2d(round(192*scale), round(288*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(288*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),


    #                          nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False),
    #                          nn.Dropout2d(p=0.00),


    #                          nn.Conv2d(round(288*scale), round(288*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(288*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.01),

    #                          nn.Conv2d(round(288*scale), round(355*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(355*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                          nn.Dropout2d(p=0.01),

    #                          nn.Conv2d(round(355*scale), round(432*scale), kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)),
    #                          nn.BatchNorm2d(round(432*scale), eps=1e-05, momentum=0.05, affine=True),
    #                          nn.ReLU(inplace=True),
    #                         )

    #     for m in self.modules():
    #         if isinstance(m, nn.Conv2d):
    #             nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
    #     return model

This is the gpustat output on my system:

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,
GPU_IDs=0,1,2,3,4,5

(pytorch3.5) shisho@sama:~/MyProjects/shishosama/SimpleNetV2_Pytorch$ gpustat
sama  Sat Dec 22 14:37:35 2018
[0] TITAN Xp         | 36'C,   0 % |     0 / 12196 MB |
[1] TITAN Xp         | 29'C,   0 % |     0 / 12196 MB |
[2] TITAN Xp         | 33'C,   0 % |     0 / 12196 MB |
[3] TITAN Xp         | 27'C,   0 % |     0 / 12196 MB |
[4] TITAN Xp         | 21'C,   0 % |     0 / 12196 MB |
[5] TITAN Xp         | 24'C,   0 % |     0 / 12196 MB |
[6] Quadro P6000     | 22'C,   0 % |     0 / 24449 MB |
[7] Quadro P6000     | 20'C,   0 % |     0 / 24449 MB |

And this is the full error log:

(pytorch3.5) shisho@sama:~/MyProjects/shishosama/SimpleNetV2_Pytorch$ ./training_sequence.sh
device is  cuda:0
=> creating model 'simpnet_imgnet_drpall_s2_1.0'
Sequential(
  (0): Conv2d(3, 66, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
  (1): BatchNorm2d(66, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (2): ReLU(inplace)
  (3): Dropout2d(p=0.0)
  (4): Conv2d(66, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
  (5): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (6): ReLU(inplace)
  (7): Dropout2d(p=0.0)
  (8): Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (9): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (10): ReLU(inplace)
  (11): Dropout2d(p=0.0)
  (12): Conv2d(128, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
  (13): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (14): ReLU(inplace)
  (15): Dropout2d(p=0.0)
  (16): Conv2d(128, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (17): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (18): ReLU(inplace)
  (19): Dropout2d(p=0.0)
  (20): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (21): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (22): ReLU(inplace)
  (23): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
  (24): Dropout2d(p=0.0)
  (25): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (26): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (27): ReLU(inplace)
  (28): Dropout2d(p=0.0)
  (29): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (30): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (31): ReLU(inplace)
  (32): Dropout2d(p=0.0)
  (33): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (34): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (35): ReLU(inplace)
  (36): Dropout2d(p=0.0)
  (37): Conv2d(192, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (38): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (39): ReLU(inplace)
  (40): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
  (41): Dropout2d(p=0.0)
  (42): Conv2d(288, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (43): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (44): ReLU(inplace)
  (45): Dropout2d(p=0.0)
  (46): Conv2d(288, 355, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (47): BatchNorm2d(355, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (48): ReLU(inplace)
  (49): Dropout2d(p=0.0)
  (50): Conv2d(355, 432, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (51): BatchNorm2d(432, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (52): ReLU(inplace)
)
=> Model : simpnet_imgnet_drpall(
  (features): Sequential(
    (0): Conv2d(3, 66, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
    (1): BatchNorm2d(66, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): Dropout2d(p=0.0)
    (4): Conv2d(66, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (6): ReLU(inplace)
    (7): Dropout2d(p=0.0)
    (8): Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (10): ReLU(inplace)
    (11): Dropout2d(p=0.0)
    (12): Conv2d(128, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
    (13): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (14): ReLU(inplace)
    (15): Dropout2d(p=0.0)
    (16): Conv2d(128, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (17): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (18): ReLU(inplace)
    (19): Dropout2d(p=0.0)
    (20): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (21): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (22): ReLU(inplace)
    (23): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
    (24): Dropout2d(p=0.0)
    (25): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (26): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (27): ReLU(inplace)
    (28): Dropout2d(p=0.0)
    (29): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (30): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (31): ReLU(inplace)
    (32): Dropout2d(p=0.0)
    (33): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (34): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (35): ReLU(inplace)
    (36): Dropout2d(p=0.0)
    (37): Conv2d(192, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (38): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (39): ReLU(inplace)
    (40): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
    (41): Dropout2d(p=0.0)
    (42): Conv2d(288, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (43): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (44): ReLU(inplace)
    (45): Dropout2d(p=0.0)
    (46): Conv2d(288, 355, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (47): BatchNorm2d(355, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (48): ReLU(inplace)
    (49): Dropout2d(p=0.0)
    (50): Conv2d(355, 432, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
    (51): BatchNorm2d(432, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
    (52): ReLU(inplace)
  )
  (classifier): Linear(in_features=432, out_features=1000, bias=True)
)
=> parameter : Namespace(arch='simpnet_imgnet_drpall', batch_size=6, data='/home/shisho/DATASETS/imagenet', epochs=900, evaluate=False, gpus=[0, 1, 2, 3, 4, 5], lr=0.045, momentum=0.9, netidx=0, prefix='2018-12-22-909', print_freq=200, resume='', save_dir='./snapshots/imagenet/simplenetv2s/5mil', scale=1.0, smode=2, start_epoch=0, train_dir_name='train/', val_dir_name='val/', weight_decay=4e-05, workers=24)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
excessive contents removed....
================================================================
Total params: 11,813,786
Trainable params: 11,813,786
Non-trainable params: 0
----------------------------------------------------------------
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
excessive contents removed....
================================================================
Total params: 11,813,786
Trainable params: 11,813,786
Non-trainable params: 0
----------------------------------------------------------------
=> optimizer : RMSprop (
Parameter Group 0
    alpha: 0.9
    centered: False
    eps: 1
    lr: 0.045
    momentum: 0.9
    weight_decay: 4e-05
)'
FLOPs: 1716.95M, Params: 5.91M

==>>[2018-12-22 14:37:59] [Epoch=000/900] [Need: 00:00:00] [learning_rate=0.045918] [Best : Accuracy(T1/T5)=0.00/0.00, Error=100.00/100.00]
Traceback (most recent call last):
  File "imagenet_train.py", line 656, in <module>
    main()
  File "imagenet_train.py", line 287, in main
    tr_prec1, tr_prec5, tr_loss = train(train_loader, model, criterion, optimizer, epoch, log)
  File "imagenet_train.py", line 365, in train
    output = model(input_var)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 143, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 153, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply
    raise output
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker
    output = module(*input, **kwargs)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shishosama/MyProjects/SeyyedHossein/SimpleNetV2_Pytorch/models/simpnet_imgnet_drpall.py", line 67, in forward
    out = self.features(x)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/container.py", line 92, in forward
    input = module(input)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shishosama/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 320, in forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)

The contents of training_sequence.sh are as follows:

#SimpleNetV2-5Mil

NETWORK_IDX=0
SMODE=2
SCALE=1.0

SAVE_DIR=./snapshots/imagenet/simplenetv2s/5mil
CHECKPOINT=./snapshots/imagenet/simplenetv2s/5mil/chkpt_simplenet_imgnet_here!

# testing mode 1 and 2 for 5mil arch
for (( i=2; i >= 1; i-- ))
do
python imagenet_train.py $IMAGENET_DIR --train_dir_name $TRAINING_DIR --val_dir_name $VAL_DIR --arch $MODEL_NAME \
--save_dir $SAVE_DIR -j $WORKER --epochs $EPOCHS --batch-size $BATCH_SIZE --netidx $NETWORK_IDX --scale $SCALE --smode $i --gpus $GPU_IDs #--resume $CHECKPOINT  
done


#SimpleNetV2-8Mil
NETWORK_IDX=1
SMODE=1
SCALE=1.0

SAVE_DIR=./snapshots/imagenet/simplenetv2s/8mil
CHECKPOINT=./snapshots/imagenet/simplenetv2s/8mil/chkpt_simplenet_imgnet_here!

python imagenet_train.py $IMAGENET_DIR --train_dir_name $TRAINING_DIR --val_dir_name $VAL_DIR --arch $MODEL_NAME \
--save_dir $SAVE_DIR -j $WORKER --epochs $EPOCHS --batch-size $BATCH_SIZE --netidx $NETWORK_IDX --scale $SCALE --smode $SMODE --gpus $GPU_IDs #--resume $CHECKPOINT

And finally, this is the training script I'm using for this:

What am I missing here?
Any help is greatly appreciated.

I found an unrelated bug:

model = nn.Sequential(*layers)
print(model)
for m in self.modules():
    if isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))

Here, self.modules() doesn't contain your model yet (the Sequential hasn't been assigned to self.features at this point), so you are not actually applying the Xavier uniform initialization to the layers in model.

About your original question: this seems strange and looks like it shouldn't happen, since your model definition looks fine.

I suspect that your load_my_state_dict is doing something funky, though I can't pin it down yet.
If you don't load from a state_dict and start your model with random weights, does the error still happen?


Thanks a lot for pointing that out.
I hit this error without using any pretrained model; it happens with random weights (training from scratch). Actually, I never use load_my_state_dict to load weights; I should have removed it earlier to prevent any confusion.

For the initialization part, is this alright to do:

def init_weights(m):
    if type(m) == nn.Conv2d:
        nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
        
model.apply(init_weights)

or does this simply do it:

for m in model.modules():
    if isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
return model

Either of those weight initialization code snippets should work.

I tried to reproduce your issue locally, so I created a minimal script that wraps your model in DataParallel and runs it. So far I've had no luck reproducing the error. Here's the script I ran: https://gist.github.com/f7e9afbdc561a2cfa9a2c5bdf443aa8b
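Roughly, the script just builds the model, wraps it in nn.DataParallel, and pushes a random batch through it. The gist has the exact version; this is only a sketch of the shape it takes (the import path is an assumption):

import torch
import torch.nn as nn
from models.simpnet_imgnet_drpall import simpnet_imgnet_drpall  # import path assumed

# Build the model on the default GPU; DataParallel replicates it on all visible GPUs.
model = simpnet_imgnet_drpall(classes=1000).cuda()
model = nn.DataParallel(model)

# Push a dummy ImageNet-sized batch through it.
x = torch.randn(16, 3, 224, 224).cuda()
out = model(x)
print(out.shape)  # expected: torch.Size([16, 1000])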

I’d suggest trying to minimize the problem until it shows up, maybe?

Thanks, so that snippet worked just fine for you, right?
Could this be due to the different GPUs I have (TITAN Xp and Quadro P6000)?

I'm 100% sure this is not related to the different GPUs.
Did the snippet fail for you?
If so, I'd ask you to double-check your PyTorch version with print(torch.__version__); is it 1.0.0?
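If it helps, a quick way to dump the relevant environment info (these are all standard torch calls):

import torch

print(torch.__version__)                # PyTorch version, e.g. '1.0.0'
print(torch.version.cuda)               # CUDA version PyTorch was built against
print(torch.backends.cudnn.version())   # cuDNN version
print(torch.cuda.device_count())        # number of GPUs PyTorch can see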

Sorry for the delay. I couldn't test it on the main system so far, but I did test it on my personal system, and it ran just fine! (The main script fails on this system as well.)
On my personal system I have torch 0.4.1, a TITAN Xp and a GTX 1060.
Here is the log:

(pytorch-py3) shisho@shisho:~/Myprojects/systempr$ nvidia-smi
Wed Dec 26 12:12:45 2018      
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 106...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   51C    P0    23W /  N/A |    840MiB /  6078MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:0C:00.0 Off |                  N/A |
| 23%   30C    P0    54W / 250W |      0MiB / 12196MiB |      4%      Default |
+-------------------------------+----------------------+----------------------+
                                                                              
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1392      G   /usr/lib/xorg/Xorg                           663MiB |
|    0      2314      G   compiz                                       174MiB |
+-----------------------------------------------------------------------------+
(pytorch-py3) shisho@shisho:~/Myprojects/systempr$ python foo.py
Sequential(
  (0): Conv2d(3, 66, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
  (1): BatchNorm2d(66, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (2): ReLU(inplace)
  (3): Dropout2d(p=0.0)
  (4): Conv2d(66, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
  (5): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (6): ReLU(inplace)
  (7): Dropout2d(p=0.0)
  (8): Conv2d(128, 128, kernel_size=[3, 3], stride=(2, 2), padding=(1, 1))
  (9): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (10): ReLU(inplace)
  (11): Dropout2d(p=0.0)
  (12): Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (13): BatchNorm2d(128, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (14): ReLU(inplace)
  (15): Dropout2d(p=0.0)
  (16): Conv2d(128, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (17): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (18): ReLU(inplace)
  (19): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
  (20): Dropout2d(p=0.0)
  (21): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (22): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (23): ReLU(inplace)
  (24): Dropout2d(p=0.0)
  (25): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (26): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (27): ReLU(inplace)
  (28): Dropout2d(p=0.0)
  (29): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (30): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (31): ReLU(inplace)
  (32): Dropout2d(p=0.0)
  (33): Conv2d(192, 192, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (34): BatchNorm2d(192, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (35): ReLU(inplace)
  (36): Dropout2d(p=0.0)
  (37): Conv2d(192, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (38): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (39): ReLU(inplace)
  (40): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=(1, 1), ceil_mode=False)
  (41): Dropout2d(p=0.0)
  (42): Conv2d(288, 288, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (43): BatchNorm2d(288, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (44): ReLU(inplace)
  (45): Dropout2d(p=0.0)
  (46): Conv2d(288, 355, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (47): BatchNorm2d(355, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (48): ReLU(inplace)
  (49): Dropout2d(p=0.0)
  (50): Conv2d(355, 432, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1))
  (51): BatchNorm2d(432, eps=1e-05, momentum=0.05, affine=True, track_running_stats=True)
  (52): ReLU(inplace)
)
/home/shisho/.virtualenvs/pytorch-py3/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py:24: UserWarning:
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
  warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))
(pytorch-py3) shisho@shisho:~/Myprojects/systempr$

And here is the system info in case it's needed:

shisho@shisho:~$ nvidia-smi
Mon Dec 24 19:55:31 2018      
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 106...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P2    23W /  N/A |   1350MiB /  6078MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:0C:00.0 Off |                  N/A |
| 24%   44C    P2    58W / 250W |    567MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                              
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0      1630      G   /usr/lib/xorg/Xorg                           716MiB |
|    0      2634      G   compiz                                       157MiB |
|    0      3130      G   /usr/lib/firefox/firefox                       1MiB |
|    0     18774      C   python                                       471MiB |
|    1     18774      C   python                                       557MiB |
+-----------------------------------------------------------------------------+

(pytorch-py3) shisho@shisho:~/Myprojects/systempr/SimpleNetV2_Pytorch$ python
Python 3.5.2 (default, Nov 23 2017, 16:37:01)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.__version__
'0.4.1'
>>>

(pytorch-py3) shisho@shisho:~/Myprojects/systempr/SimpleNetV2_Pytorch$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Tue_Jun_12_23:07:04_CDT_2018
Cuda compilation tools, release 9.2, V9.2.148


(pytorch-py3) shisho@shisho:~/Myprojects/systempr/SimpleNetV2_Pytorch$ echo $PATH
/home/shisho/.virtualenvs/pytorch-py3/bin:/home/shisho/bin:/home/shisho/.local/bin:/usr/local/cuda-9.2/bin:/usr/local/MATLAB/R2018a/bin:/usr/local/cuda-9.2/bin:/usr/local/MATLAB/R2018a/bin:/home/shisho/bin:/home/shisho/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
(pytorch-py3) shisho@shisho:~/Myprojects/systempr/SimpleNetV2_Pytorch$ echo $LD_LIBRARY_PATH

Update:
I tried resnet18 and hit the same exact error on the main system, so this is not a problem with the model; the training script seems to be doing something weird here.
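For reference, this is the standard nn.DataParallel pattern I'm checking the training script against. It's only a rough, self-contained sketch with hypothetical values (the import path, device list, batch size and hyper-parameters are assumptions, not what imagenet_train.py actually does):

import torch
import torch.nn as nn
from models.simpnet_imgnet_drpall import simpnet_imgnet_drpall  # import path assumed

device_ids = [0, 1, 2, 3, 4, 5]                           # hypothetical; should match --gpus
device = torch.device('cuda:{}'.format(device_ids[0]))

model = simpnet_imgnet_drpall(classes=1000).to(device)    # parameters must live on device_ids[0]
model = nn.DataParallel(model, device_ids=device_ids)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.045, momentum=0.9)

# One dummy step instead of the real train_loader, just to exercise forward/backward.
input = torch.randn(12, 3, 224, 224).to(device, non_blocking=True)
target = torch.randint(0, 1000, (12,), dtype=torch.long).to(device)
output = model(input)                                     # DataParallel scatters the batch across the GPUs
loss = criterion(output, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()

DataParallel expects the wrapped module's parameters and buffers to already be on device_ids[0] when forward is called, and the input batch is normally moved to that same device, so those are the two places in imagenet_train.py I'm going to compare against this sketch.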
Here is the log for resnet18:

(pytorch3.5) shisho@o2:~/MyProjects/systempr/SimpleNetV2_Pytorch_3$ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 bash training_sequence.sh
=> creating model 'resnet18'
=> Python version : 3.5.2 (default, Nov 12 2018, 13:43:14)
=> Pytorch version : 1.0.0
=> CUDA Availability : True
=> CUDA version : 9.0.176
=> CuDNN version : 7401
=> Initial cuda seed : 5652157825403788
=> Available GPUs : 6
    => GPU(0): TITAN X (Pascal) (6,1) Total memory: 12189MB Multi processor count: 28
    => GPU(1): TITAN X (Pascal) (6,1) Total memory: 12189MB Multi processor count: 28
    => GPU(2): TITAN X (Pascal) (6,1) Total memory: 12189MB Multi processor count: 28
    => GPU(3): TITAN X (Pascal) (6,1) Total memory: 12189MB Multi processor count: 28
    => GPU(4): TITAN X (Pascal) (6,1) Total memory: 12189MB Multi processor count: 28
    => GPU(5): TITAN X (Pascal) (6,1) Total memory: 12189MB Multi processor count: 28
=> parameter : Namespace(arch='resnet18', batch_size=256, data='/home/shisho/DATASETS/tiny-imagenet-200', epochs=900, evaluate=False, lr=0.045, momentum=0.9, netidx=0, prefix='2018-12-27-5831', print_freq=200, resume='', save_dir='./snapshots/imagenet/', scale=0.0, smode=1, start_epoch=0, train_dir_name='train/', val_dir_name='val/', weight_decay=4e-05, workers=24)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 112, 112]           9,408
            Conv2d-2         [-1, 64, 112, 112]           9,408
       BatchNorm2d-3         [-1, 64, 112, 112]             128
       BatchNorm2d-4         [-1, 64, 112, 112]             128
              ReLU-5         [-1, 64, 112, 112]               0
              ReLU-6         [-1, 64, 112, 112]               0
         MaxPool2d-7           [-1, 64, 56, 56]               0
         MaxPool2d-8           [-1, 64, 56, 56]               0
            Conv2d-9           [-1, 64, 56, 56]          36,864
           Conv2d-10           [-1, 64, 56, 56]          36,864
      BatchNorm2d-11           [-1, 64, 56, 56]             128
      BatchNorm2d-12           [-1, 64, 56, 56]             128
             ReLU-13           [-1, 64, 56, 56]               0
             ReLU-14           [-1, 64, 56, 56]               0
           Conv2d-15           [-1, 64, 56, 56]          36,864
           Conv2d-16           [-1, 64, 56, 56]          36,864
      BatchNorm2d-17           [-1, 64, 56, 56]             128
      BatchNorm2d-18           [-1, 64, 56, 56]             128
             ReLU-19           [-1, 64, 56, 56]               0
       BasicBlock-20           [-1, 64, 56, 56]               0
             ReLU-21           [-1, 64, 56, 56]               0
       BasicBlock-22           [-1, 64, 56, 56]               0
           Conv2d-23           [-1, 64, 56, 56]          36,864
           Conv2d-24           [-1, 64, 56, 56]          36,864
      BatchNorm2d-25           [-1, 64, 56, 56]             128
      BatchNorm2d-26           [-1, 64, 56, 56]             128
             ReLU-27           [-1, 64, 56, 56]               0
             ReLU-28           [-1, 64, 56, 56]               0
           Conv2d-29           [-1, 64, 56, 56]          36,864
           Conv2d-30           [-1, 64, 56, 56]          36,864
      BatchNorm2d-31           [-1, 64, 56, 56]             128
      BatchNorm2d-32           [-1, 64, 56, 56]             128
             ReLU-33           [-1, 64, 56, 56]               0
       BasicBlock-34           [-1, 64, 56, 56]               0
             ReLU-35           [-1, 64, 56, 56]               0
       BasicBlock-36           [-1, 64, 56, 56]               0
           Conv2d-37          [-1, 128, 28, 28]          73,728
           Conv2d-38          [-1, 128, 28, 28]          73,728
      BatchNorm2d-39          [-1, 128, 28, 28]             256
             ReLU-40          [-1, 128, 28, 28]               0
      BatchNorm2d-41          [-1, 128, 28, 28]             256
           Conv2d-42          [-1, 128, 28, 28]         147,456
             ReLU-43          [-1, 128, 28, 28]               0
      BatchNorm2d-44          [-1, 128, 28, 28]             256
           Conv2d-45          [-1, 128, 28, 28]         147,456
           Conv2d-46          [-1, 128, 28, 28]           8,192
      BatchNorm2d-47          [-1, 128, 28, 28]             256
      BatchNorm2d-48          [-1, 128, 28, 28]             256
           Conv2d-49          [-1, 128, 28, 28]           8,192
             ReLU-50          [-1, 128, 28, 28]               0
       BasicBlock-51          [-1, 128, 28, 28]               0
           Conv2d-52          [-1, 128, 28, 28]         147,456
      BatchNorm2d-53          [-1, 128, 28, 28]             256
      BatchNorm2d-54          [-1, 128, 28, 28]             256
             ReLU-55          [-1, 128, 28, 28]               0
             ReLU-56          [-1, 128, 28, 28]               0
       BasicBlock-57          [-1, 128, 28, 28]               0
           Conv2d-58          [-1, 128, 28, 28]         147,456
           Conv2d-59          [-1, 128, 28, 28]         147,456
      BatchNorm2d-60          [-1, 128, 28, 28]             256
      BatchNorm2d-61          [-1, 128, 28, 28]             256
             ReLU-62          [-1, 128, 28, 28]               0
             ReLU-63          [-1, 128, 28, 28]               0
       BasicBlock-64          [-1, 128, 28, 28]               0
           Conv2d-65          [-1, 128, 28, 28]         147,456
           Conv2d-66          [-1, 256, 14, 14]         294,912
      BatchNorm2d-67          [-1, 128, 28, 28]             256
      BatchNorm2d-68          [-1, 256, 14, 14]             512
             ReLU-69          [-1, 128, 28, 28]               0
       BasicBlock-70          [-1, 128, 28, 28]               0
             ReLU-71          [-1, 256, 14, 14]               0
           Conv2d-72          [-1, 256, 14, 14]         589,824
           Conv2d-73          [-1, 256, 14, 14]         294,912
      BatchNorm2d-74          [-1, 256, 14, 14]             512
      BatchNorm2d-75          [-1, 256, 14, 14]             512
           Conv2d-76          [-1, 256, 14, 14]          32,768
             ReLU-77          [-1, 256, 14, 14]               0
      BatchNorm2d-78          [-1, 256, 14, 14]             512
           Conv2d-79          [-1, 256, 14, 14]         589,824
             ReLU-80          [-1, 256, 14, 14]               0
       BasicBlock-81          [-1, 256, 14, 14]               0
      BatchNorm2d-82          [-1, 256, 14, 14]             512
           Conv2d-83          [-1, 256, 14, 14]         589,824
           Conv2d-84          [-1, 256, 14, 14]          32,768
      BatchNorm2d-85          [-1, 256, 14, 14]             512
             ReLU-86          [-1, 256, 14, 14]               0
      BatchNorm2d-87          [-1, 256, 14, 14]             512
           Conv2d-88          [-1, 256, 14, 14]         589,824
             ReLU-89          [-1, 256, 14, 14]               0
       BasicBlock-90          [-1, 256, 14, 14]               0
           Conv2d-91          [-1, 256, 14, 14]         589,824
      BatchNorm2d-92          [-1, 256, 14, 14]             512
      BatchNorm2d-93          [-1, 256, 14, 14]             512
             ReLU-94          [-1, 256, 14, 14]               0
       BasicBlock-95          [-1, 256, 14, 14]               0
             ReLU-96          [-1, 256, 14, 14]               0
           Conv2d-97            [-1, 512, 7, 7]       1,179,648
           Conv2d-98          [-1, 256, 14, 14]         589,824
      BatchNorm2d-99            [-1, 512, 7, 7]           1,024
     BatchNorm2d-100          [-1, 256, 14, 14]             512
            ReLU-101            [-1, 512, 7, 7]               0
            ReLU-102          [-1, 256, 14, 14]               0
      BasicBlock-103          [-1, 256, 14, 14]               0
          Conv2d-104            [-1, 512, 7, 7]       2,359,296
          Conv2d-105            [-1, 512, 7, 7]       1,179,648
     BatchNorm2d-106            [-1, 512, 7, 7]           1,024
     BatchNorm2d-107            [-1, 512, 7, 7]           1,024
            ReLU-108            [-1, 512, 7, 7]               0
          Conv2d-109            [-1, 512, 7, 7]         131,072
     BatchNorm2d-110            [-1, 512, 7, 7]           1,024
          Conv2d-111            [-1, 512, 7, 7]       2,359,296
            ReLU-112            [-1, 512, 7, 7]               0
      BasicBlock-113            [-1, 512, 7, 7]               0
     BatchNorm2d-114            [-1, 512, 7, 7]           1,024
          Conv2d-115            [-1, 512, 7, 7]       2,359,296
          Conv2d-116            [-1, 512, 7, 7]         131,072
     BatchNorm2d-117            [-1, 512, 7, 7]           1,024
            ReLU-118            [-1, 512, 7, 7]               0
     BatchNorm2d-119            [-1, 512, 7, 7]           1,024
          Conv2d-120            [-1, 512, 7, 7]       2,359,296
            ReLU-121            [-1, 512, 7, 7]               0
      BasicBlock-122            [-1, 512, 7, 7]               0
     BatchNorm2d-123            [-1, 512, 7, 7]           1,024
          Conv2d-124            [-1, 512, 7, 7]       2,359,296
            ReLU-125            [-1, 512, 7, 7]               0
      BasicBlock-126            [-1, 512, 7, 7]               0
       AvgPool2d-127            [-1, 512, 1, 1]               0
     BatchNorm2d-128            [-1, 512, 7, 7]           1,024
            ReLU-129            [-1, 512, 7, 7]               0
          Conv2d-130            [-1, 512, 7, 7]       2,359,296
     BatchNorm2d-131            [-1, 512, 7, 7]           1,024
            ReLU-132            [-1, 512, 7, 7]               0
      BasicBlock-133            [-1, 512, 7, 7]               0
          Linear-134                 [-1, 1000]         513,000
       AvgPool2d-135            [-1, 512, 1, 1]               0
          ResNet-136                 [-1, 1000]               0
          Linear-137                 [-1, 1000]         513,000
          ResNet-138                 [-1, 1000]               0
================================================================
Total params: 23,379,024
Trainable params: 23,379,024
Non-trainable params: 0
----------------------------------------------------------------
=> optimizer : RMSprop (
Parameter Group 0
    alpha: 0.9
    centered: False
    eps: 1
    lr: 0.045
    momentum: 0.9
    weight_decay: 4e-05
)'
FLOPs: 1816.41M, Params: 11.69M

==>>[2018-12-27 12:29:02] [Epoch=000/900] [Need: 00:00:00] [learning_rate=0.045918] [Best : Accuracy(T1/T5)=0.00/0.00, Error=100.00/100.00]
Traceback (most recent call last):
  File "imagenet_train.py", line 657, in <module>
    main()
  File "imagenet_train.py", line 288, in main
    tr_prec1, tr_prec5, tr_loss = train(train_loader, model, criterion, optimizer, epoch, log)
  File "imagenet_train.py", line 366, in train
    output = model(input_var)
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 143, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/data_parallel.py", line 153, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply
    raise output
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker
    output = module(*input, **kwargs)
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shisho/MyProjects/systempr/SimpleNetV2_Pytorch_3/models/imagenet_resnet.py", line 125, in forward
    x = self.conv1(x)

  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/shisho/.virtualenvs/pytorch3.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 320, in forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)