Type mismatch on model when using GPU

Hello, I am writing a small PyTorch example with a simple NN. The program runs fine if I declare

dtype = torch.FloatTensor
#dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

The code currently runs great with the CPU option. However, as soon as I uncomment and switch to the GPU option, the code crashes when I try to run forward on a model

y_estimate = NN(x)				# Forward pass

With the following error :
  File "./test.py", line 44, in <module>
    y_estimate = NN(x) # Forward pass
  File "/home/chieh/App/miniconda/lib/python2.7/site-packages/torch/nn/modules/module.py", line 325, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/chieh/App/miniconda/lib/python2.7/site-packages/torch/nn/modules/container.py", line 67, in forward
    input = module(input)
  File "/home/chieh/App/miniconda/lib/python2.7/site-packages/torch/nn/modules/module.py", line 325, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/chieh/App/miniconda/lib/python2.7/site-packages/torch/nn/modules/linear.py", line 55, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/chieh/App/miniconda/lib/python2.7/site-packages/torch/nn/functional.py", line 835, in linear
    return torch.addmm(bias, input, weight.t())
RuntimeError: Expected object of type Variable[torch.FloatTensor] but found type Variable[torch.cuda.FloatTensor] for argument #1 'mat1'

I guess I just need to change the input type for the model, but I'm not sure how to do it. Any help would be great. Here is the full code

Thank you.

1 Like

Your network is still on the CPU. Add NN = NN.cuda() before the forward pass.

11 Likes

yep, that was it, thank you

Hi guys, I am encountering a similar issue:

RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 ‘weight’

I made sure that my model is on the GPU by invoking model = model.cuda(), and I still get the above error message. Any hint as to why this is happening is greatly appreciated.

Best

1 Like

Could you post your model and training script?

Sure,
here is the function that modifies a pretrained ResNet to output two classes. I couldn’t get the indentation to work here, so it will be somewhat harder to read.

def get_modified_pretrained_model(name):
    """Return a pretrained ResNet whose final layer outputs 2 classes.

    Args:
        name: Architecture to load; one of 'resnet18', 'resnet34',
            'resnet50', 'resnet101' or 'resnet152'.

    Returns:
        A ``(net, describe_model)`` tuple: the modified network and a
        human-readable description string.

    Raises:
        ValueError: If ``name`` is not one of the supported architectures.
    """
    supported = ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152')
    if name not in supported:
        # The original if-chain silently fell through here and then failed
        # later on an unbound `net`; fail early with a clear message instead.
        raise ValueError(
            'Unsupported model name {!r}; expected one of {}'.format(
                name, ', '.join(supported)))

    describe_model = 'Basic ' + name + ' that outputs 2 rather than 1000 classes!'

    # Every supported name is also the name of the constructor in `models`.
    net = getattr(models, name)(pretrained=True)

    # Replace the 1000-way classifier head with a 2-way one.
    num_ftrs = net.fc.in_features
    net.fc = nn.Sequential(
        nn.Linear(num_ftrs, 2)
    )
    return net, describe_model

here is the training protocol:

def training_protocol(model):
    """Build the loss, the optimizer and a description of the training setup.

    Args:
        model: Module whose parameters are all handed to the SGD optimizer.

    Returns:
        A ``(criterion, optimizer_ft, describe_training_protocol)`` tuple:
        a cross-entropy loss on which ``.cuda()`` has been called, an SGD
        optimizer over ``model.parameters()`` and a description string.
    """
    describe_training_protocol = 'Modified training_protocol with nn.CrossEntropyLoss(), optim.SGD(model_ft.parameters(), lr = 0.0001, momentum=0.9, weight_decay = 0.00001)'

    criterion = nn.CrossEntropyLoss().cuda()

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9, weight_decay=0.00001)
    return criterion, optimizer_ft, describe_training_protocol

here is the training function:

def train_model(dataloaders, dataset_sizes, model, criterion, optimizer, num_epochs = 10, temp_save_name = None):
    """Train ``model`` on the GPU and return it with its best weights loaded.

    Args:
        dataloaders: Mapping with 'train' and 'val' entries; iterating the
            'train' entry yields ``(input, target)`` batches.
        dataset_sizes: Passed through unchanged to the reporting helpers.
        model: Network to train; it is moved to the GPU with ``.cuda()``.
        criterion: Loss called as ``criterion(output, target_var)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of epochs to run.
        temp_save_name: If not None, the whole model is saved to
            ``temp_save_name + '_.pth'`` whenever the validation log loss
            improves.

    Returns:
        ``model`` with the weights of the best-scoring epoch loaded.
    """
    import copy  # function-scope import keeps the snippet self-contained

    since = time.time()

    # state_dict() returns references to the live parameter tensors, so a
    # real snapshot needs a deep copy; otherwise "best" always equals "latest".
    best_model_wts = copy.deepcopy(model.state_dict())
    best_log_loss = 1
    model = model.cuda()
    for epoch in range(1, num_epochs+1):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('*' * 10)

        # Each epoch has a training and validation phase
        model.train(True)  # Set model to training mode
        # Iterate over data.
        for i, (input, target) in enumerate(dataloaders['train']):

            # `non_blocking` is the current spelling of the keyword that
            # PyTorch 0.3 called `async` (a reserved word since Python 3.7).
            target = target.cuda(non_blocking=True)
            # NOTE: `input` is wrapped here without being moved to the GPU,
            # while the model was moved with model.cuda() above. This is the
            # CPU/CUDA mismatch reported in the forward pass; it is left as
            # posted because it is the subject of the question.
            input_var = torch.autograd.Variable(input)
            target_var = torch.autograd.Variable(target)

            # compute output
            output = model(input_var)
            loss = criterion(output, target_var)

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # epoch statistics
        print_training_set_performance(dataset_sizes, dataloaders['train'], model)
        print('----------')
        epoch_sk_log_loss = print_val_set_performance(dataset_sizes, dataloaders['val'], model)
        # deep copy the model if better logloss
        if epoch_sk_log_loss < best_log_loss:
            best_log_loss = epoch_sk_log_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            if temp_save_name is not None:
                print('model is saved after epcoh ' + str(epoch))
                name = temp_save_name + '_' + '.pth'
                torch.save(model, name)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val logloss: {:4f}'.format(best_log_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

here is the full error message:

Traceback (most recent call last):
  File "finetunacuda.py", line 322, in <module>
    main()
  File "finetunacuda.py", line 320, in main
    fine_tuna_protocol()
  File "finetunacuda.py", line 298, in fine_tuna_protocol
    model_ft = train_model(dataloaders, dataset_sizes, model_ft, criterion, optimizer_ft, num_epochs = nep, temp_save_name = name_of_results_output_txt_file)
  File "finetunacuda.py", line 238, in train_model
    output = model(input_var)
  File "/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/modules/module.py", line 325, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torchvision/models/resnet.py", line 139, in forward
    x = self.conv1(x)
  File "/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/modules/module.py", line 325, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 277, in forward
    self.padding, self.dilation, self.groups)
  File "/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/functional.py", line 90, in conv2d
    return f(input, weight, bias)
RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 'weight'

So, as you can see, the issue occurs during the forward pass of the training.
Thanks a lot for the assistance

You should move input to the GPU with .cuda(), in the same way as you did for target :slight_smile:

Hi Simon, thanks a lot for the reply. I totally agree with you, but first, I am trying to reproduce the official ImageNet example: https://github.com/pytorch/examples/blob/master/imagenet/main.py

Second, when I changed the iteration part in the train_model function as follows per your feedback:

    # Iterate over data.
    for i, (inputs, target) in enumerate(dataloaders['train']):

        # Move both the targets and the inputs to the GPU before wrapping them.
        target = target.cuda(async=True)
        inputs = inputs.cuda(async=True)
        # Wrap the CUDA tensors in autograd Variables for the forward pass.
        input_var = torch.autograd.Variable(inputs)
        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Now, I get a new error:

Traceback (most recent call last):
File “finetunacuda.py”, line 299, in
main()
File “finetunacuda.py”, line 297, in main
fine_tuna_protocol()
File “finetunacuda.py”, line 275, in fine_tuna_protocol
model_ft = train_model(dataloaders, dataset_sizes, model_ft, criterion, optimizer_ft, num_epochs = nep, temp_save_name = name_of_results_output_txt_file)
File “finetunacuda.py”, line 215, in train_model
output = model(input_var)
File “/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/modules/module.py”, line 325, in call
result = self.forward(*input, **kwargs)
File “/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torchvision/models/resnet.py”, line 142, in forward
x = self.maxpool(x)
File “/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/modules/module.py”, line 325, in call
result = self.forward(*input, **kwargs)
File “/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/modules/pooling.py”, line 143, in forward
self.return_indices)
File “/home/ubuntu/envs/deepL/lib/python3.5/site-packages/torch/nn/functional.py”, line 334, in max_pool2d
ret = torch._C._nn.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
RuntimeError: cuda runtime error (2) : out of memory at /pytorch/torch/lib/THC/generic/THCStorage.cu:58

What do you think? I run this on a p2.xlarge EC2 instance on AWS.

Thanks a lot for the assistance
Hassan

And by the way, the above memory issue occurs even with a batch size of 1! So, I assure you this has nothing to do with the batch size; the inputs are standard 3 x 224 x 224 images.

Do you have cuDNN installed?

torch.cuda.is_available() evaluates to True and torch.__version__ evaluates to
'0.3.0.post4' on the EC2 instance. Does this imply cuDNN is properly installed? I saw somewhere on the forum that if torch is installed from the PyTorch website, it should come with all the necessary CUDA libraries. I used

pip3 install http://download.pytorch.org/whl/cu90/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl

to install the last version of torch on the instance.

Just found out that cuDNN is different from CUDA. I will install cuDNN on the instance and get back with the results. Thanks a lot for the feedback.

Cool! cuDNN’s conv is faster and more efficient. Please try that and let us know the results :slight_smile:

I have a question: doesn’t the PyTorch installation ship with its own CUDA and cuDNN?

The out-of-memory runtime error still persists even after installing CUDA and cuDNN manually. Is there any tutorial that I can follow to make sure I have compatible CUDA / cuDNN / PyTorch / NVIDIA driver versions on an AWS EC2 instance running Ubuntu 16?

Hi everybody,

After getting nowhere trying to set up CUDA and cuDNN on my own, I followed my mentor’s recommendation and switched to the AWS Deep Learning AMI. This instance comes with Python, PyTorch, CUDA and cuDNN preinstalled and preconfigured.

  1. I followed the steps in this tutorial to launch the instance: https://aws.amazon.com/blogs/ai/get-started-with-deep-learning-using-the-aws-deep-learning-ami/

  2. After starting the instance I confirmed everything is properly installed as follows:

  • source activate pytorch_p27
  • python
  • import torch
  • torch.__version__
    ’0.3.0.post4’
  • torch.cuda.is_available()
    True
  • torch.backends.cudnn.version()
    7003

I am still getting the runtime error out of memory trying to finetune resnet18 with a batchsize of 1 and I use images with standard size of 3x224x224:

THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1512378422383/work/torch/lib/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
File “finetunacuda.py”, line 299, in
main()
File “finetunacuda.py”, line 297, in main
fine_tuna_protocol()
File “finetunacuda.py”, line 275, in fine_tuna_protocol
model_ft = train_model(dataloaders, dataset_sizes, model_ft, criterion, optimizer_ft, num_epochs = nep, temp_save_name = name_of_results_output_txt_file)
File “finetunacuda.py”, line 215, in train_model
output = model(input_var)
File “/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/nn/modules/module.py”, line 325, in call
result = self.forward(*input, **kwargs)
File “build/bdist.linux-x86_64/egg/torchvision/models/resnet.py”, line 142, in forward
File “/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/nn/modules/module.py”, line 325, in call
result = self.forward(*input, **kwargs)
File “/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/nn/modules/pooling.py”, line 143, in forward
self.return_indices)
File “/home/ubuntu/anaconda3/envs/pytorch_p27/lib/python2.7/site-packages/torch/nn/functional.py”, line 334, in max_pool2d
ret = torch._C._nn.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1512378422383/work/torch/lib/THC/generic/THCStorage.cu:58

I did everything I got my hands on, right now, I have no idea why I can’t get to run.
Here is the output of nvidia-smi
Wed Jan 3 03:58:23 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81                 Driver Version: 384.81                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   44C    P8    30W / 149W |      1MiB / 11439MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Any feedback is greatly appreciated. I am posting this issue here, in case I couldn’t get any feedback I will have to start a new post.

Thanks

Hi folks, I was able to run the experiments cited above by using a Jupyter notebook on the same instance! This is something I noticed on both my local MacBook Air and the AWS deep learning instance: running the GPU training from a Jupyter notebook works fine, while running the same experiments as python .py scripts leads to the out-of-memory runtime error! Does this hint at a possible memory leak?

Sorry to hijack this post, but is there a way to automatically switch between GPU and CPU depending on the hardware available?
Sometimes I am working on my MacBook, and I am trying to find a way not to modify this part every time.

I found that it is possible to detect GPU availability (combined with a command-line flag), but I don’t know how to apply that to the network.

http://pytorch.org/docs/master/notes/cuda.html

import argparse
import torch

# Let the user opt out of CUDA from the command line with --disable-cuda.
parser = argparse.ArgumentParser(description='PyTorch Example')
parser.add_argument('--disable-cuda', action='store_true', help='Disable CUDA')
args = parser.parse_args()
# Use CUDA only when it was not disabled AND torch reports it as available.
args.cuda = not args.disable_cuda and torch.cuda.is_available()

if args.cuda:
        # Switch the default tensor type to CUDA float tensors.
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        # something like torch.network.cuda() ??   

##Your model below
2 Likes

Currently, you have to build the network before transferring it to the GPU via .cuda(). So there isn’t yet a command that makes subsequent nn.* calls build their layers on CUDA. However, if you keep a collection of modules, it is quite easy to transfer them to the GPU all together after building them.

1 Like

torch.set_default_tensor_type('torch.cuda.FloatTensor')
This worked for me!

3 Likes

Hi Simon:joy:, I am just encountering a very similar issue while I have already put my network on cuda by

model = torch.nn.DataParallel(model).cuda()

and my input and target on cuda by

inputs, targets = inputs.cuda(), targets.cuda(async=True)

while I still have the problem of

RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #4 ‘other’

Does that mean there is still something I forgot? Thank you for taking the time to read my question!

The full error message:

File “cifar.py”, line 347, in
main()
File “cifar.py”, line 202, in main
train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch, use_cuda)
File “cifar.py”, line 247, in train
outputs = model(inputs)
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\modules\module.py”, line 491, in call
result = self.forward(*input, **kwargs)
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\parallel\data_parallel.py”, line 112, in forward
return self.module(*inputs[0], **kwargs[0])
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\modules\module.py”, line 491, in call
result = self.forward(*input, **kwargs)
File “D:\pycharm\WORKS\VSBNet_pytorch\pytorch-classification-master\models\cifar\vgg_bi.py”, line 32, in forward
x = self.features(x)
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\modules\module.py”, line 491, in call
result = self.forward(*input, **kwargs)
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\modules\container.py”, line 91, in forward
input = module(input)
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\modules\module.py”, line 491, in call
result = self.forward(*input, **kwargs)
File “D:\pycharm\WORKS\venv\lib\site-packages\torch\nn\modules\conv.py”, line 168, in forward
torch.sum(tmp_tensor)
RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #4 ‘other’

and my code (I made some modifications to the source file “conv.py” in torch.nn.modules):

# Add to Bi_weight[channal] the sign of the masked weights, scaled by
# sum(mask * |weight|) / sum(mask).
# NOTE(review): the reported error says argument 'other' is a CUDA tensor where
# a CPU tensor was expected; presumably self.Bi_weight is still on the CPU
# while tmp_tensor / self.weight are on the GPU — TODO confirm.
self.Bi_weight[channal] += torch.sign(tmp_tensor * self.weight[channal]) *
torch.sum(tmp_tensor * torch.abs(self.weight[channal])) /
torch.sum(tmp_tensor)

# tmp_tensor: float mask, moved to CUDA, selecting the weights with
# sep_point(i, group_num) <= |weight| < sep_point(i+1, group_num).
tmp_tensor = ((torch.abs(
self.weight[channal]) >= sep_point(i, self.group_num)) *
(torch.abs(self.weight[channal]) < sep_point(i+1, self.group_num))).float().cuda()