RuntimeError when sending tensors to CUDA


#1

Hello,
I’m trying to train a CNN on my own dataset on a GPU. I’ve read the tutorials and written my training code:

def train_model(model, device, dataloaders, criterion, optimizer, lr_scheduler, loss_fn_pos_weight, num_epochs=20):
    since = time.time()

    # send model to GPU
    model = model.to(device)

    train_loss_history = []
    val_loss_history = []
    train_acc_history = []
    val_acc_history = []
    train_mcc_history = []
    val_mcc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_mcc = 0.0

    for epoch in range(num_epochs):
        print(' ')
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 15)

        epoch_labels_train = []
        epoch_labels_val = []
        epoch_labels = {'train':epoch_labels_train, 'val':epoch_labels_val}
        epoch_preds_train = []
        epoch_preds_val = []
        epoch_preds = {'train':epoch_preds_train, 'val':epoch_preds_val}

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            # accumulated loss for a epoch
            running_loss = 0.0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # forward
                # Get model outputs and calculate loss
                #labels = Variable(labels)
                labels = labels.view(-1,1)

                # calculate output of neural network
                outputs = model(inputs)

                # add a positive weight(see docs) to loss function
                criterion.pos_weight = torch.tensor([loss_fn_pos_weight])
                # loss = criterion(outputs, labels.float())
                print(labels)
                print(labels.float())
                loss = criterion(outputs, labels.float())
                ......

Then I got the following error, asking me to feed a torch.FloatTensor to the loss function:

Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x2b6480e57550>>
Traceback (most recent call last):
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
    self._shutdown_workers()
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 344, in get
    return _ForkingPickler.loads(res)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
    fd = df.detach()
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
    return reduction.recv_handle(conn)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
    return recvfds(s, 1)[0]
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 153, in recvfds
    msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_LEN(bytes_size))
ConnectionResetError: [Errno 104] Connection reset by peer
Traceback (most recent call last):
  File "transfer_ft_extract.py", line 116, in <module>
    num_epochs = num_epochs
  File "/home/wshi6/deep-learning/bionois_hydroph/control_vs_heme/train.py", line 70, in train_model
    loss = criterion(outputs, labels.float())
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 573, in forward
    reduction=self.reduction)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/functional.py", line 1653, in binary_cross_entropy_with_logits
    log_weight = 1 + (pos_weight - 1) * target
RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 'other'

What is even stranger is that if I don’t send my labels to the GPU, i.e., I feed the loss function a torch.FloatTensor, I get an error asking me to feed the loss function a torch.cuda.FloatTensor instead:

Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x2b49c2c61390>>
Traceback (most recent call last):
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
    self._shutdown_workers()
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 344, in get
    return _ForkingPickler.loads(res)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
    fd = df.detach()
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
    return reduction.recv_handle(conn)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
    return recvfds(s, 1)[0]
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 153, in recvfds
    msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_LEN(bytes_size))
ConnectionResetError: [Errno 104] Connection reset by peer
Traceback (most recent call last):
  File "transfer_ft_extract.py", line 116, in <module>
    num_epochs = num_epochs
  File "/home/wshi6/deep-learning/bionois_hydroph/control_vs_heme/train.py", line 70, in train_model
    loss = criterion(outputs, labels.float())
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 573, in forward
    reduction=self.reduction)
  File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/functional.py", line 1654, in binary_cross_entropy_with_logits
    loss = input - input * target + log_weight * (max_val + ((-max_val).exp() + (-input - max_val).exp()).log())
RuntimeError: Expected object of type torch.cuda.FloatTensor but found type torch.FloatTensor for argument #2 'other'

Any clue about this?
Thanks.


(Zhaomang Sun) #2

criterion.pos_weight = torch.tensor([loss_fn_pos_weight])
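In the line above, the tensor assigned to ‘criterion.pos_weight’ is created on the CPU rather than on the GPU, so it does not match the device of your outputs and labels. It should be written like this: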
criterion.pos_weight = torch.tensor([loss_fn_pos_weight], device=device)


#3

That’s exactly where the problem is. Thank you.