Hello,
I’m trying to train a CNN on my own dataset on a GPU. I’ve read the tutorials and wrote my training code:
def train_model(model, device, dataloaders, criterion, optimizer, lr_scheduler, loss_fn_pos_weight, num_epochs=20):
since = time.time()
# send model to GPU
model = model.to(device)
train_loss_history = []
val_loss_history = []
train_acc_history = []
val_acc_history = []
train_mcc_history = []
val_mcc_history = []
best_model_wts = copy.deepcopy(model.state_dict())
best_mcc = 0.0
for epoch in range(num_epochs):
print(' ')
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 15)
epoch_labels_train = []
epoch_labels_val = []
epoch_labels = {'train':epoch_labels_train, 'val':epoch_labels_val}
epoch_preds_train = []
epoch_preds_val = []
epoch_preds = {'train':epoch_preds_train, 'val':epoch_preds_val}
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
# accumulated loss for a epoch
running_loss = 0.0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# forward
# Get model outputs and calculate loss
#labels = Variable(labels)
labels = labels.view(-1,1)
# calculate output of neural network
outputs = model(inputs)
# add a positive weight(see docs) to loss function
criterion.pos_weight = torch.tensor([loss_fn_pos_weight])
# loss = criterion(outputs, labels.float())
print(labels)
print(labels.float())
loss = criterion(outputs, labels.float())
......
Then I got the following error, which asks me to pass a torch.FloatTensor to the loss function:
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x2b6480e57550>>
Traceback (most recent call last):
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
self._shutdown_workers()
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
self.worker_result_queue.get()
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 344, in get
return _ForkingPickler.loads(res)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
fd = df.detach()
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
return recvfds(s, 1)[0]
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 153, in recvfds
msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_LEN(bytes_size))
ConnectionResetError: [Errno 104] Connection reset by peer
Traceback (most recent call last):
File "transfer_ft_extract.py", line 116, in <module>
num_epochs = num_epochs
File "/home/wshi6/deep-learning/bionois_hydroph/control_vs_heme/train.py", line 70, in train_model
loss = criterion(outputs, labels.float())
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 573, in forward
reduction=self.reduction)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/functional.py", line 1653, in binary_cross_entropy_with_logits
log_weight = 1 + (pos_weight - 1) * target
RuntimeError: Expected object of type torch.FloatTensor but found type torch.cuda.FloatTensor for argument #2 'other'
What is even stranger is that if I don’t send my labels to the GPU, i.e., I feed the loss function a torch.FloatTensor, I get an error asking me to feed the loss function a torch.cuda.FloatTensor instead:
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x2b49c2c61390>>
Traceback (most recent call last):
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 399, in __del__
self._shutdown_workers()
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 378, in _shutdown_workers
self.worker_result_queue.get()
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 344, in get
return _ForkingPickler.loads(res)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
fd = df.detach()
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
return recvfds(s, 1)[0]
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/multiprocessing/reduction.py", line 153, in recvfds
msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_LEN(bytes_size))
ConnectionResetError: [Errno 104] Connection reset by peer
Traceback (most recent call last):
File "transfer_ft_extract.py", line 116, in <module>
num_epochs = num_epochs
File "/home/wshi6/deep-learning/bionois_hydroph/control_vs_heme/train.py", line 70, in train_model
loss = criterion(outputs, labels.float())
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__
result = self.forward(*input, **kwargs)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 573, in forward
reduction=self.reduction)
File "/home/wshi6/.conda/envs/pytorch/lib/python3.6/site-packages/torch/nn/functional.py", line 1654, in binary_cross_entropy_with_logits
loss = input - input * target + log_weight * (max_val + ((-max_val).exp() + (-input - max_val).exp()).log())
RuntimeError: Expected object of type torch.cuda.FloatTensor but found type torch.FloatTensor for argument #2 'other'
Any clue about this?
Thanks.