Hi, I’m trying to train a pre-trained model, with my GPU. Everything worked well but I wanted to code a class in order to have a cleaner code. On my class I do the same operations that I was doing without my class, but I have an issue that I don’t understand, with the line model(images, targets)
:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-10-6af86cd2bf74> in <module>
----> 1 class_model.train(num_epoch = 1)
<ipython-input-6-700e8ee37055> in train(self, num_epoch, gpu)
133
134 # Train for one epoch, printing every 10 iterations
--> 135 train_his_, list_losses, list_losses_dict = train_one_epoch(model, optimizer, self.data_loader, device, epoch, print_freq=10)
136 list_of_list_losses.append(list_losses)
137 # Compute losses over the validation set
<ipython-input-2-11a7da6d9e67> in train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq)
508
509 # Feed the training samples to the model and compute the losses
--> 510 loss_dict = model(images, targets)
511 losses = sum(loss for loss in loss_dict.values())
512
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/lib/python3.8/site-packages/torchvision/models/detection/generalized_rcnn.py in forward(self, images, targets)
91 .format(degen_bb, target_idx))
92
---> 93 features = self.backbone(images.tensors)
94 if isinstance(features, torch.Tensor):
95 features = OrderedDict([('0', features)])
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/lib/python3.8/site-packages/torchvision/models/detection/backbone_utils.py in forward(self, x)
42
43 def forward(self, x):
---> 44 x = self.body(x)
45 x = self.fpn(x)
46 return x
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/lib/python3.8/site-packages/torchvision/models/_utils.py in forward(self, x)
60 out = OrderedDict()
61 for name, module in self.items():
---> 62 x = module(x)
63 if name in self.return_layers:
64 out_name = self.return_layers[name]
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/conv.py in forward(self, input)
441
442 def forward(self, input: Tensor) -> Tensor:
--> 443 return self._conv_forward(input, self.weight, self.bias)
444
445 class Conv3d(_ConvNd):
~/anaconda3/lib/python3.8/site-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
437 weight, bias, self.stride,
438 _pair(0), self.dilation, self.groups)
--> 439 return F.conv2d(input, weight, bias, self.stride,
440 self.padding, self.dilation, self.groups)
441
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.cuda.DoubleTensor) should be the same
My device is already cuda:0.
The data are called on this function basically from the engine librairie. I tried to cast into a torch.float but it didn’t work
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
model.train()
metric_logger = utilss.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utilss.SmoothedValue(window_size=1, fmt='{value:.6f}'))
header = 'Epoch: [{}]'.format(epoch)
list_losses = []
list_losses_dict = []
for i, values in tqdm(enumerate(metric_logger.log_every(data_loader, print_freq, header))):
images, targets = values
print(type(images))
images = list(image.to(device, dtype=torch.float) for image in images)
print(type(images))
targets = [{k: v.to(device, dtype = torch.float) for k, v in t.items()} for t in targets]
#images = [image.cuda() for image in images]
# Feed the training samples to the model and compute the losses
loss_dict = model(images, targets)
losses = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utilss.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
print("Loss is {}, stopping training".format(loss_value))
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
list_losses.append(loss_value)
# Pytorch function to initialize optimizer
optimizer.zero_grad()
# Compute gradients or the backpropagation
losses.backward()
# Update current gradient
optimizer.step()
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
# Record losses to plot learning curves
if i == 0:
history = {key: val.cpu().detach() for key, val in loss_dict_reduced.items()}
history['loss'] = losses_reduced.cpu().detach()
else:
for key, val in loss_dict_reduced.items():history[key] += val.cpu().detach()
history['loss'] += losses_reduced.cpu().detach()
torch.save(model.state_dict(), FILE_model_dict_gpu)
list_losses_dict.append(loss_dict_reduced)
save_obj(history, "history_train_fixed_frame_lab2_and_lab7_5epoch_07-07")
plt.plot([x for x in range(len(list_losses))],list_losses)
plt.show()
return history, list_losses, list_losses_dict
Thanks for your help !