pin_memory=True but data unavailable on GPU

I have a code section which creates a data loader as follows:

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

This is essentially the same loader that I've been using in other files without any problems. The relevant part of the validate method in question is:

    for i, (input, target) in enumerate(val_loader):
        target = target.cuda(device=None, non_blocking=False)
        with torch.no_grad():
            input_var = torch.autograd.Variable(input)
            target_var = torch.autograd.Variable(target)

            # compute output
            with torch.cuda.amp.autocast():
                output = model(input_var)
                loss = criterion(output, target_var)

Yet, despite the pinned memory, it throws the following traceback when I attempt to run it:

    RuntimeError                              Traceback (most recent call last)
    in
        238
        239 if __name__ == '__main__':
    --> 240     main()

    in main()
        126 for run in range(1):
        127     # evaluate on validation set
    --> 128     acc = validate(val_loader, model, criterion)
        129     prec1 = prec1 + acc[0]
        130     prec5 = prec5 + acc[1]

    in validate(val_loader, model, criterion)
        159
        160     with torch.cuda.amp.autocast():
    --> 161         output = model(input_var)
        162         loss = criterion(output, target_var)
        163         # scaler.unscale(loss)

    ~\.conda\envs\pytorch\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
        725             result = self._slow_forward(*input, **kwargs)
        726         else:
    --> 727             result = self.forward(*input, **kwargs)
        728         for hook in itertools.chain(
        729                 _global_forward_hooks.values(),

    ~\Documents\Pytorch\COCO Autoencoder\places365\caffemodel2pytorch.py in forward(self, data, **variables)
        119     assert name in variables, 'Variable [{}] does not exist. Pass it as a keyword argument or provide a layer which produces it.'.format(name)
        120     inputs = [variables[name] if propagate_down else variables[name].detach() for name, propagate_down in zip(module.caffe_input_variable_names, module.caffe_propagate_down)]
    --> 121     outputs = module(*inputs)
        122     if not isinstance(outputs, tuple):
        123         outputs = (outputs, )

    ~\.conda\envs\pytorch\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
        725             result = self._slow_forward(*input, **kwargs)
        726         else:
    --> 727             result = self.forward(*input, **kwargs)
        728         for hook in itertools.chain(
        729                 _global_forward_hooks.values(),

    ~\Documents\Pytorch\COCO Autoencoder\places365\caffemodel2pytorch.py in forward(self, x)
        330     convert_to_gpu_if_enabled(self)
        331     init_weight_bias(self, requires_grad = requires_grad)
    --> 332     return super(Convolution, self).forward(x)
        333
        334 def set_parameters(self, weight = None, bias = None):

    ~\.conda\envs\pytorch\lib\site-packages\torch\nn\modules\conv.py in forward(self, input)
        421
        422     def forward(self, input: Tensor) -> Tensor:
    --> 423         return self._conv_forward(input, self.weight)
        424
        425 class Conv3d(_ConvNd):

    ~\.conda\envs\pytorch\lib\site-packages\torch\nn\modules\conv.py in _conv_forward(self, input, weight)
        417                         weight, self.bias, self.stride,
        418                         _pair(0), self.dilation, self.groups)
    --> 419         return F.conv2d(input, weight, self.bias, self.stride,
        420                         self.padding, self.dilation, self.groups)
        421

    RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.HalfTensor) should be the same

This happens despite the data-loading and data-feeding code being identical to code that works perfectly fine elsewhere; the sole difference is in how the model is loaded. But it is the model that is correctly on the GPU rather than the data, so why would the model loading matter?
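
If I'm reading the error right, the mismatch goes away once I copy each batch to the GPU explicitly inside the loop. A minimal sketch of what I mean (assuming a single default GPU, and dropping the deprecated Variable wrappers):

    for i, (input, target) in enumerate(val_loader):
        # Copy both tensors from (pinned) host memory to the GPU.
        # non_blocking=True only helps because the loader was built with pin_memory=True.
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        with torch.no_grad(), torch.cuda.amp.autocast():
            output = model(input)
            loss = criterion(output, target)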

The problem appears to be that the new model was moved onto the GPU with model.cuda() rather than model = torch.nn.DataParallel(model).cuda(), though I don't know why that makes a difference (or what current best practice is, since this is based on quite old code).
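
To make the difference concrete, this is roughly (paraphrased, not my exact lines) how the model ends up on the GPU in the two scripts:

    # Failing script: only the parameters and buffers are moved to the GPU;
    # input batches still have to be transferred manually before the forward pass.
    model = model.cuda()

    # Working script: DataParallel wraps the model, and its forward()
    # scatters the incoming batch onto the GPU(s) before calling the wrapped model.
    model = torch.nn.DataParallel(model).cuda()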

Edit: What leaves me more confused now is how the DataParallel version causes the inputs to be copied to the GPU automatically, whereas pinning memory just reserves page-locked space in main memory for fast transfers…
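
In case it clarifies the question: my understanding is that pin_memory=True only makes the host-side batches page-locked so that the host-to-device copy can run asynchronously, but that copy still has to be issued explicitly. A small self-contained check of that understanding (dummy data, nothing from my actual scripts):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Dummy dataset just to inspect what pin_memory actually does to the batches.
    dataset = TensorDataset(torch.randn(64, 3, 224, 224),
                            torch.randint(0, 365, (64,)))
    loader = DataLoader(dataset, batch_size=16, pin_memory=True)

    for input, target in loader:
        # The batch is page-locked, but still in host RAM, not on the GPU.
        assert input.is_pinned() and not input.is_cuda

        # Pinning only makes this copy asynchronous with respect to the host;
        # the copy itself never happens on its own.
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)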