Please see the code snippet below and the comments on the print lines:
# Demonstrates the channels_last memory format with a Conv2d layer.
#
# channels_last changes only the strides (the physical memory layout) of a
# 4-D tensor; the logical shape reported by `.shape` stays (N, C, H, W).
# To get a view whose *shape* reads (N, H, W, C), use
# `tensor.permute(0, 2, 3, 1)` on the tensor instead.

# Fall back to the CPU so the snippet also runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE: `input` shadows the builtin of the same name; the name is kept so any
# later code that refers to it keeps working.
# The tensor is created WITHOUT requires_grad: converting a grad-requiring
# tensor to channels_last yields a non-leaf copy, which does not retain
# `.grad` by default. Gradient tracking is enabled after the layout change.
input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device=device)
model = torch.nn.Conv2d(8, 4, 3)
print(input.shape)  # torch.Size([2, 8, 4, 4])

# Re-lay the data out as NHWC in memory, then mark the result as a leaf that
# requires gradients.
input = input.contiguous(memory_format=torch.channels_last).requires_grad_()
# Still torch.Size([2, 8, 4, 4]): only the strides changed, to (128, 1, 32, 8).
# `input.permute(0, 2, 3, 1).shape` would give torch.Size([2, 4, 4, 8]).
print(input.shape)

# Convert the layer's 4-D weight to channels_last as well.
model = model.to(memory_format=torch.channels_last)
print(model)  # Conv2d(8, 4, kernel_size=(3, 3), stride=(1, 1))
model = model.to(device)

out = model(input)
# Logical shape is again (N, C, H, W): torch.Size([2, 4, 2, 2]).
# `out.permute(0, 2, 3, 1).shape` would give torch.Size([2, 2, 2, 4]).
print(out.shape)
# Reported as True on CUDA by the original author: the output keeps the
# channels_last layout even though its shape is printed as NCHW.
print(out.is_contiguous(memory_format=torch.channels_last))