TypeError: Expected input images to be of float found torch.uint8 instead

Hi,
I’m trying to recreate this tutorial in my local environment:

This is what I’ve done so far:

import os
import numpy as np
import torch
import torchvision
import torch.utils.data
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from git.repo.base import Repo
import shutil

class four_chs(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):#, target_transform=None):
        self.root = root
        self.transforms = transforms
        #self.target_transform = target_transform
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "crop"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "mask"))))
        
    def __getitem__(self, idx):
        # load images ad masks
        img_path = os.path.join(self.root, "crop", self.imgs[idx])
        mask_path = os.path.join(self.root, "mask", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask = Image.open(mask_path)
        
        # Convert from image object to array
        mask = np.array(mask)
        
        obj_ids = np.unique(mask)
        # first is background, other values are noise, removed them
        obj_ids = obj_ids[-4:]
        
        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]
        
        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
        
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        
        return img, target
        
    def __len__(self):
        return len(self.imgs)


if os.path.isdir("vision") == True:
    print("vision present")
else:
    Repo.clone_from("https://github.com/pytorch/vision.git", "vision")

shutil.copy('vision/references/detection/utils.py', 'utils.py')
shutil.copy('vision/references/detection/transforms.py', 'transforms.py')
shutil.copy('vision/references/detection/coco_eval.py', 'coco_eval.py')
shutil.copy('vision/references/detection/engine.py', 'engine.py')
shutil.copy('vision/references/detection/coco_utils.py', 'coco_utils.py')

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
 
# load a model pre-trained pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=True)
 
# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 5  # 1 class (person) + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor


def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    
    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    return model


from engine import train_one_epoch, evaluate
import utils
import transforms as oldtransforms
import torchvision.transforms as tvtransforms

def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch Tensor
    transforms.append(oldtransforms.PILToTensor())
    if train:
        # during training, randomly flip the training images
        # and ground-truth for data augmentation
        transforms.append(oldtransforms.RandomHorizontalFlip(0.5))
    return oldtransforms.Compose(transforms)


model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=True)
dataset = four_chs('/home/john/Downloads', get_transform(train=True))

data_loader = torch.utils.data.DataLoader(
 dataset, batch_size=1, shuffle=True, num_workers=2,
 collate_fn=utils.collate_fn)

# For Training
images,targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images,targets)

At this point, I get this error message:

TypeError: Expected input images to be of floating type (in range [0, 1]), but found type torch.uint8 instead

This is the traceback:

Traceback (most recent call last):
File “”, line 1, in
File “/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torch/nn/modules/module.py”, line 1130, in call_impl
return forward_call(*input, **kwargs)
File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY
/lib64/python3.9/site-packages/torchvision/models/detection/generalized_rcnn.py", line 83, in forward
images, targets = self.transform(images, targets)
File “/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torch/nn/modules/module.py”, line 1130, in call_impl
return forward_call(*input, **kwargs)
File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY
/lib64/python3.9/site-packages/torchvision/models/detection/transform.py", line 129, in forward
image = self.normalize(image)
File “/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torchvision/models/detection/transform.py”, line 150, in normalize
raise TypeError(

I had this error a few days ago and had tried to search for a solution. I thought I had one and that raised another problem for me, which I asked about here:

Based on that discussion, I thought I had to get my transforms straightened out. I searched some more and found several results on this forum. I tried a few different ways to use two transforms, PILToTensor and RandomHorizontalFlip.

Now, I am importing two versions of transforms. One from vision/references/detections and one from torchvision. I tried that as

import transforms as oldtransforms
import torchvision.transforms as tvtransforms

I won’t go into all the variations of trying to def get_transforms and other variations I attempted.

I read that transforming a PIL image to a tensor includes scaling the image, but that doesn’t seem to be the case here. What am I doing wrong?