Hi,
I’m trying to recreate this tutorial in my local environment:
This is what I’ve done so far:
import os
import numpy as np
import torch
import torchvision
import torch.utils.data
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from git.repo.base import Repo
import shutil
class four_chs(torch.utils.data.Dataset):
    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "crop"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "mask"))))

    def __getitem__(self, idx):
        # load images and masks
        img_path = os.path.join(self.root, "crop", self.imgs[idx])
        mask_path = os.path.join(self.root, "mask", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask = Image.open(mask_path)
        # convert from image object to array
        mask = np.array(mask)
        obj_ids = np.unique(mask)
        # the first value is the background and the low values are noise,
        # so keep only the last four ids (my objects)
        obj_ids = obj_ids[-4:]
        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]
        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        return len(self.imgs)
if os.path.isdir("vision"):
    print("vision present")
else:
    Repo.clone_from("https://github.com/pytorch/vision.git", "vision")
shutil.copy('vision/references/detection/utils.py', 'utils.py')
shutil.copy('vision/references/detection/transforms.py', 'transforms.py')
shutil.copy('vision/references/detection/coco_eval.py', 'coco_eval.py')
shutil.copy('vision/references/detection/engine.py', 'engine.py')
shutil.copy('vision/references/detection/coco_utils.py', 'coco_utils.py')
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# load a model pre-trained on COCO
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
# replace the classifier with a new one, that has
# num_classes which is user-defined
num_classes = 5  # 4 object classes + background
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    return model
from engine import train_one_epoch, evaluate
import utils
import transforms as oldtransforms
import torchvision.transforms as tvtransforms
def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch tensor
    transforms.append(oldtransforms.PILToTensor())
    if train:
        # during training, randomly flip the training images
        # and ground-truth for data augmentation
        transforms.append(oldtransforms.RandomHorizontalFlip(0.5))
    return oldtransforms.Compose(transforms)
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
dataset = four_chs('/home/john/Downloads', get_transform(train=True))
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=True, num_workers=2,
    collate_fn=utils.collate_fn)
# For Training
images, targets = next(iter(data_loader))
images = list(image for image in images)
targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)
At this point, I get this error message:
TypeError: Expected input images to be of floating type (in range [0, 1]), but found type torch.uint8 instead
This is the traceback:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torchvision/models/detection/generalized_rcnn.py", line 83, in forward
    images, targets = self.transform(images, targets)
  File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torchvision/models/detection/transform.py", line 129, in forward
    image = self.normalize(image)
  File "/home/nightjar/.local/share/virtualenvs/instance_seg_training-z1i-LkY_/lib64/python3.9/site-packages/torchvision/models/detection/transform.py", line 150, in normalize
    raise TypeError(
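In case it is useful, this is how I would check what the data loader is actually handing to the model (given the error message, I assume it would show torch.uint8):

# inspect one batch from the data loader defined above
check_images, check_targets = next(iter(data_loader))
print(check_images[0].dtype)  # I expect torch.uint8 here, matching the error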
I ran into this error a few days ago and searched for a solution. I thought I had one, but it raised another problem for me, which I asked about here:
Based on that discussion, I figured I needed to get my transforms straightened out. I searched some more, found several related threads on this forum, and tried a few different ways of combining the two transforms, PILToTensor and RandomHorizontalFlip.
Now I am importing two versions of transforms: one from vision/references/detection and one from torchvision:
import transforms as oldtransforms
import torchvision.transforms as tvtransforms
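As far as I can tell (I may be wrong about this), the transforms in vision/references/detection take and return an (image, target) pair, while the plain torchvision.transforms versions take only the image. A dummy example of what I mean (not my real data):

# my understanding of the two APIs, using a dummy image and target
dummy_img = Image.fromarray(np.zeros((8, 8, 3), dtype=np.uint8))
dummy_target = {}
pair_img, pair_target = oldtransforms.PILToTensor()(dummy_img, dummy_target)  # pair in, pair out
plain_img = tvtransforms.PILToTensor()(dummy_img)  # image in, image out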
I won't go into all the variations of get_transform that I attempted.
I read that converting a PIL image to a tensor also scales it to [0, 1], but that does not seem to be happening here. What am I doing wrong?
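For reference, here is a minimal comparison (with a dummy image, not my real data) of the two conversions as I understand them from the torchvision docs: ToTensor should give floats in [0, 1], while PILToTensor keeps the raw uint8 values.

# compare the two PIL-to-tensor conversions on a dummy image
dummy = Image.fromarray((np.random.rand(8, 8, 3) * 255).astype(np.uint8))
scaled = tvtransforms.ToTensor()(dummy)      # should be float32 in [0, 1]
raw = tvtransforms.PILToTensor()(dummy)      # should stay uint8 in [0, 255]
print(scaled.dtype, scaled.max(), raw.dtype, raw.max())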