I am trying to build an instance segmentation model for my own dataset by following the PyTorch documentation (PyTorch Segmentation). I ran the model on the Penn-Fudan dataset from the tutorial and everything worked correctly. I then annotated 170 images of my own with Labelme, marking each solid circle in the image. I preprocessed the images and created masks in which each pixel value equals the id of the object it belongs to (1 for the first circle in the image, 2 for the second circle, etc.). My directory structure is the same as the Penn-Fudan dataset's, but when I run the model on my data I get a noisy, rectangular prediction, as seen in the second image. The model seems to be somewhat close to succeeding. Any ideas on how to annotate or preprocess the images correctly?
import os
import numpy as np
import torch
from PIL import Image
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import transforms as T
import utils
from engine import train_one_epoch, evaluate
class PennFudanDataset(object):
    """Instance-segmentation dataset read from ``CircleImages`` / ``CircleMasks``.

    Each mask is a single-channel image in which pixel value 0 is treated as
    background and every other distinct value identifies one object instance.
    Images and masks are paired by their position in the sorted directory
    listings, so both folders must contain matching, identically ordered files.

    Args:
        root: Directory that contains the ``CircleImages`` and ``CircleMasks``
            sub-directories.
        transforms: Optional callable taking ``(img, target)`` and returning
            the transformed pair, or ``None`` to skip transformation.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Sort both listings so the i-th image lines up with the i-th mask.
        self.imgs = list(sorted(os.listdir(os.path.join(root, 'CircleImages'))))
        self.masks = list(sorted(os.listdir(os.path.join(root, 'CircleMasks'))))

    def __getitem__(self, idx):
        """Return ``(img, target)`` for sample ``idx``, after optional transforms."""
        img_path = os.path.join(self.root, 'CircleImages', self.imgs[idx])
        mask_path = os.path.join(self.root, 'CircleMasks', self.masks[idx])
        img = Image.open(img_path).convert('RGB')
        # The mask is deliberately NOT converted: its raw pixel values are the
        # instance ids, and a colour conversion would destroy them.
        mask = np.array(Image.open(mask_path))

        target = self._build_target(mask, idx)
        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    @staticmethod
    def _build_target(mask, idx):
        """Turn an ``(H, W)`` instance-id array into a detection target dict.

        Args:
            mask: 2-D integer array; 0 is background, other values are ids.
            idx: Sample index, stored as ``image_id``.

        Returns:
            Dict with ``boxes`` (N, 4) float32 as ``[xmin, ymin, xmax, ymax]``,
            ``labels`` (N,) int64 all ones, ``masks`` (N, H, W) uint8,
            ``image_id`` (1,), ``area`` (N,) and ``iscrowd`` (N,) int64 zeros.
            N is 0 when the mask contains no objects.

        Raises:
            ValueError: If ``mask`` is not two-dimensional (for example an RGB
                mask), because the pixel values then are not instance ids.
        """
        if mask.ndim != 2:
            raise ValueError(
                'Expected a single-channel instance-id mask, got array of shape '
                '{}'.format(mask.shape)
            )

        obj_ids = np.unique(mask)
        # Drop the background id explicitly. Slicing off the first unique value
        # would discard a real object whenever a mask has no background pixel.
        obj_ids = obj_ids[obj_ids != 0]
        num_objs = len(obj_ids)

        # One boolean (H, W) mask per instance id, stacked to (N, H, W).
        masks = mask == obj_ids[:, None, None]

        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append(
                [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]
            )

        # Explicit reshape keeps the (N, 4) layout when there are no objects;
        # otherwise the column indexing for ``area`` below would fail.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(num_objs, 4)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {}
        target['boxes'] = boxes
        # Single foreground class: every instance gets label 1.
        target['labels'] = torch.ones((num_objs,), dtype=torch.int64)
        target['masks'] = torch.as_tensor(masks, dtype=torch.uint8)
        target['image_id'] = torch.tensor([idx])
        target['area'] = area
        target['iscrowd'] = torch.zeros((num_objs,), dtype=torch.int64)
        return target

    def __len__(self):
        """Return the number of images found under ``CircleImages``."""
        return len(self.imgs)
def get_model_instance_segmentation(num_classes):
    """Return a pretrained Mask R-CNN with fresh box and mask prediction heads.

    Args:
        num_classes: Number of output classes for both replacement heads.

    Returns:
        The ``maskrcnn_resnet50_fpn`` model with its ROI-head predictors
        replaced by newly constructed ones sized for ``num_classes``.
    """
    net = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = net.roi_heads

    # New box classifier, fed by the same feature width as the existing one.
    box_width = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_width, num_classes)

    # New mask predictor: same input channels, 256 hidden channels.
    mask_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(mask_channels, 256, num_classes)

    return net
def get_transform(train):
    """Compose the image/target transform pipeline.

    Args:
        train: When truthy, a random horizontal flip (probability 0.5) is
            appended after the tensor conversion.

    Returns:
        A ``T.Compose`` wrapping the selected transforms.
    """
    pipeline = [T.ToTensor()]
    if train:
        # Augmentation is applied to the training pipeline only.
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)
# Train a Mask R-CNN on the circle dataset, evaluate it, then display the
# prediction for one held-out image.

# Both datasets read the same folder; only the transform pipeline differs.
dataset = PennFudanDataset('CircleDataset', get_transform(train=True))
dataset_test = PennFudanDataset('CircleDataset', get_transform(train=False))

# Fixed seed so the train/test split is reproducible; the last 50 shuffled
# indices are held out for testing.
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=0,
    collate_fn=utils.collate_fn)

device = torch.device('cpu')

# Two classes: background plus the single "circle" class.
num_classes = 2
model = get_model_instance_segmentation(num_classes)
model.to(device)

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# NOTE(review): the torchvision fine-tuning tutorial this script follows
# trains for 10 epochs. A single epoch is likely too short for the mask head
# to produce clean shapes -- consider raising this before judging the output.
num_epochs = 1
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, data_loader_test, device=device)

# Run inference on the first held-out sample.
img, _ = dataset_test[0]
model.eval()
with torch.no_grad():
    prediction = model([img.to(device)])
print(prediction)

# Show the input image (CHW float tensor -> HWC uint8 array).
x = Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())
x.show()

# The model may return no detections at all, in which case indexing the first
# mask would raise IndexError; only display a mask when one exists.
predicted_masks = prediction[0]['masks']
if len(predicted_masks) > 0:
    # NOTE(review): torchvision documents these masks as soft values in
    # [0, 1]; scaling by 255 shows the raw scores. Thresholding (commonly at
    # 0.5) would give a crisp binary mask instead -- confirm which is wanted.
    mask = Image.fromarray(predicted_masks[0, 0].mul(255).byte().cpu().numpy())
    mask.show()
else:
    print('No objects were detected in the test image; there is no mask to show.')