Noisy, Rectangular Prediction for Instance Segmentation

I am trying to build an instance segmentation model for my own dataset using the PyTorch TorchVision finetuning tutorial (the Penn-Fudan / Mask R-CNN example). When I run the model on the Penn-Fudan dataset from the tutorial, everything works correctly. For my own data, I annotated 170 images with Labelme to locate solid circles, then preprocessed the annotations into masks whose pixel values equal the instance id (1 for the first circle in an image, 2 for the second, and so on). I use the same directory structure as the Penn-Fudan dataset, but when I train on my data I get a noisy, rectangular prediction, as seen in the second image. The model seems somewhat close to working. Any ideas on how to annotate or preprocess the images correctly?
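
For context, the mask preprocessing is along these lines (a simplified sketch rather than my exact script; the imageWidth/imageHeight/shapes/points keys are standard Labelme JSON, while the function name and output path are just placeholders):

import json
from PIL import Image, ImageDraw

def labelme_json_to_mask(json_path, out_path):
    # rasterize each Labelme polygon into a single-channel mask where pixel
    # value i marks the i-th annotated circle and 0 is background
    with open(json_path) as f:
        ann = json.load(f)
    mask = Image.new('L', (ann['imageWidth'], ann['imageHeight']), 0)
    draw = ImageDraw.Draw(mask)
    for obj_id, shape in enumerate(ann['shapes'], start=1):
        draw.polygon([tuple(p) for p in shape['points']], fill=obj_id)
    mask.save(out_path)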

import os
import numpy as np 
import torch 
from PIL import Image
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
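# transforms, utils and engine are the helper modules from torchvision's references/detection folder, as used in the tutorial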
import transforms as T 
import utils
from engine import train_one_epoch, evaluate

class PennFudanDataset(object):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
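        # image and mask file lists are paired purely by their sorted filename order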
        self.imgs = list(sorted(os.listdir(os.path.join(root, 'CircleImages'))))
        self.masks = list(sorted(os.listdir(os.path.join(root, 'CircleMasks'))))

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, 'CircleImages', self.imgs[idx])
        mask_path = os.path.join(self.root, 'CircleMasks', self.masks[idx])
        img = Image.open(img_path).convert('RGB')
        mask = Image.open(mask_path)
        mask = np.array(mask)
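        # instance ids are the distinct pixel values; drop 0 (background) and split into per-instance binary masks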
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[1:]
        masks = mask == obj_ids[:, None, None]
        num_objs = len(obj_ids)
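        # compute an axis-aligned bounding box for each instance mask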
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
        
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        image_id = torch.tensor([idx])
        area = (boxes[:,3] - boxes[:,1]) * (boxes[:,2] - boxes[:,0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target['boxes'] = boxes
        target['labels'] = labels
        target['masks'] = masks
        target['image_id'] = image_id
        target['area'] = area
        target['iscrowd'] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target
    
    def __len__(self):
        return len(self.imgs)

def get_model_instance_segmentation(num_classes):
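    # start from a Mask R-CNN pre-trained on COCO and replace its box and mask heads for num_classes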
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layers = 256
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layers, num_classes)

    return model

def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    
    return T.Compose(transforms)

dataset = PennFudanDataset('CircleDataset', get_transform(train=True))
dataset_test = PennFudanDataset('CircleDataset', get_transform(train=False))

torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0, collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1, shuffle=False, num_workers=0, collate_fn=utils.collate_fn)

device = torch.device('cpu')
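# two classes: background and circle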
num_classes = 2
model = get_model_instance_segmentation(num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

num_epochs = 1
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, data_loader_test, device=device)
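# run the trained model on one test image and display the image and its first predicted mask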
img, _ = dataset_test[0]
model.eval()
with torch.no_grad():
    prediction = model([img.to(device)])
print(prediction)
x = Image.fromarray(img.mul(255).permute(1,2,0).byte().numpy())
x.show()
mask = Image.fromarray(prediction[0]['masks'][0,0].mul(255).byte().cpu().numpy())
mask.show()

Solution: The way I named the image and mask files caused them to be mismatched when the dataset calls sorted() on each directory listing, so images were being paired with the wrong masks. Fixing the file names so the two sorted lists line up gave me accurate predictions.
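
For anyone who hits the same issue, a quick sanity check along these lines would have caught it (a small sketch; it assumes the mask filename contains the matching image's stem):

import os

root = 'CircleDataset'
imgs = sorted(os.listdir(os.path.join(root, 'CircleImages')))
masks = sorted(os.listdir(os.path.join(root, 'CircleMasks')))

assert len(imgs) == len(masks)
for img_name, mask_name in zip(imgs, masks):
    # the dataset pairs files purely by sorted position, so every pair must refer to the same image
    if os.path.splitext(img_name)[0] not in os.path.splitext(mask_name)[0]:
        print('Mismatched pair:', img_name, mask_name)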