Mask r-cnn training runs infinitely without output or error

Gladius_monta · May 5, 2024, 10:07am

Here’s a brief overview of my process:

I generated a dataset using PyTorch by applying the SAM mask from bounding boxes to my images.
After creating the dataset, I split it into training and testing sets.
I loaded both sets using torch.utils.data.DataLoader.
I’m using a pre-trained model with 11 classes.

However, I’m encountering an issue during training. The process seems to take an unusually long time, and I’m not seeing any progress or error messages to troubleshoot from.

What might be going wrong or how to improve my training process?

ptrblck · May 5, 2024, 4:55pm

At the bottom of the notebook it states: “Output is truncated. View as…”. Are you sure the code is stuck or is your notebook not showing the outputs according to this message?

Gladius_monta · May 5, 2024, 5:16pm

I modified the code and tried it on Google Colab.
In this cell I got a different error:

First error

Second error

SystemExit Traceback (most recent call last)
in <cell line: 3>()
6 num_epochs = 10
7 for epoch in range(num_epochs):
----> 8 train_one_epoch(model_ft, optimizer, data_loader_train, device, epoch, print_freq=100)
9 lr_scheduler.step()
10 evaluate(model, data_loader_test, device=device)

/content/engine.py in train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, scaler)
41 print(f"Loss is {loss_value}, stopping training")
42 print(loss_dict_reduced)
---> 43 sys.exit(1)
44
45 optimizer.zero_grad()

SystemExit: 1

ptrblck · May 5, 2024, 5:27pm

If you are using an older PyTorch release, could you update to the latest stable release and check if you are still seeing the error?
If not, could you post a minimal and executable code snippet reproducing the issue?

Gladius_monta · May 5, 2024, 5:35pm

I used PyTorch version 2.3.0; the other files, like engine.py and utils.py, come from GitHub.
I don’t know whether the problem is in the form of the dataset used to train Mask R-CNN, or something else.
But I think the structure of the dataset is good.

This is my dataset.py that generates the data:

# dataset.py

import torch
import torch.utils.data
import torchvision.transforms as transforms
from PIL import Image
import os
import numpy as np

class Dataset(torch.utils.data.Dataset):
    """Detection/segmentation dataset whose instance masks are precomputed with SAM.

    On construction every image is loaded, resized to ``IMAGE_SIZE`` and passed
    through the SAM ``model`` once, prompted with that image's bounding boxes,
    so that each box gets its own mask. ``__getitem__`` then returns
    ``(image_tensor, target)`` where ``target`` holds the keys ``boxes``,
    ``labels``, ``masks``, ``area``, ``iscrowd`` and ``image_id``.

    Args:
        grouped_df: Per-image table indexed so that ``grouped_df["classid"][idx]``
            and ``grouped_df["resized_bbox"][idx]`` give the labels and the
            ``[x1, y1, x2, y2]`` boxes of image ``idx``. The boxes are assumed
            to already be expressed in resized-image coordinates (they are used
            unchanged as training targets) -- TODO confirm against the code
            that builds ``grouped_df``.
        df: Table with an ``imageid`` column; its unique values, zero-padded to
            three characters, are the image file stems.
        image_folder: Directory containing ``<image_id>.jpg`` files.
        processor: SAM processor used to build model inputs and post-process masks.
        model: SAM model used to predict the masks.
        device: Device the SAM inputs are moved to.

    NOTE(review): ``processor`` and ``model`` stay attached to the instance, so
    they are serialized together with the dataset (pickling, DataLoader
    workers). Consider dropping them once the masks are computed if that
    becomes a problem.
    """

    # (width, height) every image is resized to, both for SAM and for training.
    IMAGE_SIZE = (350, 350)

    def __init__(self, grouped_df, df, image_folder, processor, model, device):
        self.df = df
        self.image_folder = image_folder
        self.grouped_df = grouped_df
        self.processor = processor
        self.model = model
        self.device = device

        self.image_ids = [self._format_image_id(num) for num in df["imageid"].unique()]
        self.classes = list(range(11))

        # Masks are computed once up front; __getitem__ only looks them up.
        self.masks = [self._predict_masks(idx) for idx in range(len(self.image_ids))]

    @staticmethod
    def _format_image_id(num):
        """Return ``num`` as a string left-padded with zeros to at least 3 characters."""
        return str(num).rjust(3, "0")

    def _load_resized_array(self, idx):
        """Load image ``idx`` as an RGB ``uint8`` array resized to ``IMAGE_SIZE``."""
        image_path = os.path.join(self.image_folder, self.image_ids[idx] + '.jpg')
        # The context manager closes the file handle; convert("RGB") guarantees
        # three channels even for grayscale / RGBA / palette files.
        with Image.open(image_path) as image:
            resized_image = image.convert("RGB").resize(self.IMAGE_SIZE)
        return np.array(resized_image).astype(np.uint8)

    def _predict_masks(self, idx):
        """Run SAM on image ``idx`` and return one ``uint8`` mask per bounding box."""
        resized_array = self._load_resized_array(idx)
        prompt_boxes = np.asarray(
            self.grouped_df["resized_bbox"][idx], dtype=np.float64
        ).reshape(-1, 4).tolist()
        if not prompt_boxes:
            # No boxes for this image: return an empty (0, H, W) mask stack.
            height, width = resized_array.shape[:2]
            return torch.zeros((0, height, width), dtype=torch.uint8)

        # Prompt SAM with the image's boxes so it predicts one mask per box;
        # without a prompt only a single mask per image comes back, which does
        # not line up with the per-box targets returned by __getitem__.
        inputs = self.processor(
            resized_array, input_boxes=[prompt_boxes], return_tensors="pt"
        ).to(self.device)
        # Keep the whole SAM forward pass (encoder and decoder) out of autograd.
        with torch.no_grad():
            image_embeddings = self.model.get_image_embeddings(inputs["pixel_values"])
            inputs.pop("pixel_values", None)
            inputs.update({"image_embeddings": image_embeddings})
            outputs = self.model(**inputs, multimask_output=False)
        masks = self.processor.image_processor.post_process_masks(
            outputs.pred_masks.cpu(),
            inputs["original_sizes"].cpu(),
            inputs["reshaped_input_sizes"].cpu(),
        )
        masks = torch.as_tensor(np.array(list(map(np.array, masks)), dtype=np.uint8))
        # Drop the batch axis (one image) and the per-box mask axis
        # (multimask_output=False yields one mask per box).
        return masks.squeeze(dim=0).squeeze(dim=1)

    def __getitem__(self, idx):
        """Return ``(image_tensor, target)`` for sample ``idx``."""
        resized_array = self._load_resized_array(idx)

        # NOTE(review): torchvision detection models treat label 0 as
        # background -- confirm that "classid" values start at 1.
        labels = torch.as_tensor(self.grouped_df["classid"][idx], dtype=torch.int64)

        # reshape keeps an empty box list as shape (0, 4) so the column
        # indexing below still works.
        boxes = torch.as_tensor(
            self.grouped_df["resized_bbox"][idx], dtype=torch.float32
        ).reshape(-1, 4)

        # Box layout is [x1, y1, x2, y2]: area = height * width.
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        target = {
            # Suppose all instances are not crowd
            "iscrowd": torch.zeros((boxes.shape[0],), dtype=torch.int64),
            "boxes": boxes,
            "labels": labels,
            "masks": self.masks[idx],
            "area": area,
            "image_id": torch.tensor([idx]),
        }
        return transforms.ToTensor()(resized_array), target

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_ids)

I save it as a .pkl file, then load it to split the data, as you can see here:

from torch.utils.data import Subset, DataLoader

# Fixed seed so the random train/test split is reproducible.
torch.manual_seed(1)
indices = torch.randperm(len(loaded_dataset)).tolist()

test_split = 0.2
size = int(len(loaded_dataset) * test_split)
# Slice at an explicit boundary: with size == 0 (very small datasets),
# indices[:-size] would be empty and indices[-size:] would be everything.
split_at = len(indices) - size
dataset_train = Subset(loaded_dataset, indices[:split_at])
dataset_test = Subset(loaded_dataset, indices[split_at:])

# NOTE(review): the dataset object keeps the SAM model and every precomputed
# mask as attributes; with num_workers > 0 each worker needs its own access to
# that object. If training appears to hang, try num_workers=0 first.
data_loader_train = DataLoader(
    dataset_train, batch_size=4, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

data_loader_test = DataLoader(
    dataset_test, batch_size=4, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

Load the Mask R-CNN model and train it:


# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NOTE(review): torchvision detection heads reserve label 0 for background,
# so 11 means background + 10 object classes -- confirm the dataset labels
# follow that convention.
num_classes = 11

# `weights="DEFAULT"` replaces the deprecated `pretrained=True` argument and
# loads the default pre-trained checkpoint.
model_ft = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

# Swap the box classification head for one sized to num_classes.
in_features = model_ft.roi_heads.box_predictor.cls_score.in_features
model_ft.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Swap the mask prediction head for one sized to num_classes.
in_features_mask = model_ft.roi_heads.mask_predictor.conv5_mask.in_channels
hidden_layer = 256
model_ft.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, num_classes)

model_ft.to(device)

# Optimize only the parameters that require gradients.
params = [p for p in model_ft.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.0005, momentum=0.9, weight_decay=0.0005)
# Multiply the learning rate by 0.1 every 5 scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=5,
                                               gamma=0.1)

import torch.multiprocessing as mp
if __name__ == "__main__":
    # force=True: the start method can only be set once per process, so
    # re-running this cell in a notebook would otherwise raise RuntimeError.
    mp.set_start_method('spawn', force=True)
    num_epochs = 10
    for epoch in range(num_epochs):
        # One pass over the training data, logging every 100 iterations.
        train_one_epoch(model_ft, optimizer, data_loader_train, device, epoch, print_freq=100)
        lr_scheduler.step()
        # Evaluate the model that was just trained; the original call passed
        # `model`, a name this script never defines.
        evaluate(model_ft, data_loader_test, device=device)