How do I start training an object detection model in PyTorch

I want to train a custom object detection model in PyTorch. I am creating a CustomDataSet class for loading my dataset. My code to create the data is as follows

class CustomDataset(
    def __init__(self, root_dir,transform=None):
        self.root = root_dir
        self.imgs = list(sorted(os.listdir(os.path.join(root_dir, "images/"))))
        self.annotations = list(sorted(os.listdir(os.path.join(root_dir, "annotations/"))))

        self._classes = ('__background__',  # always index 0

        self._class_to_ind = {'car':'3', 'person':'1', 'bicycle':'2', 'dog':'18','other':'91'}

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        self.num_classes = 6
        img = os.path.join(self.root, "images/", self.rgb_imgs[idx])
        img =
        img = np.array(img)
        img = torch.from_numpy(img)

        filename = os.path.join(self.root,'annotations',self.annotations[idx])
        tree = ET.parse(filename)
        objs = tree.findall('object')

        num_objs = len(objs)
        boxes = np.zeros((num_objs, 4), dtype=np.uint16)
        labels = np.zeros((num_objs), dtype=np.float32)
        seg_areas = np.zeros((num_objs), dtype=np.float32)

        boxes = []
        for ix, obj in enumerate(objs):
            bbox = obj.find('bndbox')
            x1 = float(bbox.find('xmin').text)
            y1 = float(bbox.find('ymin').text)
            x2 = float(bbox.find('xmax').text)
            y2 = float(bbox.find('ymax').text)

            cls = self._class_to_ind[obj.find('name').text.lower().strip()]
            boxes.append([x1, y1, x2, y2])
            labels[ix] = cls
            seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        seg_areas = torch.as_tensor(seg_areas, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.float32)
        target =  {'boxes': boxes,
                'labels': labels,
                'seg_areas': seg_areas,

        return img,target

My main function to start training is as follows

num_classes = 6
model = fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

dataset_train = CustomDataset('images/train/')
dataset_val = CustomDataset('images/val/')

data_loader_train =
    dataset_train, batch_size=1, shuffle=True)

data_loader_test =
    dataset_val, batch_size=1, shuffle=False)

device = torch.device('cuda')

params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)

lr_scheduler = torch.optim.lr_schedler.StepLR(optimizer,

num_epochs = 10

for epoch in range(num_epochs):

    # Code to start training

I have defined my model, created a DataLoader for my training and validation set, but I am not sure how to start training and how exactly I should provide inputs to my model.
I am a beginner in PyTorch and it would be great if someone could help me out

1 Like

It looks like you’ve already followed this tutorial, so I would recommend to stick to it and use the provided train_one_epoch method. :wink:

Thanks, I actually did follow that. But I stopped because I was not quite sure what the collate function was doing from the file. I could not quite understand the reason it was used while loading the data. Could you please explain what it does and why it is needed?