How to handle different number of bounding boxes in prediction and ground truth

Hello, I am working on a computer vision project for work and am running into an issue with my code. I am using rastervision, torchvision, and pytorch-lightning. I am using a pretrained fasterrcnn_resnet5-_fpn_v2 model as my base and am training it to detect trees. My model is able to read my input data and produce an output consisting of predicted bounding boxes and predicted labels. My issue comes next, when I need to calculate a loss value for my bounding boxes. During the beginning of training, my model is predicting more bounding boxes than are truly there. For example, my ground truth images have anywhere from 0-5 boxes in each image, but during its first epoch my model predicts anywhere from 0-100 bounding boxes. I keep getting errors along the line of

RuntimeError: The size of tensor a (2) must match the size of tensor b (53) at non-singleton dimension 0

Here is my code:

import albumentations as A
from rastervision.core.data import ClassConfig
from rastervision.pytorch_learner import (
    ObjectDetectionRandomWindowGeoDataset,
    ObjectDetectionSlidingWindowGeoDataset,
)
from torch.utils.data import DataLoader
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from rastervision.pipeline.file_system import make_dir
from pytorch_lightning.loggers import TensorBoardLogger
from rastervision.pytorch_learner.object_detection_utils import TorchVisionODAdapter
from rastervision.pytorch_learner.object_detection_utils import collate_fn as TVODcollate_fn
from torchvision.ops import generalized_box_iou_loss

def exit():
    import sys
    sys.exit(0)

class ObjectDetection(pl.LightningModule):

    def __init__(self, backbone, lr=1e-4):
        super().__init__()
        self.backbone = TorchVisionODAdapter(backbone)
        self.lr = lr

    def forward(self, img):
        return self.backbone(img)
    
    def boxlist_to_tensor(self, bl):
        # convert list of boxlists into box and label tensors of shape (N,4) and (N)
        bl = [self.backbone.boxlist_to_model_input_dict(b) for b in bl]
        boxes = [b["boxes"] for b in bl]
        labels = [b["labels"] for b in bl]
        boxes = torch.vstack(boxes)
        labels = torch.concat(labels).float()
        return boxes, labels
    
    def training_step(self, batch, batch_idx):
        print("Sanity training")
        x, y = batch
        y_hat = self.backbone.forward(x)
        box_hat, label_hat = self.boxlist_to_tensor(y_hat)
        box, label = self.boxlist_to_tensor(y)
        lossfn = torch.nn.MSELoss()
        box_loss = lossfn(box, box_hat)
        return box_loss
    
    def validation_step(self, batch, batch_idx):
        print("Sanity validation")
        x, y = batch
        y_hat = self.backbone.forward(x)
        box_hat, label_hat = self.boxlist_to_tensor(y_hat)
        box, label = self.boxlist_to_tensor(y)
        print(label_hat.shape, label.shape)
        lossfn = torch.nn.MSELoss()
        box_loss = lossfn(box, box_hat)
        return box_loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.backbone.parameters(), lr=self.lr)
        return optimizer
    
class RVLightning:
    
    def __init__(self, tr_uris, val_uris, pred_uris, output, class_config, kw=None):
        self.train_uris = tr_uris
        self.val_uris = val_uris
        self.pred_uris = pred_uris
        self.cc = ClassConfig(
            names=class_config["names"], 
            colors=class_config["colors"],
            null_class="null")
        self.output_uri = output.get("uri")
        self.bucket = output.get("bucket")
        self.kw = kw
        
    def build_train_ds(self):
        ... build and return training dataset ...
    
    def build_val_ds(self):
        ... build and return validation dataset ...
    
    def build_train_val_loader(self):
        tds, vds = self.build_train_ds(), self.build_val_ds()
        kw = self.kw.get("train_kw", {})
        batch_size = kw.get("batch_size", 8)
        train = DataLoader(tds, batch_size=batch_size, shuffle=True, 
                           num_workers=4, collate_fn=TVODcollate_fn)
        val = DataLoader(vds, batch_size=batch_size, 
                         num_workers=4, collate_fn=TVODcollate_fn)
        return train, val
    
    def train(self):
        kw = self.kw.get("train_kw", {})
        lr = float(kw.get("lr", 1e-4))
        epochs = kw.get("epochs", 1)
        output_dir = self.output_uri
        make_dir(output_dir)
        fast_dev_run = False
        backbone = fasterrcnn_resnet50_fpn_v2(
            num_classes=len(self.cc), pretrained=True)
        model = ObjectDetection(backbone, lr=lr)
        tb_logger = TensorBoardLogger(save_dir=output_dir + "/tensorboard", flush_secs=10)
        trainer = pl.Trainer(
            accelerator='auto',
            min_epochs=1,
            max_epochs=epochs+1,
            default_root_dir=output_dir + "/trainer",
            logger=[tb_logger],
            fast_dev_run=fast_dev_run,
            log_every_n_steps=1,
        )
        train_dl, val_dl = self.build_train_val_loader()
        trainer.fit(model, train_dl, val_dl)
        trainer.save_checkpoint(output_dir + "/trainer/final-model.ckpt")

def run(config_path):
    from configreader import yaml2dict

    conf = yaml2dict(config_path)

    obj = RVLightning(
        conf["train_uri"],
        conf["val_uri"],
        conf["pred_uri"],
        conf["output"],
        conf["class_config"],
        conf
    )

    obj.train()

if __name__ == "__main__":
    import sys
    run(sys.argv[1])

What can I do to compare the output of my model against my ground truth?