Are differently sized images between training and inference a problem?

I want to ask: does the Faster R-CNN model set up in this tutorial, https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html, deal well with new image sizes at inference time?

The reason I ask: I have instantiated a model or two, but was unable to use the provided COCOEvaluator code, so in the meantime I figured I would peek at the boxes, labels, and scores myself by running the model. Oddly, the boxes I get out of the predictions are nearly identical and very small.

Is there a transform step I need to apply to upscale the box coordinates, or is my model just poor and in need of more training?
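
For context, I set the model up essentially as the tutorial does (the two-class count below is just my setup), and I also printed the model's built-in transform to see what resizing it applies, since I understand the detection models resize internally (exact defaults may differ by torchvision version):

    import torchvision
    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

    # Faster R-CNN pre-trained on COCO, with the box predictor swapped out for
    # my own classes (background + 1 foreground class here), as in the tutorial
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes=2)

    # the detection models carry their own resize/normalize step
    # (GeneralizedRCNNTransform), applied in both train and eval mode
    print(model.transform)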

Here is where I load the COCO image:

    def __getitem__(self, item):

        (pil_image, targets) = super(CustomCocoDataset, self).__getitem__(item)

        # convert the COCO [x, y, width, height] boxes to [xmin, ymin, xmax, ymax]
        num_targets = len(targets)
        boxes = []
        for i in range(num_targets):
            box = targets[i]["bbox"]
            xmin = box[0]
            xmax = box[0] + box[2]
            ymin = box[1]
            ymax = box[1] + box[3]
            boxes.append([xmin, ymin, xmax, ymax])

        # convert everything into torch.Tensors in the format the detection model expects

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.ones((num_targets,), dtype=torch.int64)
        image_id = torch.tensor([item])
        areas = []
        for i in range(num_targets):
            areas.append(targets[i]["area"])
        areas = torch.as_tensor(areas, dtype=torch.float32)
        iscrowd = torch.zeros((num_targets,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = areas
        target["iscrowd"] = iscrowd

        image = F.to_tensor(pil_image)

        return image, target
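
For completeness, this is roughly how the dataset gets wired into a DataLoader; the tuple-style collate is the one the tutorial's helper utilities use, and the paths here are placeholders (assuming CustomCocoDataset passes root/annFile straight through to torchvision's CocoDetection):

    from torch.utils.data import DataLoader

    def collate_fn(batch):
        # detection targets vary in size per image, so keep images and targets
        # as lists instead of stacking them into a single batch tensor
        return tuple(zip(*batch))

    dataset = CustomCocoDataset(root="path/to/images",               # placeholder
                                annFile="path/to/annotations.json")  # placeholder
    data_loader = DataLoader(dataset, batch_size=2, shuffle=False,
                             collate_fn=collate_fn)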

And here is what I do to render the boxes, labels, and scores:

    @torch.no_grad()
    def exec_evaluate(data_conf, model_conf, model, data_loader, device):
        print("beginning actual eval")
        model.eval()

        print("entering image target loop")
        for images, targets in data_loader:
            images = list(img.to(device) for img in images)

            outputs = model(images)
            print("received outputs")
            # move predictions back to the CPU so they can be drawn/inspected
            outputs = [{k: v.to("cpu") for k, v in t.items()} for t in outputs]

            if model_conf["hyperParameters"]["testing"]["enable_visualization"]:
                visualize_result(data_conf=data_conf,
                                 model_conf=model_conf,
                                 images=images,
                                 predictions=outputs)

        print("exited image loop")

Inside visualize_result:

    def visualize_result(data_conf, model_conf, images, predictions):

        assert len(images) == len(predictions)

        for i in range(len(images)):
            print("image tensor size is :" + str(images[i].size()))
            binary_image = transforms.ToPILImage()(images[i].cpu()).convert("RGB")
            tagged_binary_image = vis_detections(binary_image, data_conf["classes_available"], predictions[i])
            # output tagged image to disk

            print("done with tagged image")


    def vis_detections(im, class_names, predictions, thresh=0.8):
        """Visual debugging of detections."""
        print(str(predictions))

        bboxes = predictions["boxes"]
        labels = predictions["labels"]
        scores = predictions["scores"]

        assert len(bboxes) == len(labels) == len(scores)

        draw = ImageDraw.Draw(im)
        for index, box in enumerate(bboxes.numpy()):
            print("assessing score " + str(scores[index]))
            if scores[index] > thresh:
                draw.rectangle(xy=box.tolist(), outline="yellow", width=1)
                # look up the class name by predicted label rather than by box index
                draw.text(xy=[box[0], box[1] + 15],
                          text=class_names[int(labels[index])] + "@" + str(float(scores[index])),
                          fill="blue")
        im.show()
        return im

Training was performed on 4K images, and the test data at inference can range anywhere from 4K down to 1080p, but my first inference run shows two very tiny boxes.

Example:

    image tensor size is :torch.Size([3, 1080, 1920])
    {'boxes': tensor([[128.3801, 203.8880, 215.1749, 215.0131],
            [148.5580, 205.5074, 194.2489, 213.1967]]), 'labels': tensor([1, 1]), 'scores': tensor([0.9231, 0.9231])}