How do I avoid downsampling with Faster RCNN + ResNet backbone?

I was previously running Faster R-CNN through a project that did not use torchvision, but now I want to try porting it not only to torchvision but also to PyTorch 1.5.

So far I can successfully train a Faster R-CNN model with a ResNet-101 backbone, but during training I only use about 3.4 GB of the 6 GB of GPU VRAM. My images are over 4K in resolution, so I would guess this is an indicator of downsampling, yes? I have instantiated the ResNet-101 backbone with only pretrained=True, like so:

import torchvision
from torch import nn

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator


class ModelResnet101FasterRCNN(FasterRCNN):
    def __init__(self, data_conf, model_conf):
        # load a ResNet-101 classifier pre-trained on ImageNet
        backbone_nn = torchvision.models.resnet101(pretrained=True)

        modules = list(backbone_nn.children())[:-1]  # drop only the final fc layer (avgpool is still included)
        backbone_nn = nn.Sequential(*modules)
        for param in backbone_nn.parameters():
            param.requires_grad = False

        # FasterRCNN needs to know the number of
        # output channels in a backbone. For resnet101, it's 2048
        backbone_nn.out_channels = 2048

        # the RPN generates len(sizes) x len(ratios) anchors per spatial
        # location; the sizes and aspect ratios come from the config.
        # We pass a Tuple[Tuple[int]] because each feature map could
        # potentially have different sizes and aspect ratios

        anchor_ratios = model_conf["hyperParameters"]["anchor_ratios"]
        anchor_sizes = model_conf["hyperParameters"]["anchor_scales"]

        anchor_generator = AnchorGenerator(sizes=(anchor_sizes,),
                                           aspect_ratios=(anchor_ratios,))

        # let's define what are the feature maps that we will
        # use to perform the region of interest cropping, as well as
        # the size of the crop after rescaling.
        # if your backbone returns a Tensor, featmap_names is expected to
        # be ['0']. More generally, the backbone should return an
        # OrderedDict[Tensor], and in featmap_names you can choose which
        # feature maps to use.

        rpn_pooling_size = model_conf["hyperParameters"]["pooling_size"]

        roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=["0"],
                                                        output_size=rpn_pooling_size,
                                                        sampling_ratio=2)

        # put the pieces together inside a FasterRCNN model
        super().__init__(backbone=backbone_nn,
                         num_classes=len(data_conf["classes_available"]),
                         image_mean=model_conf["hyperParameters"]["normalization_mean"],
                         image_std=model_conf["hyperParameters"]["normalization_std"],
                         rpn_anchor_generator=anchor_generator,
                         box_roi_pool=roi_pooler,
                         rpn_pre_nms_top_n_train=model_conf["hyperParameters"]["rpn_pre_nms_top_n_train"],
                         rpn_post_nms_top_n_train=model_conf["hyperParameters"]["rpn_post_nms_top_n_train"],
                         rpn_nms_thresh=model_conf["hyperParameters"]["rpn_nms_thresh"],
                         max_size=model_conf["hyperParameters"]["max_size_image"])

nvidia-smi output during training:

Mon May  4 23:14:35 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64       Driver Version: 440.64       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 106...  Off  | 00000000:01:00.0 Off |                  N/A |
| 31%   60C    P2    73W / 200W |   3835MiB /  6078MiB |     64%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0    659060      C   ...p/anaconda3/envs/pytorch_150/bin/python  3825MiB |
+-----------------------------------------------------------------------------+
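
To sanity-check the resizing, I looked at the transform that torchvision's FasterRCNN applies before the backbone ever sees an image: a GeneralizedRCNNTransform that rescales each image so its shorter side is at least min_size and its longer side is at most max_size (defaults 800 and 1333). If I understand it correctly, that is where the 4K inputs get shrunk, which would explain the unused VRAM. A small diagnostic sketch (using the stock detector only to illustrate; the same transform object sits on my custom model):

import torch
import torchvision

# stock detector, just to inspect the built-in transform (pretrained=False to skip the download)
m = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
print(m.transform.min_size, m.transform.max_size)  # (800,) 1333 by default

m.eval()
with torch.no_grad():
    # feed one dummy "4K" image through just the transform
    images, _ = m.transform([torch.rand(3, 2160, 4096)])
print(images.tensors.shape)  # longer side is capped at max_size, so the 4K image is downscaled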

Looking at the torchvision ResNet implementation, it seems I need to override the Bottleneck block somehow?

class Bottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
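
From what I can tell, though, the "downsampling" in Bottleneck is the stride inside the residual stages (the backbone's output stride), not the image resizing done by the detection transform, so I probably do not need to touch this class at all. If I did want a finer feature map, the resnet constructors accept a replace_stride_with_dilation argument that swaps the stride in later stages for dilation; a sketch of what I think that would look like (my assumption, not something I have trained yet):

import torchvision
from torch import nn

# keep layer4 at the same resolution as layer3 by replacing its stride with dilation
backbone_nn = torchvision.models.resnet101(
    pretrained=True,
    replace_stride_with_dilation=[False, False, True],  # flags apply to layer2, layer3, layer4
)

# drop avgpool and fc so the backbone emits a 4D [N, 2048, H, W] feature map
backbone_nn = nn.Sequential(*list(backbone_nn.children())[:-2])
backbone_nn.out_channels = 2048  # FasterRCNN still needs this attribute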

I have now figured out what was going on.

I have been freezing the pretrained ResNet model, and I do not want to do that, so I have tried to make it switchable when instantiating the model object, but I am having trouble.

I am looking at the transfer learning tutorial for help, but my shapes are not quite working out yet:

found 5 categories in data
Creating model backbone with wide_resnet101_2
Finetuning your backbone... in_features = 2048
/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/functional.py:2854: UserWarning: The default behavior for interpolate/upsample with float scale_factor will change in 1.6.0 to align with other frameworks/libraries, and use scale_factor directly, instead of relying on the computed output size. If you wish to keep the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details. 
  warnings.warn("The default behavior for interpolate/upsample with float scale_factor will change "
Traceback (most recent call last):
  File "~/model_components/training.py", line 124, in <module>
    train(data_conf=config_json, model_conf=model_conf)
  File "~/model_components/training.py", line 88, in train
    print_freq=model_conf["hyperParameters"]["display_interval"])
  File "~/model_components/references/detection/engine.py", line 33, in train_one_epoch
    loss_dict = model(images, targets)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torchvision/models/detection/generalized_rcnn.py", line 70, in forward
    proposals, proposal_losses = self.rpn(images, features, targets)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torchvision/models/detection/rpn.py", line 475, in forward
    objectness, pred_bbox_deltas = self.head(features)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torchvision/models/detection/rpn.py", line 207, in forward
    t = F.relu(self.conv(feature))
  File "/home/emcp/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "~/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 349, in forward
    return self._conv_forward(input, self.weight)
  File "~/anaconda3/envs/pytorch_150/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 346, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [5, 5, 3, 3], but got 2-dimensional input of size [1, 5] instead

Process finished with exit code 1
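
If I read the traceback right, the RPN head is being given a 2-dimensional tensor of size [1, 5] instead of a 4-dimensional feature map, which would mean my "finetuning" branch still sends the images through avgpool and the new 5-class fc layer. A quick way to check this, I think (the 800x800 dummy input is arbitrary):

import torch
import torchvision
from torch import nn

resnet = torchvision.models.wide_resnet101_2(pretrained=False)

full = resnet                                        # still ends in avgpool + fc
body = nn.Sequential(*list(resnet.children())[:-2])  # conv stages only

with torch.no_grad():
    x = torch.randn(1, 3, 800, 800)
    print(full(x).shape)  # torch.Size([1, 1000]) -- 2D; with my 5-class fc this is the [1, 5] from the traceback
    print(body(x).shape)  # torch.Size([1, 2048, 25, 25]) -- the 4D feature map the RPN head expects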

I see the transfer learning tutorial uses nn.CrossEntropyLoss(). Do I need that? I am a bit lost on how to get away from the frozen gradients. Here is my current attempt:

import torchvision
from torch import nn

from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator


class ModelResnet101FasterRCNN(FasterRCNN):
    def __init__(self, data_conf, model_conf):

        print("Creating model backbone with " + str(model_conf["hyperParameters"]["net"]))
        backbone_nn = torchvision.models.__dict__[model_conf["hyperParameters"]["net"]](pretrained=True)

        if model_conf["hyperParameters"]["freeze_pretrained_gradients"]:
            print("Using backbone as fixed feature extractor")
            modules = list(backbone_nn.children())[:-1]  # drop only the final fc layer (avgpool is still included)
            backbone_nn = nn.Sequential(*modules)

            # FasterRCNN needs to know the number of
            # output channels in a backbone. For resnet101, it's 2048
            for param in backbone_nn.parameters():
                param.requires_grad = False
            backbone_nn.out_channels = 2048
        else:
            print("Finetuning your backbone... in_features = " + str(backbone_nn.fc.in_features))
            num_ftrs = backbone_nn.fc.in_features
            backbone_nn.fc = nn.Linear(in_features=num_ftrs, out_features=len(data_conf["classes_available"]))
            backbone_nn.out_channels = 5
        #

        # the RPN generates len(sizes) x len(ratios) anchors per spatial
        # location; the sizes and aspect ratios come from the config.
        # We pass a Tuple[Tuple[int]] because each feature map could
        # potentially have different sizes and aspect ratios

        anchor_ratios = model_conf["hyperParameters"]["anchor_ratios"]
        anchor_sizes = model_conf["hyperParameters"]["anchor_scales"]

        anchor_generator = AnchorGenerator(sizes=(anchor_sizes,),
                                           aspect_ratios=(anchor_ratios,))

        # let's define what are the feature maps that we will
        # use to perform the region of interest cropping, as well as
        # the size of the crop after rescaling.
        # if your backbone returns a Tensor, featmap_names is expected to
        # be ['0']. More generally, the backbone should return an
        # OrderedDict[Tensor], and in featmap_names you can choose which
        # feature maps to use.

        rpn_pooling_size = model_conf["hyperParameters"]["pooling_size"]

        roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=["0"],
                                                        output_size=rpn_pooling_size,
                                                        sampling_ratio=2)

        # put the pieces together inside a FasterRCNN model
        super().__init__(backbone=backbone_nn,
                         num_classes=len(data_conf["classes_available"]),
                         image_mean=model_conf["hyperParameters"]["normalization_mean"],
                         image_std=model_conf["hyperParameters"]["normalization_std"],
                         rpn_anchor_generator=anchor_generator,
                         box_roi_pool=roi_pooler,
                         rpn_pre_nms_top_n_train=model_conf["hyperParameters"]["rpn_pre_nms_top_n_train"],
                         rpn_post_nms_top_n_train=model_conf["hyperParameters"]["rpn_post_nms_top_n_train"],
                         rpn_nms_thresh=model_conf["hyperParameters"]["rpn_nms_thresh"],
                         min_size=model_conf["hyperParameters"]["min_size_image"],
                         max_size=model_conf["hyperParameters"]["max_size_image"])
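
My current guess at a fix (just a sketch, I have not verified it end to end): the backbone handed to FasterRCNN should always end at the last conv stage and always expose out_channels = 2048; the only thing the freeze flag should change is requires_grad. The per-class box head is built by FasterRCNN itself from num_classes, and during training the model already returns a dict of RPN and RoI losses, so I don't think an explicit nn.CrossEntropyLoss is needed at all. Something like this for the backbone section of __init__:

        backbone_nn = torchvision.models.__dict__[model_conf["hyperParameters"]["net"]](pretrained=True)

        # always drop avgpool and fc so the backbone returns a [N, 2048, H, W] feature map
        backbone_nn = nn.Sequential(*list(backbone_nn.children())[:-2])
        backbone_nn.out_channels = 2048  # 2048 for (wide_)resnet50/101/152, 512 for resnet18/34

        if model_conf["hyperParameters"]["freeze_pretrained_gradients"]:
            print("Using backbone as fixed feature extractor")
            for param in backbone_nn.parameters():
                param.requires_grad = False
        else:
            # fine-tuning: leave requires_grad=True; FasterRCNN adds its own box
            # classifier/regressor for num_classes and returns the loss dict itself
            print("Finetuning the backbone end to end")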