I was previously running Faster R-CNN via a project that didn't use torchvision… however, I want to try porting it not only to torchvision but also to PyTorch 1.5…
So far I can successfully train a Faster R-CNN model coupled to a ResNet-101 backbone… but during training I can see I am not utilizing the full GPU VRAM (6 GB)… only about 3.4 GB. My images are over 4K in size, and I would guess this is an indicator of downsampling… yes? I have instantiated the ResNet-101 backbone with only pretrained=True,
like so…
import torchvision
from torch import nn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor, FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

class ModelResnet101FasterRCNN(FasterRCNN):
    def __init__(self, data_conf, model_conf):
        # load a backbone pre-trained on ImageNet
        backbone_nn = torchvision.models.resnet101(pretrained=True)
        modules = list(backbone_nn.children())[:-1]  # delete the last fc layer
        backbone_nn = nn.Sequential(*modules)
        for param in backbone_nn.parameters():
            param.requires_grad = False

        # FasterRCNN needs to know the number of
        # output channels in a backbone. For resnet101, it's 2048
        backbone_nn.out_channels = 2048

        # let's make the RPN generate 5 x 3 anchors per spatial
        # location, with 5 different sizes and 3 different aspect
        # ratios. We have a Tuple[Tuple[int]] because each feature
        # map could potentially have different sizes and
        # aspect ratios
        anchor_ratios = model_conf["hyperParameters"]["anchor_ratios"]
        anchor_sizes = model_conf["hyperParameters"]["anchor_scales"]
        anchor_generator = AnchorGenerator(sizes=(anchor_sizes,),
                                           aspect_ratios=(anchor_ratios,))

        # let's define which feature maps we will use to perform
        # the region of interest cropping, as well as the size of
        # the crop after rescaling.
        # if your backbone returns a Tensor, featmap_names is expected
        # to be ['0']. More generally, the backbone should return an
        # OrderedDict[Tensor], and in featmap_names you can choose which
        # feature maps to use.
        rpn_pooling_size = model_conf["hyperParameters"]["pooling_size"]
        roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=["0"],
                                                        output_size=rpn_pooling_size,
                                                        sampling_ratio=2)

        # put the pieces together inside a FasterRCNN model
        super().__init__(backbone=backbone_nn,
                         num_classes=len(data_conf["classes_available"]),
                         image_mean=model_conf["hyperParameters"]["normalization_mean"],
                         image_std=model_conf["hyperParameters"]["normalization_std"],
                         rpn_anchor_generator=anchor_generator,
                         box_roi_pool=roi_pooler,
                         rpn_pre_nms_top_n_train=model_conf["hyperParameters"]["rpn_pre_nms_top_n_train"],
                         rpn_post_nms_top_n_train=model_conf["hyperParameters"]["rpn_post_nms_top_n_train"],
                         rpn_nms_thresh=model_conf["hyperParameters"]["rpn_nms_thresh"],
                         max_size=model_conf["hyperParameters"]["max_size_image"])
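If I read the torchvision source correctly, FasterRCNN wraps every input in a GeneralizedRCNNTransform that rescales images so the shorter side becomes min_size (default 800) while the longer side stays under max_size, and since I only pass max_size my 4K images would get downsampled before they ever reach the backbone. A minimal check I have in mind, assuming the same data_conf/model_conf as above:

model = ModelResnet101FasterRCNN(data_conf, model_conf)
# the transform stores the resize bounds it applies to every image;
# min_size defaults to 800 because I never pass it explicitly
print(model.transform.min_size, model.transform.max_size)

If that is the cause, I suppose passing min_size explicitly in the super().__init__ call (say min_size=model_conf["hyperParameters"]["min_size_image"], a key I would have to add to my config) would preserve more resolution and fill more of the VRAM.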
nvidia-smi output:

Mon May  4 23:14:35 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64       Driver Version: 440.64       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 106...  Off  | 00000000:01:00.0 Off |                  N/A |
| 31%   60C    P2    73W / 200W |   3835MiB /  6078MiB |     64%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0    659060      C   ...p/anaconda3/envs/pytorch_150/bin/python  3825MiB |
+-----------------------------------------------------------------------------+
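Also, if I understand PyTorch's memory reporting, nvidia-smi shows everything the process has reserved (CUDA context plus the caching allocator), not just live tensors, so inside the training loop I could query the allocator directly. A small sketch, assuming training runs on cuda:0:

import torch

device = torch.device("cuda:0")
# bytes currently held by tensors vs. the peak since startup;
# nvidia-smi always reports more because of the context and cache
print(torch.cuda.memory_allocated(device) / 1024 ** 2, "MiB allocated")
print(torch.cuda.max_memory_allocated(device) / 1024 ** 2, "MiB peak")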
Looking at the ResNet implementation… it seems I would need to override the Bottleneck area somehow?

class Bottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution (self.conv2)
    # while the original implementation places the stride at the first 1x1 convolution (self.conv1)
    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
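Although, before overriding anything, maybe I don't need to touch Bottleneck directly: torchvision's resnet101 accepts a replace_stride_with_dilation argument that swaps the stride-2 downsampling in the later stages for dilation. A sketch of what I have in mind, if I understand that kwarg correctly (keeping the same fc-stripping as above):

import torchvision
from torch import nn

# one flag per stage (layer2, layer3, layer4); dilating layer4 keeps it at
# stride 1, so the final feature map is twice as large spatially while
# out_channels stays 2048
backbone_nn = torchvision.models.resnet101(
    pretrained=True,
    replace_stride_with_dilation=[False, False, True])
modules = list(backbone_nn.children())[:-1]  # drop the fc layer as before
backbone_nn = nn.Sequential(*modules)
backbone_nn.out_channels = 2048

Would that be the right way to control the downsampling, or is subclassing Bottleneck really necessary?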