I am trying to train a Faster RCNN Network on a custom dataset consisting of images for object detection. However, I don’t want to directly give an RGB image as input, I actually need to pass it through another network (a feature extractor) along with the corresponding thermal image and give the extracted features as the input to the FRCNN Network. The feature extractor combines these two images into a 4 channel tensor and the output is a 5 channel tensor. It is this 5 channel tensor that I wish to give as input to the Faster RCNN Network.
I followed the PyTorch docs for Object Detection Finetuning (link here) and came up with the following code to suit my dataset.
class CustomDataset(torch.utils.data.Dataset):
def __getitem__(self, idx):
self.num_classes = 5
img_rgb_path = os.path.join(self.root, "rgb/", self.rgb_imgs[idx])
img_thermal_path = os.path.join(self.root, "thermal/", self.thermal_imgs[idx])
img_rgb = Image.open(img_rgb_path)
img_rgb = np.array(img_rgb)
x_rgb = TF.to_tensor(img_rgb)
x_rgb.unsqueeze_(0)
img_thermal = Image.open(img_thermal_path)
img_thermal = np.array(img_thermal)
img_thermal = np.expand_dims(img_thermal,-1)
x_th = TF.to_tensor(img_thermal)
x_th.unsqueeze_(0)
print(x_rgb.shape) # shape of [3,640,512]
print(x_th.shape) # shape of [1,640,512]
input = torch.cat((x_rgb,x_th),dim=1) # shape of [4,640,512]
img = self.feature_extractor(input) # My custom feature extractor which returns a 5 dimensional tensor
print(img.shape) # shape of [5,640,512]
filename = os.path.join(self.root,'annotations',self.annotations[idx])
tree = ET.parse(filename)
objs = tree.findall('object')
num_objs = len(objs)
boxes = np.zeros((num_objs, 4), dtype=np.uint16)
labels = np.zeros((num_objs), dtype=np.float32)
seg_areas = np.zeros((num_objs), dtype=np.float32)
boxes = []
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
x1 = float(bbox.find('xmin').text)
y1 = float(bbox.find('ymin').text)
x2 = float(bbox.find('xmax').text)
y2 = float(bbox.find('ymax').text)
cls = self._class_to_ind[obj.find('name').text.lower().strip()]
boxes.append([x1, y1, x2, y2])
labels[ix] = cls
seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
boxes = torch.as_tensor(boxes, dtype=torch.float32)
seg_areas = torch.as_tensor(seg_areas, dtype=torch.float32)
labels = torch.as_tensor(labels, dtype=torch.float32)
target = {'boxes': boxes,
'labels': labels,
'seg_areas': seg_areas,
}
return img,target
My main function code is as follows
import utils
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
backbone.out_channels = 1280
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
aspect_ratios=((0.5, 1.0, 2.0),))
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
output_size=7,
sampling_ratio=2)
num_classes = 5
model = FasterRCNN(backbone = backbone,num_classes=5,rpn_anchor_generator=anchor_generator,box_roi_pool=roi_pooler)
dataset = CustomDataset('train_folder/')
data_loader_train = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True,collate_fn=utils.collate_fn)
train_model(model, criterion, data_loader_train, num_epochs=10)
The collate_fn defined in the utils.py file is the following
def collate_fn(batch):
return tuple(zip(*batch))
I, however, get the following error while training
Traceback (most recent call last):
File "train.py", line 147, in <module>
train_model(model, criterion, data_loader_train, num_epochs)
File "train.py", line 58, in train_model
outputs = model(inputs, labels)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/generalized_rcnn.py", line 66, in forward
images, targets = self.transform(images, targets)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/transform.py", line 46, in forward
image = self.normalize(image)
File "/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/transform.py", line 66, in normalize
return (image - mean[:, None, None]) / std[:, None, None]
RuntimeError: The size of tensor a (5) must match the size of tensor b (3) at non-singleton dimension 0
Can someone please help me out? I am a newbie in Pytorch