I am working on a model that converts a single (mono) frontal-view image into a bird's-eye view.
My plan is to use ResNet50 as a feature extractor for each object, combine those features with the coordinates of the object's bounding box in the frontal view, and have the model output the corresponding bounding-box coordinates in the bird's-eye view. I've built my model using the following code:
class DriverBEV(nn.Module):
    """Estimate object locations in the bird's-eye view (BEV).

    Takes a frontal-view image crop of an object and the object's
    frontal-view bounding-box coordinates, and regresses the bounding-box
    coordinates in the bird's-eye view.

    Architecture:
      * A frozen, pretrained ResNet50 (classification head removed) extracts
        a 2048-dim feature vector from the image.
      * A small MLP (fc1-fc3) encodes the 4 frontal-view box coordinates
        into a 500-dim vector.
      * The concatenated 2548-dim vector is decoded by fc4-fc8 down to the
        4 BEV box coordinates.
    """

    def __init__(self):
        super(DriverBEV, self).__init__()
        # Pretrained ResNet50 as a fixed feature extractor.
        self.resnet50 = models.resnet50(pretrained=True)
        # Drop the final fc layer; keep everything through the global
        # average pool, which yields a (B, 2048, 1, 1) tensor.
        self.resnet50 = nn.Sequential(*list(self.resnet50.children())[:-1])
        # Freeze the backbone: we only train the coordinate MLPs.
        for param in self.resnet50.parameters():
            param.requires_grad = False
        # Encoder MLP for the 4 frontal-view box coordinates -> 500 dims.
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 256)
        self.fc3 = nn.Linear(256, 500)
        # Decoder MLP: 2048 (image features) + 500 (coords) = 2548 in,
        # 4 BEV box coordinates out.
        self.fc4 = nn.Linear(2548, 1024)
        self.fc5 = nn.Linear(1024, 512)
        self.fc6 = nn.Linear(512, 256)
        self.fc7 = nn.Linear(256, 128)
        self.fc8 = nn.Linear(128, 4)
        self.dropout = nn.Dropout(0.25)

    def forward(self, image, coordinates):
        """Regress BEV box coordinates.

        Args:
            image: batch of image tensors, shape (B, 3, H, W) as expected
                by ResNet50 (presumably ImageNet-normalized — confirm
                against the data pipeline).
            coordinates: frontal-view box coordinates, shape (B, 4).

        Returns:
            Tensor of shape (B, 4): predicted BEV box coordinates.
        """
        # Extract image features: (B, 2048, 1, 1) after the avg pool.
        features = self.resnet50(image)
        # BUG FIX: flatten to (B, 2048) before concatenation. The original
        # code concatenated the 4-D (B, 2048, 1, 1) tensor with the 2-D
        # coordinate encoding, which raises a dimension-mismatch error.
        features = torch.flatten(features, 1)
        # Encode the frontal-view coordinates.
        coord_enco = self.dropout(F.relu(self.fc1(coordinates)))
        coord_enco = self.dropout(F.relu(self.fc2(coord_enco)))
        coord_enco = self.dropout(F.relu(self.fc3(coord_enco)))
        # Fuse image features with the coordinate encoding: (B, 2548).
        merge = torch.cat((features, coord_enco), 1)
        # Decode down to the 4 BEV coordinates (no activation on the
        # final layer — raw regression output).
        cood_dec = self.dropout(F.relu(self.fc4(merge)))
        cood_dec = self.dropout(F.relu(self.fc5(cood_dec)))
        cood_dec = self.dropout(F.relu(self.fc6(cood_dec)))
        cood_dec = self.dropout(F.relu(self.fc7(cood_dec)))
        cood_dec = self.fc8(cood_dec)
        return cood_dec
I am not sure whether my model is correct; any suggestions or corrections would be appreciated.