Hi, I hope someone helps me. I want to get the top 10 proposals from rpn (pretraind) provided in pytorch models. then i want to use the bounding boxes of the proposals as well as the combined boxes(pf two proposals) in roi align. I think something went wrong because the accuracy of my model is reached only 10%. may be the coordinate format is not correct or may be my way to achieve this task is not efficient. I am sharing the code so I hope i get help as I looked over all tutorial but I have not found any useful info. Thank you in advance. This is my code:
class Proposals(nn.Module):
def __init__(self):
super(Proposals, self).__init__()
resnet = tmodels.resnet50(pretrained=True)#backbone
modules = list(resnet.children())[:-1] # delete the last fc layer.
self.feature_Extraction = nn.Sequential(*modules)
self.model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)#faster_rcnn to get the pretrained rpn from
def combine_box(self,box1, box2):# function to combine two proposals bounding box
return np.hstack((np.minimum(box1[:2], box2[:2]), np.maximum(box1[2:], box2[2:])))
def forward(self, x_3d):
#x_3d is (batch_size,number of frames,channel,w,h)
device = torch.device("cuda:0")
model=self.model.eval()#faster_rcnn
batches_proposals=[]
for batch_idx in range(x_3d.size(0)): #loop over batches
frames_proposals=[]
for iy in range(x_3d.size(1)):
new_tensor=x_3d[batch_yarb,iy, :, :, :]
with torch.no_grad():
outputs=[]
features=self.feature_Extraction(new_tensor.unsqueeze(0))#get features for each frame
hook=model.rpn.register_forward_hook(lambda self, input,output:outputs.append(output))#get rpn from faster_rcnn
)
res=model(torch.as_tensor(new_tensor.unsqueeze(0)))
hook.remove()
boxes=outputs[0][0][0][0:10]# get top 10 propsals bounding boxes size(N,4) for each frame
boxes[:, 0::2] = (boxes[:, 0::2]).clamp(min=0, max=224)# here we clip the proposal to be within image size (224,224)
boxes[:, 1::2] = (boxes[:, 1::2]).clamp(min=0, max=224)
boxes=boxes.tolist()
roialign=[]
roialign_y=[]
for i in range(len(boxes)):
x1, y1, x2, y2=int(boxes[i][0]),int(boxes[i][1]),int(boxes[i][2]),int(boxes[i][3])
a=[x1, y1, x2,y2]# this format needed as input for roialign
y=a
y=[0]+y[0:]# we make of size(N,5) where the first column of 5 is batch index. so since we are feeding one image each time we consider it as 0 all time.
# print(yarb)
y=torch.FloatTensor(y)
roialign_y.append(y)
a=torch.FloatTensor(a)
roialign.append(a)
all_nodes=[]
new_list=[]
for idxi, i in enumerate(roialign):#this loop to combine proposals bounding boxes
for idxj, j in enumerate(roialign):
if idxi!=idxj:
pair=self.combine_box(i,j)
pair_y=pair.tolist()
pair_y=[0]+ pair_y[0:]
pair=torch.FloatTensor(pair_y)
all_nodes.append(pair)
new_list.extend(roialign_y)
new_list.extend(all_nodes)
roialign_t=torch.stack(new_list,dim=0)
roialign_t=roialign_t.type(torch.float32)
roialign_t=roialign_t.cuda()
all_proposals=[]
features=features.cuda()#features of the frame
m=torchvision.ops.roi_align(features, roialign_t, (7,7),spatial_scale=0.004 ,sampling_ratio=-1)# i made spatial_scale as 0.004 because the output features of frames is 2048*1*1 and the image size is 224*224. by divide 1/224=0.004(may be not correct?)
all_proposals.append(m)#list contain the features of all proposals in a frame
tensor_proposals=torch.stack(all_proposals,dim=0)# size(1,N,c,7,7)#this for each frame(c=2048)N=number of proposals
tensor_proposals=tensor_proposals.squeeze(0)
maxp=nn.MaxPool2d(7,7)
maxpooled=maxp(tensor_proposals)#max pooling
frames_proposals.append(maxpooled)
# print('here average',len(frames_proposals))
frames_all_proposals=torch.stack(frames_proposals,dim=0)#(number of frames,N,c,1,1)
batches_proposals.append(frames_all_proposals)
batches_all_y=torch.stack(batches_proposals,dim=0)#((batch_size,number of frames,N,c,1,1))
return batches_all_y