Region proposal network and RoIAlign

Hi, I hope someone can help me. I want to get the top 10 proposals from the pretrained RPN provided with the torchvision detection models, and then use the bounding boxes of those proposals, as well as the combined boxes (of two proposals), as input to RoIAlign. I think something is going wrong, because my model only reaches 10% accuracy. Maybe the coordinate format is not correct, or maybe my way of doing this is not efficient. I am sharing the code because I have looked through all the tutorials and have not found any useful information. Thank you in advance. This is my code:

import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.models as tmodels

class Proposals(nn.Module):
    def __init__(self):
        super(Proposals, self).__init__()
        resnet = tmodels.resnet50(pretrained=True)  # backbone
        modules = list(resnet.children())[:-1]      # drop the last fc layer
        self.feature_Extraction = nn.Sequential(*modules)
        # pretrained Faster R-CNN, used only as the source of the pretrained RPN
        self.model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    def combine_box(self, box1, box2):
        # combine two proposal boxes: min of the top-left corner, max of the bottom-right corner
        return np.hstack((np.minimum(box1[:2], box2[:2]), np.maximum(box1[2:], box2[2:])))
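        # e.g. combine_box([10, 20, 50, 60], [30, 10, 70, 40]) -> [10, 10, 70, 60],
        # i.e. the smallest box that covers both proposals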

    def forward(self, x_3d):
        # x_3d is (batch_size, number of frames, channels, w, h)
        device = torch.device("cuda:0")

        model = self.model.eval()  # Faster R-CNN (for its RPN)
        batches_proposals = []
        for batch_idx in range(x_3d.size(0)):  # loop over batches
            frames_proposals = []
            for iy in range(x_3d.size(1)):  # loop over frames
                new_tensor = x_3d[batch_idx, iy, :, :, :]

                with torch.no_grad():
                    outputs = []
                    features = self.feature_Extraction(new_tensor.unsqueeze(0))  # backbone features for this frame

                    # hook the RPN of the Faster R-CNN to capture its proposals
                    hook = model.rpn.register_forward_hook(
                        lambda module, inp, out: outputs.append(out))
                    res = model(new_tensor.unsqueeze(0))
                    hook.remove()
                    boxes = outputs[0][0][0][0:10]  # top 10 proposal boxes, shape (N, 4), for this frame

                    # clip the proposals so they stay inside the (224, 224) image
                    boxes[:, 0::2] = boxes[:, 0::2].clamp(min=0, max=224)
                    boxes[:, 1::2] = boxes[:, 1::2].clamp(min=0, max=224)
                    boxes = boxes.tolist()
                    roialign = []    # boxes as (x1, y1, x2, y2)
                    roialign_y = []  # boxes as (batch_index, x1, y1, x2, y2)
                    for i in range(len(boxes)):
                        x1, y1, x2, y2 = int(boxes[i][0]), int(boxes[i][1]), int(boxes[i][2]), int(boxes[i][3])
                        a = [x1, y1, x2, y2]  # box as (x1, y1, x2, y2)
                        # prepend the batch index to get a row of size 5; since one image is fed
                        # at a time, the batch index is always 0
                        y = [0] + a
                        y = torch.FloatTensor(y)
                        roialign_y.append(y)

                        a = torch.FloatTensor(a)
                        roialign.append(a)
                    all_nodes = []
                    new_list = []

                    # combine every pair of proposal boxes
                    for idxi, i in enumerate(roialign):
                        for idxj, j in enumerate(roialign):
                            if idxi != idxj:
                                pair = self.combine_box(i, j)
                                pair_y = pair.tolist()
                                pair_y = [0] + pair_y  # prepend the batch index
                                pair = torch.FloatTensor(pair_y)
                                all_nodes.append(pair)

                    # single proposals followed by combined proposals, each as (batch_index, x1, y1, x2, y2)
                    new_list.extend(roialign_y)
                    new_list.extend(all_nodes)
                    roialign_t = torch.stack(new_list, dim=0)
                    roialign_t = roialign_t.type(torch.float32)
                    roialign_t = roialign_t.cuda()
                    all_proposals = []
                    features = features.cuda()  # backbone features of the frame

                    # I set spatial_scale to 0.004 because the frame features are 2048*1*1 and the
                    # image size is 224*224, so 1/224 = 0.004 (maybe not correct?)
                    m = torchvision.ops.roi_align(features, roialign_t, (7, 7),
                                                  spatial_scale=0.004, sampling_ratio=-1)

                    all_proposals.append(m)  # features of all proposals in this frame
                    tensor_proposals = torch.stack(all_proposals, dim=0)  # (1, N, c, 7, 7) per frame; c=2048, N=number of proposals
                    tensor_proposals = tensor_proposals.squeeze(0)
                    maxp = nn.MaxPool2d(7, 7)
                    maxpooled = maxp(tensor_proposals)  # max pooling over the 7x7 grid
                    frames_proposals.append(maxpooled)
            frames_all_proposals = torch.stack(frames_proposals, dim=0)  # (number of frames, N, c, 1, 1)
            batches_proposals.append(frames_all_proposals)
        batches_all_y = torch.stack(batches_proposals, dim=0)  # (batch_size, number of frames, N, c, 1, 1)

        return batches_all_y
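
For reference, this is the box format and spatial_scale I assumed for roi_align, checked in isolation on dummy data (the 14x14 feature map and 256 channels here are made-up numbers just for this check, not from my actual model), so maybe this is where I go wrong:

# standalone check of the roi_align input format (my understanding, maybe wrong):
# boxes are (x1, y1, x2, y2) in image coordinates, either a list of (K, 4) tensors
# (one per image) or a single (K, 5) tensor whose first column is the batch index;
# spatial_scale maps image coordinates onto the feature map.
import torch
import torchvision

image_size = 224
feat = torch.randn(1, 256, 14, 14)  # dummy feature map: 14x14 for a 224x224 image
boxes = torch.tensor([[0., 10., 20., 100., 150.],   # (batch_index, x1, y1, x2, y2)
                      [0., 50., 60.,  80., 120.]])
out = torchvision.ops.roi_align(feat, boxes, output_size=(7, 7),
                                spatial_scale=14 / image_size,  # feature map size / image size
                                sampling_ratio=-1)
print(out.shape)  # torch.Size([2, 256, 7, 7])

If the (batch_index, x1, y1, x2, y2) rows or the way I derived spatial_scale above are wrong, that is probably where my mistake is.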