Multiprocessing inference uses extra memory on gpu0

I use 2 GPUs for multi-process inference. I load the models in the main process and move them to different GPUs in the sub-processes. However, I find that the processes running on gpu1 always occupy 153 MB of memory on gpu0.

    import torch
    import torch.multiprocessing as multiprocessing

    multiprocessing.set_sharing_strategy('file_system')

    ...

    def main(self):
        ...

        # Build the models once in the main process.
        self.create_model()

        multiprocessing.set_start_method('spawn', force=True)
        print('>>> multi-processes running <<<')

        # One task per video sequence; each worker also receives the GPU count
        # so it can pick its own device.
        param_list = [([video_idx, video_name, im_list, gt_list, lang, len(seqs)],
                       self.tester_cfg.num_gpu)
                      for video_idx, (video_name, im_list, gt_list, lang) in enumerate(seqs)]

        with multiprocessing.Pool(processes=self.tester_cfg.num_process) as pool:
            pool.starmap(self.sub, param_list)


    def sub(self, seq_info, num_gpu):
        video_idx, video_name, im_list, gt_list, lang, seq_num = seq_info

        # Derive a zero-based worker id from the pool worker's process name
        # (e.g. "SpawnPoolWorker-3" -> 2) and map it to a GPU round-robin.
        worker_name = multiprocessing.current_process().name
        worker_id = int(worker_name.split('-')[1]) - 1
        gpu_id = worker_id % num_gpu
        torch.cuda.set_device(gpu_id)
        rank = worker_id
        print('start rank {} on gpu {}'.format(rank, gpu_id))

        # Move the shared model onto this worker's GPU.
        self.model_to_device()

        ...
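
For context, the worker-to-GPU assignment above relies only on the pool worker's process name. Below is a minimal, self-contained sketch of just that mapping, stripped of my pipeline. It assumes Pool's default worker naming under the spawn start method ("SpawnPoolWorker-N") and at least two visible GPUs; the `show_device` helper is hypothetical and only illustrates the round-robin device selection.

    import torch
    import torch.multiprocessing as multiprocessing

    def show_device(task_idx, num_gpu):
        # Pool workers are named like "SpawnPoolWorker-1", "SpawnPoolWorker-2", ...
        worker_name = multiprocessing.current_process().name
        worker_id = int(worker_name.split('-')[1]) - 1
        gpu_id = worker_id % num_gpu      # round-robin over the available GPUs
        torch.cuda.set_device(gpu_id)     # bind this worker to its GPU
        return task_idx, worker_name, gpu_id

    if __name__ == '__main__':
        multiprocessing.set_start_method('spawn', force=True)
        num_gpu = 2
        tasks = [(i, num_gpu) for i in range(8)]
        with multiprocessing.Pool(processes=4) as pool:
            for task_idx, name, gpu_id in pool.starmap(show_device, tasks):
                print('task {} ran in {} on gpu {}'.format(task_idx, name, gpu_id))

With 4 workers and 2 GPUs, workers 0 and 2 should bind to gpu0 and workers 1 and 3 to gpu1, which matches the assignment I expect in `sub`.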