Conflicts between torch-1.10.2+cu113 and python multiprocessing

import os
import time
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.nn as nn

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'  # expose all four GPUs to this process and its workers

class Model(nn.Module):

    def __init__(self, params=None):
        super().__init__()
        input_shape = params.get('input_size')
        output_shape = params.get('output_size')
        self.gru = nn.GRU(input_shape, output_shape, batch_first=True)  # the RNN layer that triggers the issue

    def forward(self, x):
        output, h_n = self.gru(x)  # nn.GRU returns (output, final hidden state)
        return output

# sub-process job: build the model on the given GPU and run repeated forward passes
def single_job(cuda_id, params):
    device = torch.device(cuda_id)
    model = Model(params).to(device)
    for _ in range(1000):
        X = torch.rand(4096, 16, 64, device=device)
        y_pred = model(X)  # the GRU forward pass after which the extra GPU-0 processes appear
        time.sleep(1)

# main process: fan the jobs out to two worker processes
if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=2) as executor:
        for i in [1, 2]:
            cuda_id = f'cuda:{i}'
            params = {'input_size': 64, 'output_size': 1}
            sub = executor.submit(single_job, cuda_id, params)

However, when it runs, the GPU usage becomes odd, as shown below.

At the beginning, the GPU usage reported by nvidia-smi is normal: GPU 1 and GPU 2 are each occupied by one worker process.

But after a few seconds, two additional processes appear and unexpectedly occupy GPU 0.
The problem only occurs when the forward pass contains a GRU layer (or another RNN layer), as marked in the source code above; see the control sketch below.
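
For comparison, a minimal control (my own sketch, not part of the original reproducer; the LinearModel name is arbitrary) keeps the same driver code but swaps the GRU for a plain Linear layer inside single_job. Per the observation above, this variant should show only the two expected processes on GPU 1 and GPU 2.

# Control model: identical driver code, but a Linear layer instead of the GRU.
# Per the observation above, this variant does not spawn the extra GPU-0 processes.
class LinearModel(nn.Module):

    def __init__(self, params=None):
        super().__init__()
        self.fc = nn.Linear(params.get('input_size'), params.get('output_size'))

    def forward(self, x):
        return self.fc(x)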

The same code runs correctly with torch-1.7 and the corresponding CUDA version.
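
A workaround sometimes suggested for phantom contexts on GPU 0 is to restrict each worker to a single visible device before its first CUDA call, so that no context can be created on physical GPU 0. The sketch below assumes CUDA is never initialized in the parent before the fork; the single_job_isolated name is mine, and I have not verified this against torch-1.10.2+cu113.

# Workaround sketch (untested with torch-1.10.2+cu113): each worker sees only its
# own physical GPU, so no CUDA context can land on physical GPU 0.
def single_job_isolated(physical_gpu_id, params):
    # Must run before the first CUDA call in this process; with the default fork
    # start method the parent has not touched CUDA yet, so the setting takes effect.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(physical_gpu_id)
    device = torch.device('cuda:0')  # index 0 now maps to the chosen physical GPU
    model = Model(params).to(device)
    for _ in range(1000):
        X = torch.rand(4096, 16, 64, device=device)
        y_pred = model(X)
        time.sleep(1)

# usage: executor.submit(single_job_isolated, 1, params) and executor.submit(single_job_isolated, 2, params)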