Problems with torch.multiprocessing on CPU

Unfortunately, for quite some time now, I have encountered problems with the module torch.multiprocessing. As a MWE, I am trying to square a PyTorch tensor on CPU, which does not work:

import torch
import numpy as np
import torch.multiprocessing as mp


def square(i, x, queue):
    print('In process {}'.format(i))
    if isinstance(x, np.ndarray):
        queue.put(np.square(x))
    else:
        queue.put(torch.square(x))


if __name__ == '__main__':
    mp.set_start_method('spawn')
    print('Number of available CPUs: {}'.format(mp.cpu_count()))

    processes = []
    queue = mp.Queue()

    # Set up some sample data:
    x = np.arange(64)

    # Start <mp.cpu_count()> processes with the square function as the target and an invidual piece of data to process:
    for i in range(mp.cpu_count()):
        start_index = mp.cpu_count()*i
        proc = mp.Process(target = square, args = (i, x[start_index:start_index+mp.cpu_count()], queue))
        proc.start()
        processes.append(proc)

    # Wait for each process to finish before returning to the main thread:
    for proc in processes:
        proc.join()

    # Terminate each process:
    for proc in processes:
        proc.terminate()

    # Convert the multiprocessing queue into a list:
    results = []
    while not queue.empty():
        results.append(queue.get())

    print(results)

This results in the following error message (Python version 3.8.9, PyTorch version 1.9.0):

Traceback (most recent call last):
  File "parallelization_sampling.py", line 55, in <module>
    results.append(queue.get())
  File "/..../lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "/..../lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
    fd = df.detach()
  File "/..../lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/.../lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().authkey)
  File "/.../lib/python3.8/multiprocessing/connection.py", line 502, in Client
    c = SocketClient(address)
  File "/.../lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
    s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused

However, when I import the module multiprocessing instead of torch.multiprocessing and use x = np.arange(64) instead of x = torch.arange(64), the script works fine! I tested the code on two machines, can anybody confirm this weird behavior? And does anybody see what I am doing wrongly? Or is torch.multiprocessing not supposed to be used on CPU?