Why does cudaDeviceCanAccessPeer cost so much memory?

I used to use torch.cuda.can_device_access_peer to test p2p access, and I found that it consumes a lot of memory when run across multiple processes.
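
For reference, this is roughly what each worker process was doing (a hypothetical sketch around torch.cuda.can_device_access_peer, not my exact script; the rank would normally come from the launcher):

import torch

# Hypothetical per-process check: ask whether this rank's GPU can reach
# every other visible GPU via p2p.
rank = 0  # normally taken from the RANK environment variable
for peer in range(torch.cuda.device_count()):
    if peer != rank:
        print(rank, peer, torch.cuda.can_device_access_peer(rank, peer))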

Then I figured out that this is not a PyTorch problem:

def main(env):
    import ctypes
    libcudart = ctypes.CDLL('libcudart.so')
    libcudart.cudaDeviceCanAccessPeer.restype = ctypes.c_int
    libcudart.cudaDeviceCanAccessPeer.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int]

    libcudart.cudaMemGetInfo.restype = ctypes.c_int
    libcudart.cudaMemGetInfo.argtypes = [ctypes.POINTER(ctypes.c_size_t), ctypes.POINTER(ctypes.c_size_t)]

    libcudart.cudaRuntimeGetVersion.restype = ctypes.c_int
    libcudart.cudaRuntimeGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]

    import os
    for k, v in env.items():
        os.environ[k] = v
    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    def mem_get_info():
        free_mem = ctypes.c_size_t()
        total_mem = ctypes.c_size_t()
        result = libcudart.cudaMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))
        assert result == 0
        return free_mem.value, total_mem.value

    def detect_p2p(i, j):
        canAccessPeer = ctypes.c_int()
        device = ctypes.c_int(i)  # device whose p2p capability is queried
        peerDevice = ctypes.c_int(j)  # peer device it would access
        result = libcudart.cudaDeviceCanAccessPeer(ctypes.byref(canAccessPeer), device, peerDevice)
        assert result == 0
        return canAccessPeer.value

    def get_version():
        version = ctypes.c_int()
        result = libcudart.cudaRuntimeGetVersion(ctypes.byref(version))
        assert result == 0
        version = version.value

        return f"{version // 1000}.{(version % 1000) // 10}"

    def report_cuda_memory(msg=""):
        free, total = mem_get_info()
        used = total - free
        print(f"{rank=} {msg}: Total memory: {total / 1024 ** 3} GB; Used memory: {used / 1024 ** 3} GB")

    print(f"version={get_version()}")

    for i in range(world_size):
        if i == rank:
            continue
        report_cuda_memory(f"before test {(rank, i)}")
        ans = detect_p2p(rank, i)
        print(f"p2p access: {ans}")
        report_cuda_memory(f"after test {(rank, i)}")

if __name__ == "__main__":
    from multiprocessing import Process
    ps = []
    n = 4
    for i in range(n):
        p = Process(target=main, args=({"WORLD_SIZE": f"{n}", "RANK": f"{i}"},))
        ps.append(p)
    for p in ps:
        p.start()
    for p in ps:
        p.join()
version=12.4
version=12.4
version=12.4
version=12.4
rank=0 before test (0, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=0 after test (0, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=0 before test (0, 2): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=0 after test (0, 2): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=0 before test (0, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=0 after test (0, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=2 before test (2, 0): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=2 after test (2, 0): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=2 before test (2, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=2 after test (2, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=2 before test (2, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=2 after test (2, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=1 before test (1, 0): Total memory: 31.7325439453125 GB; Used memory: 0.95281982421875 GB
p2p access: 1
rank=1 after test (1, 0): Total memory: 31.7325439453125 GB; Used memory: 0.95477294921875 GB
rank=1 before test (1, 2): Total memory: 31.7325439453125 GB; Used memory: 0.95477294921875 GB
p2p access: 1
rank=1 after test (1, 2): Total memory: 31.7325439453125 GB; Used memory: 0.95477294921875 GB
rank=1 before test (1, 3): Total memory: 31.7325439453125 GB; Used memory: 0.95672607421875 GB
p2p access: 1
rank=1 after test (1, 3): Total memory: 31.7325439453125 GB; Used memory: 0.95672607421875 GB
rank=3 before test (3, 0): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
p2p access: 1
rank=3 after test (3, 0): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
rank=3 before test (3, 1): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
p2p access: 1
rank=3 after test (3, 1): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
rank=3 before test (3, 2): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
p2p access: 1
rank=3 after test (3, 2): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB

This only happens when I have multiple processes. When I test p2p access from a single process, memory usage does not increase.

You are creating multiple CUDA contexts on the default device:

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A    672296      C   python                                      306MiB |
|    0   N/A  N/A    672297      C   python                                      306MiB |
|    0   N/A  N/A    672298      C   python                                      306MiB |
|    0   N/A  N/A    672299      C   python                                      306MiB |
+---------------------------------------------------------------------------------------+

thus increasing its memory usage.
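
If the intent is for each rank to drive its own GPU, one option (a minimal sketch against the same libcudart.so used above, assuming ranks map one-to-one to device indices) is to select the device before any other runtime call. cudaSetDevice only records the selection; the context is then created lazily on that device by the first runtime call such as cudaMemGetInfo, instead of on device 0:

import ctypes

libcudart = ctypes.CDLL('libcudart.so')
libcudart.cudaSetDevice.restype = ctypes.c_int
libcudart.cudaSetDevice.argtypes = [ctypes.c_int]

def bind_to_device(rank):
    # Select this rank's GPU; the CUDA context will then be created here
    # (lazily, by the next runtime call) rather than on device 0.
    result = libcudart.cudaSetDevice(rank)
    assert result == 0

With that, nvidia-smi should show one context per GPU rather than four contexts stacked on GPU 0.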

Is this the expected behavior? And why does memory usage not increase when I have a single process testing p2p access across all GPUs?

Yes, it’s expected behavior since you are initializing a CUDA context in each process via:

result = libcudart.cudaMemGetInfo(ctypes.byref(free_mem), ctypes.byref(total_mem))

so it is the memory reporting in your test, not cudaDeviceCanAccessPeer itself, that creates the context and the corresponding memory usage.
If I remove the mem_get_info calls, I see:

Mon Apr 15 17:17:50 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   36C    P0    44W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   36C    P0    45W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   36C    P0    44W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-SXM2...  On   | 00000000:0B:00.0 Off |                    0 |
| N/A   33C    P0    45W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   4  Tesla V100-SXM2...  On   | 00000000:85:00.0 Off |                    0 |
| N/A   36C    P0    43W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   5  Tesla V100-SXM2...  On   | 00000000:86:00.0 Off |                    0 |
| N/A   36C    P0    41W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   6  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |
| N/A   37C    P0    45W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   7  Tesla V100-SXM2...  On   | 00000000:8A:00.0 Off |                    0 |
| N/A   37C    P0    42W / 300W |      3MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
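
As a side note, if you want to keep the per-GPU memory reporting without creating a CUDA context at all, NVML can provide it. A minimal ctypes sketch (assuming libnvidia-ml.so.1 is loadable; NVML talks to the driver directly and does not create a CUDA context):

import ctypes

class nvmlMemory_t(ctypes.Structure):
    # Matches NVML's nvmlMemory_t: total/free/used in bytes.
    _fields_ = [("total", ctypes.c_ulonglong),
                ("free", ctypes.c_ulonglong),
                ("used", ctypes.c_ulonglong)]

nvml = ctypes.CDLL('libnvidia-ml.so.1')

def nvml_mem_get_info(index):
    # Initialize NVML (cheap, context-free) and query one device.
    assert nvml.nvmlInit_v2() == 0
    handle = ctypes.c_void_p()
    assert nvml.nvmlDeviceGetHandleByIndex_v2(ctypes.c_uint(index), ctypes.byref(handle)) == 0
    mem = nvmlMemory_t()
    assert nvml.nvmlDeviceGetMemoryInfo(handle, ctypes.byref(mem)) == 0
    nvml.nvmlShutdown()
    return mem.free, mem.used, mem.total

if __name__ == "__main__":
    free, used, total = nvml_mem_get_info(0)
    print(f"GPU 0: used {used / 1024 ** 3:.2f} GB of {total / 1024 ** 3:.2f} GB")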