I used to call torch.cuda.can_device_access_peer
to test P2P access, and I found that it consumes a lot of memory when it is called from multiple processes.
I then figured out that this is not a PyTorch problem; the script below reproduces it using only the CUDA runtime:
def main(env):
    """Probe CUDA peer-to-peer access from one rank and report memory usage.

    Loads ``libcudart.so`` through ctypes (PyTorch is not involved), copies
    ``env`` into ``os.environ``, prints the CUDA runtime version, and then,
    for every other rank ``i`` in ``range(WORLD_SIZE)``, calls
    ``cudaDeviceCanAccessPeer(RANK, i)`` and prints the memory usage reported
    by ``cudaMemGetInfo`` immediately before and after the query.

    NOTE(review): no ``cudaSetDevice`` call is made here, so the memory
    figures presumably describe the process's default CUDA device rather than
    device ``RANK`` -- confirm against the CUDA runtime documentation.

    Args:
        env: Mapping of environment variable names to string values. The keys
            ``RANK`` (default 0) and ``WORLD_SIZE`` (default 1) are read back
            from ``os.environ`` after the mapping has been applied.

    Raises:
        OSError: If ``libcudart.so`` cannot be loaded.
        RuntimeError: If a CUDA runtime call returns a non-zero status code.
    """
    import ctypes
    import os

    libcudart = ctypes.CDLL('libcudart.so')
    # Declare the C signatures so ctypes converts arguments and results.
    libcudart.cudaDeviceCanAccessPeer.restype = ctypes.c_int
    libcudart.cudaDeviceCanAccessPeer.argtypes = [
        ctypes.POINTER(ctypes.c_int),
        ctypes.c_int,
        ctypes.c_int,
    ]
    libcudart.cudaMemGetInfo.restype = ctypes.c_int
    libcudart.cudaMemGetInfo.argtypes = [
        ctypes.POINTER(ctypes.c_size_t),
        ctypes.POINTER(ctypes.c_size_t),
    ]
    libcudart.cudaRuntimeGetVersion.restype = ctypes.c_int
    libcudart.cudaRuntimeGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]

    for k, v in env.items():
        os.environ[k] = v
    rank = int(os.environ.get('RANK', 0))
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    def check(status, api_name):
        """Raise RuntimeError when a CUDA runtime call reports a non-zero status."""
        # Raised explicitly rather than asserted: asserts are stripped under
        # ``python -O`` and would let a failed call pass silently.
        if status != 0:
            raise RuntimeError(f"{api_name} failed with CUDA error code {status}")

    def mem_get_info():
        """Return ``(free_bytes, total_bytes)`` as reported by cudaMemGetInfo."""
        free_mem = ctypes.c_size_t()
        total_mem = ctypes.c_size_t()
        status = libcudart.cudaMemGetInfo(
            ctypes.byref(free_mem), ctypes.byref(total_mem)
        )
        check(status, "cudaMemGetInfo")
        return free_mem.value, total_mem.value

    def detect_p2p(i, j):
        """Return the cudaDeviceCanAccessPeer flag for device ``i`` and peer ``j``."""
        can_access_peer = ctypes.c_int()
        # ``argtypes`` already converts the plain ints to C ints.
        status = libcudart.cudaDeviceCanAccessPeer(
            ctypes.byref(can_access_peer), i, j
        )
        check(status, "cudaDeviceCanAccessPeer")
        return can_access_peer.value

    def get_version():
        """Return the CUDA runtime version formatted as ``"major.minor"``."""
        raw_version = ctypes.c_int()
        status = libcudart.cudaRuntimeGetVersion(ctypes.byref(raw_version))
        check(status, "cudaRuntimeGetVersion")
        # The runtime encodes the version as 1000 * major + 10 * minor
        # (e.g. 12040 -> 12.4), so the minor part must be taken from the
        # remainder, not from the whole number.
        major, remainder = divmod(raw_version.value, 1000)
        return f"{major}.{remainder // 10}"

    def report_cuda_memory(msg=""):
        """Print the total and used memory (in GB) tagged with this rank and ``msg``."""
        free, total = mem_get_info()
        used = total - free
        print(f"{rank=} {msg}: Total memory: {total / 1024 ** 3} GB; Used memory: {used / 1024 ** 3} GB")

    print(f"version={get_version()}")
    for i in range(world_size):
        if i == rank:
            # A device is not queried against itself.
            continue
        report_cuda_memory(f"before test {(rank, i)}")
        ans = detect_p2p(rank, i)
        print(f"p2p access: {ans}")
        report_cuda_memory(f"after test {(rank, i)}")
if __name__ == "__main__":
    # Run `main` in several worker processes at once, one per rank; each
    # worker receives its own RANK plus the shared WORLD_SIZE as strings.
    from multiprocessing import Process

    num_ranks = 4
    workers = [
        Process(
            target=main,
            args=({"WORLD_SIZE": f"{num_ranks}", "RANK": f"{rank_id}"},),
        )
        for rank_id in range(num_ranks)
    ]
    # Start every worker before joining any of them so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
version=12.1204
version=12.1204
version=12.1204
version=12.1204
rank=0 before test (0, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=0 after test (0, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=0 before test (0, 2): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=0 after test (0, 2): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=0 before test (0, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=0 after test (0, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=2 before test (2, 0): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=2 after test (2, 0): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=2 before test (2, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=2 after test (2, 1): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=2 before test (2, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
p2p access: 1
rank=2 after test (2, 3): Total memory: 31.7325439453125 GB; Used memory: 0.67938232421875 GB
rank=1 before test (1, 0): Total memory: 31.7325439453125 GB; Used memory: 0.95281982421875 GB
p2p access: 1
rank=1 after test (1, 0): Total memory: 31.7325439453125 GB; Used memory: 0.95477294921875 GB
rank=1 before test (1, 2): Total memory: 31.7325439453125 GB; Used memory: 0.95477294921875 GB
p2p access: 1
rank=1 after test (1, 2): Total memory: 31.7325439453125 GB; Used memory: 0.95477294921875 GB
rank=1 before test (1, 3): Total memory: 31.7325439453125 GB; Used memory: 0.95672607421875 GB
p2p access: 1
rank=1 after test (1, 3): Total memory: 31.7325439453125 GB; Used memory: 0.95672607421875 GB
rank=3 before test (3, 0): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
p2p access: 1
rank=3 after test (3, 0): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
rank=3 before test (3, 1): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
p2p access: 1
rank=3 after test (3, 1): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
rank=3 before test (3, 2): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
p2p access: 1
rank=3 after test (3, 2): Total memory: 31.7325439453125 GB; Used memory: 0.90203857421875 GB
This only happens when there are multiple processes. When I run a single process and test P2P access, memory usage does not increase.