Here is TF code that creates 2 logical GPUs from 1 physical GPU:
import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
if gpus:
    # Create 2 virtual GPUs with 1GB memory each
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=1024),
             tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
        logical_gpus = tf.config.list_logical_devices("GPU")
        print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)
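A quick way to check that both logical devices are usable is to place ops on them explicitly. A minimal sketch using the standard tf.device scope; the device names assume the configuration above succeeded:

with tf.device("GPU:0"):
    a = tf.constant([1.0, 2.0])
with tf.device("GPU:1"):
    b = tf.constant([3.0, 4.0])
# Both logical GPUs live on the same physical card, so this runs without any copy across machines
print(a + b)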
EDIT: However, TF itself warns that NCCL is not supported:
WARNING:tensorflow:NCCL is not supported when using virtual GPUs, fallingback to reduction to one device
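The fallback still works, it is just slower than NCCL. If you prefer to make that choice explicit instead of relying on the fallback, MirroredStrategy accepts a non-NCCL cross-device op. A minimal sketch, assuming the two logical GPUs created above; MirroredStrategy, ReductionToOneDevice, strategy.run and strategy.reduce are standard tf.distribute APIs:

strategy = tf.distribute.MirroredStrategy(
    devices=["GPU:0", "GPU:1"],                             # the two logical GPUs
    cross_device_ops=tf.distribute.ReductionToOneDevice())  # avoid NCCL explicitly

def replica_fn():
    # Each replica returns its replica id, so the reduced sum should be 0 + 1 = 1
    return tf.distribute.get_replica_context().replica_id_in_sync_group

per_replica = strategy.run(replica_fn)
print(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None))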
Using the “gloo” backend, we can create N processes that use the same GPU and perform collective ops:
# Run as
# torchrun --nproc_per_node=4 test_virtual_gpus.py
import os
import time

import torch
import torch.distributed as dist


def pprint(msg):
    rank = dist.get_rank()
    # We add sleep to avoid printing clutter
    time.sleep(0.5 * rank)
    print(rank, msg)


if __name__ == "__main__":
    # See https://pytorch.org/docs/stable/distributed.html#which-backend-to-use
    dist.init_process_group("gloo")
    rank = dist.get_rank()
    ws = dist.get_world_size()
    pprint(f"Hello from process {rank} / {os.environ['LOCAL_RANK']} among {ws} others")

    t = torch.tensor(rank, device="cuda")
    pprint(t)

    dist.all_reduce(t)
    pprint(f"All reduced output: {t}")

    dist.destroy_process_group()
Output (each rank contributes its own rank, so the all-reduce sum is 0 + 1 + 2 + 3 = 6):
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
0 Hello from process 0 / 0 among 4 others
1 Hello from process 1 / 1 among 4 others
2 Hello from process 2 / 2 among 4 others
0 tensor(0, device='cuda:0')
3 Hello from process 3 / 3 among 4 others
1 tensor(1, device='cuda:0')
2 tensor(2, device='cuda:0')
3 tensor(3, device='cuda:0')
0 All reduced output: 6
1 All reduced output: 6
2 All reduced output: 6
3 All reduced output: 6
PS: Keep in mind that all NCCL-related wrappers and ops won't work in such a configuration.
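For instance, DistributedDataParallel can in principle still be used here as long as the process group stays on "gloo", since it only needs the collectives demonstrated above. A rough, untested sketch; the toy Linear model, the batch, and the SGD optimizer are made up purely for illustration:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group("gloo")  # "nccl" would need one process per physical GPU

model = torch.nn.Linear(8, 1).to("cuda")  # every rank shares the same physical GPU
ddp_model = DDP(model)                    # gradient sync goes through gloo
optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.1)

x = torch.randn(4, 8, device="cuda")
loss = ddp_model(x).sum()
loss.backward()                           # gloo all-reduces the gradients
optimizer.step()

dist.destroy_process_group()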