How do you load a specific GPU from CUDA_AVAILABLE_DEVICES in PyTorch?

Brando_Miranda · March 4, 2023, 9:57pm

I came up with this code but it’s resulting in never ending bugs:

def get_device_via_env_variables(deterministic: bool = False, verbose: bool = True) -> torch.device:
    device: torch.device = torch.device("cpu")
    if torch.cuda.is_available():
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            device: torch.device = torch.device("cuda:0")
        else:
            gpu_idx: list[str] = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
            if len(gpu_idx) == 1:
                gpu_idx: str = gpu_idx[0]
            else:
                # generate random int from 0 to len(gpu_idx) with import statement
                import random
                idx: int = random.randint(0, len(gpu_idx) - 1) if not deterministic else -1
                gpu_idx: str = gpu_idx[idx]
            device: torch.device = torch.device(f"cuda:{gpu_idx}")
    if verbose:
        print(f'{device=}')
    return device

I have a suspicion that the gpu_idx and CUDA_VISIBLE_DEVICES don’t actually match…I just want to load the right GPU. How do I do that?

error:

Traceback (most recent call last):aded (0.000 MB deduped)
  File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in <module>
    main_data_analyis()
  File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1163, in main_data_analyis
    args: Namespace = load_args()
  File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1152, in load_args
    args.meta_learner = get_maml_meta_learner(args)
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 272, in get_maml_meta_learner
    base_model = load_model_ckpt(args, path_to_checkpoint=args.path_2_init_maml)
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 265, in load_model_ckpt
    base_model, _, _ = load_model_optimizer_scheduler_from_ckpt(args, path_to_checkpoint,
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 81, in load_model_optimizer_scheduler_from_ckpt
    ckpt: dict = torch.load(path_to_checkpoint, map_location=torch.device('cuda:3'))
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 607, in load
    return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 882, in _load
    result = unpickler.load()
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 857, in persistent_load
    load_tensor(data_type, size, key, _maybe_decode_ascii(location))
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 846, in load_tensor
    loaded_storages[key] = restore_location(storage, location)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 827, in restore_location
    return default_restore_location(storage, str(map_location))
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 175, in default_restore_location
    result = fn(storage, location)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 151, in _cuda_deserialize
    device = validate_cuda_device(location)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 142, in validate_cuda_device
    raise RuntimeError('Attempting to deserialize object on CUDA device '
RuntimeError: Attempting to deserialize object on CUDA device 3 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device.

motivated by the fact that I am trying to use the remainign 40GB from my 5CNN with 256 & 512 filters but results it memory issues

Traceback (most recent call last):
  File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in <module>
    main_data_analyis()
  File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1213, in main_data_analyis
    stats_analysis_with_emphasis_on_effect_size(args, hist=True)
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/stats_analysis_with_emphasis_on_effect_size.py", line 74, in stats_analysis_with_emphasis_on_effect_size
    results_usl: dict = get_episodic_accs_losses_all_splits_usl(args, args.mdl_sl, loaders)
  File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 616, in get_episodic_accs_losses_all_splits_usl
    losses, accs = agent.get_lists_accs_losses(data, training)
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 92, in get_lists_accs_losses
    spt_embeddings_t = self.get_embedding(spt_x_t, self.base_model).detach()
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 166, in get_embedding
    return get_embedding(x=x, base_model=base_model)
  File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 267, in get_embedding
    out = base_model.model.features(x)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: CUDA out of memory. Tried to allocate 174.00 MiB (GPU 0; 79.20 GiB total capacity; 54.31 GiB already allocated; 22.56 MiB free; 54.61 GiB reserved in total by PyTorch)

I want to use GPU 3 but the last error say GPU 0. What am I doing wrong?

cross: python - How do you load a specific GPU from CUDA_AVAILABLE_DEVICES in PyTorch? - Stack Overflow

ptrblck · March 4, 2023, 10:32pm

CUDA_VISIBLE_DEVICES is used to specify the desired available GPUs which will be mapped to cuda:0, cuda:1, etc. inside the application. Based on the error message you are seeing I guess you expect something like this to work:

CUDA_VISIBLE_DEVICES=3 python script.py

# inside script.py
x = torch.randn(1).to("cuda:3") # this will fail
x = torch.randn(1).to("cuda:0") # this will work and create the tensor on GPU3