Hi,
I am running the Llama 2 example scripts on a Windows PC with a Ryzen 9 7950X, 96 GB of DDR5, and a single RTX 4090.
I can run the 7B model fine with MP set to 1.
To run the 13B model, however, I set MP to 2, and that results in an error from PyTorch. Note that I only have one GPU and am using CUDA 11.8 with PyTorch in a Conda environment. Does MP = 2 mean two GPUs are required, and is that something that simply cannot be done with a single consumer GPU, or on Windows?
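If I understand the traceback correctly, torchrun spawns one worker per --nproc_per_node and generation.py then calls torch.cuda.set_device(local_rank) in each worker. Here is a minimal sketch of my understanding (LOCAL_RANK is the environment variable torchrun sets; the rest is just for illustration):

import os
import torch

# torchrun --nproc_per_node 2 starts two workers and gives each a
# LOCAL_RANK of 0 or 1 via the environment.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))

print(f"LOCAL_RANK={local_rank}, CUDA devices={torch.cuda.device_count()}")

# On a machine with a single GPU only ordinal 0 exists, so the worker
# with LOCAL_RANK=1 would fail here with "invalid device ordinal",
# which matches the error in the output below.
torch.cuda.set_device(local_rank)

So my guess is that the second worker is asking for a second GPU that is not there, but I would like to confirm whether that is really what MP = 2 means.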
The command line is:
python -m torchrun-script --nproc_per_node 2 example_text_completion.py --ckpt_dir …\llama-2-13b --tokenizer_path …\llama-2-13b\tokenizer.model --max_seq_len 128 --max_batch_size 4
The error is:
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: invalid device ordinal
The full output is below:
Cuda support: True : 1 devices
Cuda support: True : 1 devices
> initializing model parallel with size 2
> initializing ddp with size 1
> initializing pipeline with size 1
Traceback (most recent call last):
File "H:\llama2\repo\llama\example_text_completion.py", line 62, in <module>
fire.Fire(main)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\fire\core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\fire\core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\fire\core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "H:\llama2\repo\llama\example_text_completion.py", line 24, in main
generator = Llama.build(
File "H:\llama2\repo\llama\llama\generation.py", line 68, in build
torch.cuda.set_device(local_rank)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\cuda\__init__.py", line 350, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24900 closing signal CTRL_C_EVENT
Traceback (most recent call last):
File "H:\llama2\repo\llama\example_text_completion.py", line 62, in <module>
fire.Fire(main)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\fire\core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\fire\core.py", line 475, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\fire\core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "H:\llama2\repo\llama\example_text_completion.py", line 24, in main
generator = Llama.build(
File "H:\llama2\repo\llama\llama\generation.py", line 83, in build
checkpoint = torch.load(ckpt_path, map_location="cpu")
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\serialization.py", line 809, in load
return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\serialization.py", line 1172, in _load
result = unpickler.load()
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\serialization.py", line 1142, in persistent_load
typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location))
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\serialization.py", line 1112, in load_tensor
storage = zip_file.get_storage_from_record(name, numel, torch.UntypedStorage)._typed_storage()._untyped_storage
KeyboardInterrupt
WARNING:torch.distributed.elastic.agent.server.api:Received 2 death signal, shutting down workers
Traceback (most recent call last):
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "U:\Miniconda3\envs\llama2env\Scripts\torchrun-script.py", line 33, in <module>
sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\multiprocessing\errors\__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\run.py", line 794, in main
run(args)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\run.py", line 785, in run
elastic_launch(
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\launcher\api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\launcher\api.py", line 241, in launch_agent
result = agent.run()
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\metrics\api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\agent\server\api.py", line 723, in run
result = self._invoke_run(role)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\agent\server\api.py", line 865, in _invoke_run
run_result = self._monitor_workers(self._worker_group)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\metrics\api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\agent\server\local_elastic_agent.py", line 306, in _monitor_workers
result = self._pcontext.wait(0)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\multiprocessing\api.py", line 288, in wait
return self._poll()
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\multiprocessing\api.py", line 664, in _poll
self.close() # terminate all running procs
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\multiprocessing\api.py", line 331, in close
self._close(death_sig=death_sig, timeout=timeout)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\multiprocessing\api.py", line 708, in _close
handler.proc.wait(time_to_wait)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\subprocess.py", line 1209, in wait
return self._wait(timeout=timeout)
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\subprocess.py", line 1506, in _wait
result = _winapi.WaitForSingleObject(self._handle,
File "U:\Miniconda3\envs\llama2CUDA11.8\lib\site-packages\torch\distributed\elastic\multiprocessing\api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 20724 got signal: 2
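For reference, this is the quick check I would run to compare the number of checkpoint shards against the number of visible GPUs (a rough sketch only; the path is a placeholder for the elided --ckpt_dir above, and I am assuming the 13B download ships one consolidated .pth shard per model-parallel rank):

from pathlib import Path
import torch

# Placeholder path; substitute the real --ckpt_dir used in the command above.
ckpt_dir = Path(r"H:\llama2\llama-2-13b")

shards = sorted(ckpt_dir.glob("*.pth"))
print(f"checkpoint shards: {len(shards)}")                # presumably 2 for 13B
print(f"visible CUDA GPUs: {torch.cuda.device_count()}")  # 1 on this machine

# If there are more shards than GPUs, each model-parallel rank cannot get
# its own device, which would line up with the invalid device ordinal error.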