I know this sounds a bit strange, so I will provide a reproducible script.
My package versions are as follows:
numpy == 1.26.4
torch == 2.3.0+cu118
deepspeed == 0.14.2
transformers == 4.35.2
First, we wrap the model initialized by DeepSpeed with DDP, and then use it to run inference on inputs of shape (150, 35):
import numpy as np
import torch
import deepspeed
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoConfig, AutoModelForCausalLM


def setup(rank, world_size, port):
    dist.init_process_group("nccl",
                            rank=rank,
                            world_size=world_size,
                            init_method=f'tcp://127.0.0.1:{port}')
    torch.cuda.set_device(rank)


def inference(rank):
    setup(rank, world_size=1, port=12355)
    config = AutoConfig.from_pretrained("hugg_models/llama-7b")
    llm = AutoModelForCausalLM.from_pretrained("hugg_models/llama-7b",
                                               low_cpu_mem_usage=True,
                                               config=config)
    ds_model = deepspeed.init_inference(
        llm,
        tensor_parallel={"tp_size": 1},
        dtype=torch.bfloat16,
        replace_method="auto",
        replace_with_kernel_inject=True
    )
    # Wrap the DeepSpeed-injected module with DDP (single process, single GPU).
    llm = DDP(ds_model.module, device_ids=[rank], output_device=rank)
    input_ids = torch.from_numpy(np.random.randint(3, 10000, size=(150, 35))).to(rank)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        output_ids = llm.module.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.2,
            bos_token_id=1,
            eos_token_id=2,
            pad_token_id=0,
            max_new_tokens=37,
            use_cache=True,
        )


if __name__ == "__main__":
    world_size = 1
    port = 12355
    inference(rank=0)
When running the above code, the GPU memory usage looks like this:
[0] NVIDIA RTX A6000 | 37°C, 0 % | 38260 / 49140 MB |
[1] NVIDIA RTX A6000 | 31°C, 0 % | 2 / 49140 MB |
[2] NVIDIA RTX A6000 | 28°C, 0 % | 2 / 49140 MB |
[3] NVIDIA RTX A6000 | 30°C, 0 % | 2 / 49140 MB |
[4] NVIDIA RTX A6000 | 30°C, 0 % | 2 / 49140 MB |
[5] NVIDIA RTX A6000 | 32°C, 0 % | 2 / 49140 MB |
[6] NVIDIA RTX A6000 | 30°C, 0 % | 2 / 49140 MB |
[7] NVIDIA RTX A6000 | 33°C, 0 % | 2 / 49140 MB |
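As a side note, nvidia-smi only shows the total footprint per device. To cross-check these numbers from inside the script, I would use a small helper like the one below; it is a minimal sketch of my own (the function name and its placement right after generate() are not part of the original repro) that prints PyTorch's own allocator statistics.

import torch

def report_gpu_memory(device: int) -> None:
    """Print PyTorch's allocator statistics for one GPU, in MiB."""
    mib = 1024 ** 2
    print(f"[cuda:{device}] "
          f"allocated={torch.cuda.memory_allocated(device) / mib:.0f} MiB, "
          f"reserved={torch.cuda.memory_reserved(device) / mib:.0f} MiB, "
          f"peak={torch.cuda.max_memory_allocated(device) / mib:.0f} MiB")

Calling report_gpu_memory(rank) right after generate() returns makes the two variants below easier to compare than the nvidia-smi snapshots alone.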
=======================================
Next, I modified the code slightly to remove the use of DDP:
import numpy as np
import torch
import deepspeed
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoConfig, AutoModelForCausalLM


def setup(rank, world_size, port):
    dist.init_process_group("nccl",
                            rank=rank,
                            world_size=world_size,
                            init_method=f'tcp://127.0.0.1:{port}')
    torch.cuda.set_device(rank)


def inference(rank):
    setup(rank, world_size=1, port=12355)
    config = AutoConfig.from_pretrained("hugg_models/llama-7b")
    llm = AutoModelForCausalLM.from_pretrained("hugg_models/llama-7b",
                                               low_cpu_mem_usage=True,
                                               config=config)
    ds_model = deepspeed.init_inference(
        llm,
        tensor_parallel={"tp_size": 1},
        dtype=torch.bfloat16,
        replace_method="auto",
        replace_with_kernel_inject=True
    )
    # Use the DeepSpeed-injected module directly; this is the only change from the first script.
    llm = ds_model.module
    input_ids = torch.from_numpy(np.random.randint(3, 10000, size=(150, 35))).to(rank)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        output_ids = llm.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.2,
            bos_token_id=1,
            eos_token_id=2,
            pad_token_id=0,
            max_new_tokens=37,
            use_cache=True,
        )


if __name__ == "__main__":
    world_size = 1
    port = 12355
    inference(rank=0)
Running the code, we get the following error:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/transformers/generation/utils.py", line 1719, in generate
return self.sample(
File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/transformers/generation/utils.py", line 2801, in sample
outputs = self(
File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 1053, in forward
logits = logits.float()
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 642.00 MiB. GPU
At this time, the GPU memory usage is:
[0] NVIDIA RTX A6000 | 35°C, 0 % | 48319 / 49140 MB |
[1] NVIDIA RTX A6000 | 31°C, 0 % | 2 / 49140 MB |
[2] NVIDIA RTX A6000 | 28°C, 0 % | 2 / 49140 MB |
[3] NVIDIA RTX A6000 | 30°C, 0 % | 2 / 49140 MB |
[4] NVIDIA RTX A6000 | 30°C, 0 % | 2 / 49140 MB |
[5] NVIDIA RTX A6000 | 32°C, 0 % | 2 / 49140 MB |
[6] NVIDIA RTX A6000 | 31°C, 0 % | 2 / 49140 MB |
[7] NVIDIA RTX A6000 | 33°C, 0 % | 2 / 49140 MB |
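For what it's worth, the size of the failing allocation is roughly what the fp32 upcast at logits = logits.float() would need for the first forward pass, assuming the standard LLaMA vocabulary size of 32,000 (an assumption on my part, not taken from the traceback):

# Rough size of the float32 logits tensor for the first forward pass.
# vocab_size = 32000 is an assumption for llama-7b.
batch, seq_len, vocab_size = 150, 35, 32000
print(batch * seq_len * vocab_size * 4 / 1024 ** 2)  # ~640.9 MiB, close to the 642 MiB in the OOM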
And I am able to confirm that before running the new script, the memory usage on each GPU was 0.
I don’t understand why this happens. Logically, wrapping the model with DDP on a single GPU shouldn’t have any effect, should it?
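In case it is useful, here is a minimal, self-contained sketch (my own, with a small stand-in module instead of llama-7b and a hypothetical port) of how I would measure how much GPU memory the DDP wrapper by itself allocates in a single-process, single-GPU setup:

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def ddp_memory_delta() -> None:
    """Measure the extra GPU memory allocated just by wrapping a module in DDP."""
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "12356")  # hypothetical port, distinct from the repro
    dist.init_process_group("nccl", rank=0, world_size=1)
    torch.cuda.set_device(0)

    # Small stand-in module; the real repro uses llama-7b.
    model = torch.nn.Linear(4096, 4096).cuda()

    before = torch.cuda.memory_allocated(0)
    ddp_model = DDP(model, device_ids=[0], output_device=0)
    after = torch.cuda.memory_allocated(0)
    print(f"DDP wrap allocated {(after - before) / 1024 ** 2:.1f} MiB extra")

    dist.destroy_process_group()

if __name__ == "__main__":
    ddp_memory_delta()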