Why does DDP reduce GPU memory usage when I use it on a single GPU?

I know this sounds a bit strange, so I will provide a reproducible script.
My package versions are as follows:

numpy == 1.26.4 
torch == 2.3.0+cu118
deepspeed == 0.14.2 
transformers == 4.35.2

First, we wrap the model initialized by deepspeed.init_inference with DDP, and then run generation on random input of shape (150, 35).

import numpy as np
import torch

import deepspeed
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoConfig, AutoModelForCausalLM


def setup(rank, world_size, port):
    dist.init_process_group("nccl", 
                            rank=rank, 
                            world_size=world_size,
                            init_method=f'tcp://127.0.0.1:{port}'
                            )
    torch.cuda.set_device(rank)

def inference(rank):
    setup(rank, world_size=1, port=12355)
    config = AutoConfig.from_pretrained("hugg_models/llama-7b")
    llm = AutoModelForCausalLM.from_pretrained("hugg_models/llama-7b", 
                                                 low_cpu_mem_usage=True,
                                                 config=config)

    ds_model = deepspeed.init_inference(
        llm,
        tensor_parallel={"tp_size": 1},
        dtype=torch.bfloat16,
        replace_method="auto",
        replace_with_kernel_inject=True
    )
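    # Wrap the DeepSpeed-injected module with DDP (world_size == 1, so there is no real data parallelism here)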
    llm = DDP(ds_model.module, device_ids=[rank], output_device=rank)


    input_ids = torch.from_numpy(np.random.randint(3, 10000, size=(150, 35))).to(rank)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        output_ids = llm.module.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.2,
            bos_token_id=1,
            eos_token_id=2,
            pad_token_id=0,
            max_new_tokens=37,
            use_cache=True,
            )
    
if __name__ == "__main__":
    inference(rank=0)

When running the above code, the GPU memory usage looks like this:

[0] NVIDIA RTX A6000 | 37°C,   0 % | 38260 / 49140 MB |
[1] NVIDIA RTX A6000 | 31°C,   0 % |     2 / 49140 MB |
[2] NVIDIA RTX A6000 | 28°C,   0 % |     2 / 49140 MB |
[3] NVIDIA RTX A6000 | 30°C,   0 % |     2 / 49140 MB |
[4] NVIDIA RTX A6000 | 30°C,   0 % |     2 / 49140 MB |
[5] NVIDIA RTX A6000 | 32°C,   0 % |     2 / 49140 MB |
[6] NVIDIA RTX A6000 | 30°C,   0 % |     2 / 49140 MB |
[7] NVIDIA RTX A6000 | 33°C,   0 % |     2 / 49140 MB |
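
For reference, gpustat shows the whole process footprint on the device (CUDA context, cached allocator blocks, anything allocated outside PyTorch), so I also print PyTorch's own counters right after generate. This is just a small helper of my own, not part of the script above:

def report_cuda_memory(tag, device=0):
    # PyTorch-side view of GPU memory: tensors currently allocated,
    # the peak since the last reset, and what the caching allocator has reserved.
    allocated = torch.cuda.memory_allocated(device) / 2**20
    peak = torch.cuda.max_memory_allocated(device) / 2**20
    reserved = torch.cuda.memory_reserved(device) / 2**20
    print(f"[{tag}] allocated={allocated:.0f} MiB | peak={peak:.0f} MiB | reserved={reserved:.0f} MiB")

# usage: call torch.cuda.reset_peak_memory_stats(0) before generate,
# then report_cuda_memory("after generate") right after it.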

=======================================
Next, I modified the code slightly to remove the use of DDP:

import numpy as np
import torch

import deepspeed
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoConfig, AutoModelForCausalLM


def setup(rank, world_size, port):
    dist.init_process_group("nccl", 
                            rank=rank, 
                            world_size=world_size,
                            init_method=f'tcp://127.0.0.1:{port}'
                            )
    torch.cuda.set_device(rank)

def inference(rank):
    setup(rank, world_size=1, port=12355)
    config = AutoConfig.from_pretrained("hugg_models/llama-7b")
    llm = AutoModelForCausalLM.from_pretrained("hugg_models/llama-7b", 
                                                 low_cpu_mem_usage=True,
                                                 config=config)

    ds_model = deepspeed.init_inference(
        llm,
        tensor_parallel={"tp_size": 1},
        dtype=torch.bfloat16,
        replace_method="auto",
        replace_with_kernel_inject=True
    )
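    # Use the DeepSpeed-injected module directly, without the DDP wrapper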
    llm = ds_model.module

    input_ids = torch.from_numpy(np.random.randint(3, 10000, size=(150, 35))).to(rank)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
        output_ids = llm.generate(
            input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            temperature=0.2,
            bos_token_id=1,
            eos_token_id=2,
            pad_token_id=0,
            max_new_tokens=37,
            use_cache=True,
            )
    
if __name__ == "__main__":
    inference(rank=0)

Running the code, we get the following error:

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/transformers/generation/utils.py", line 1719, in generate
    return self.sample(
  File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/transformers/generation/utils.py", line 2801, in sample
    outputs = self(
  File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "xxx/miniconda3/envs/xxx/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py", line 1053, in forward
    logits = logits.float()
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 642.00 MiB. GPU
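
As a side note, the 642 MiB in the error message seems to line up with the fp32 copy of the prefill logits created by the failing line logits = logits.float(), assuming the usual llama-7b vocab size of 32000 and, if I understand the caching allocator correctly, rounding of large requests up to 2 MiB blocks:

import math

batch, seq_len, vocab = 150, 35, 32000        # vocab size assumed for llama-7b
bytes_fp32 = batch * seq_len * vocab * 4      # size of the fp32 copy made by logits.float()
print(bytes_fp32 / 2**20)                     # ~640.9 MiB

# The caching allocator (as far as I know) rounds large requests up to 2 MiB multiples:
print(math.ceil(bytes_fp32 / 2**21) * 2**21 / 2**20)   # 642.0 MiB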

At this point, the GPU memory usage is:

[0] NVIDIA RTX A6000 | 35°C,   0 % | 48319 / 49140 MB |
[1] NVIDIA RTX A6000 | 31°C,   0 % |     2 / 49140 MB |
[2] NVIDIA RTX A6000 | 28°C,   0 % |     2 / 49140 MB |
[3] NVIDIA RTX A6000 | 30°C,   0 % |     2 / 49140 MB |
[4] NVIDIA RTX A6000 | 30°C,   0 % |     2 / 49140 MB |
[5] NVIDIA RTX A6000 | 32°C,   0 % |     2 / 49140 MB |
[6] NVIDIA RTX A6000 | 31°C,   0 % |     2 / 49140 MB |
[7] NVIDIA RTX A6000 | 33°C,   0 % |     2 / 49140 MB |

I am also able to confirm that, before running the new script, the memory usage on every GPU was 0.

I don’t understand why this happens. Logically, using DDP on a single GPU shouldn’t have any effect, should it?

What is DeepSpeed doing in your code? Is it reducing the memory usage, and if so, is it enabled only for multi-GPU runs?

Sorry, due to a problem with my email, I received this message very late.
I use deepspeed.init_inference to speed up LLM inference, and it does reduce memory usage. I always run this code on only one GPU.
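
In case it helps, this is roughly how I check which layers init_inference has replaced, reusing the ds_model from the script above (the class names it prints come from my environment and may differ across DeepSpeed versions):

from collections import Counter

# Count the module classes inside the DeepSpeed-wrapped model to see
# which blocks kernel injection swapped in.
class_counts = Counter(type(m).__name__ for m in ds_model.module.modules())
for cls_name, count in class_counts.most_common(10):
    print(cls_name, count)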

Could you update your post to avoid using an HF token so that we can reproduce it?