PyTorch DDP running out of memory

Hi,

I have been trying to use PyTorch to train a model on a single GPU for the past few days. However, the data is huge, and I can only fit a small model with a batch size of 2 in GPU memory. The GPU has around 45 GB of memory (both the data and the model are 3D).

Recently I added a second 45 GB GPU, and since then I have been trying to use DDP to train my model on both GPUs. But it turns out that the same model, with the same parameters and hyperparameters and a batch size of 1, cannot even fit in the combined 90 GB of GPU memory. Whenever I launch it with python3 it gives an out-of-memory error, and when I tried launching it with torchrun it crashed my system.

After spending two days searching the internet I still could not find an answer, only other people's posts describing the same issue (e.g. Memory issue when training in DDP mode · Lightning-AI/pytorch-lightning · Discussion #18525 · GitHub). As far as I can tell there is no error in my code, as I have seen the same calls used in various articles and tutorials implementing DDP.

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import numpy as np
import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # Initialize the distributed process group
    dist.init_process_group(
        backend='nccl',  # nccl is preferred for GPUs
        # init_method='env://',  # Initialization method (can be modified for multi-node)
        world_size=world_size,
        rank=rank
    )
    torch.cuda.set_device(rank)  # Set the device for this process (GPU)

def cleanup():
    dist.destroy_process_group()


def train(rank, world_size):

    setup(rank, world_size)

    # binary_model, train_dataset, val_dataset, inp_shape, batch_size and
    # num_workers are created earlier (omitted here)
    binary_model = binary_model.to(rank)                   # Move the model to this rank's GPU
    dummy_input = torch.randn(inp_shape).to(rank)
    _ = binary_model(dummy_input)                          # Dummy forward pass before wrapping in DDP
    binary_model = DDP(binary_model, device_ids=[rank])    # Wrap the model; gradients are synced across ranks


    # Dataloader for our training and validation set
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank, shuffle=False, drop_last=False)
    train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=num_workers, drop_last=False, sampler=train_sampler)

    val_sampler = DistributedSampler(val_dataset, num_replicas=world_size, rank=rank, shuffle=False, drop_last=False)
    val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=num_workers, drop_last=False, sampler=val_sampler)


    # gc.collect()
    # torch.cuda.empty_cache()

    loss1 = nn.CrossEntropyLoss()

    optimizer = optim.Adam(binary_model.parameters(), lr=0.0002, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=5, verbose=True)


    # Training loop

    best_val_loss = np.inf    # Initializing the validation loss value with infinity
    best_model = None         # Variable for collecting the best model (with the lowest validation loss)

    num_epochs = 80     # Number of epochs

    save_epochs = 10    # Saving the model after every 'save_epochs' number of epochs

    for epoch in range(num_epochs):
        print("-" * 10)
        print("Epoch:", epoch+1)

        train_sampler.set_epoch(epoch)

        binary_model.train()          # Go to training mode

        # Training
        
        for img, lab in tqdm.tqdm(train_dl):
            
            img, lab = img.float().to(rank, non_blocking=True), lab.to(rank, non_blocking=True)

            optimizer.zero_grad()         # Make the gradients 0
            output = binary_model(img)    # Predict

            output_loss1 = loss1(output, lab)                           # Calculate crossentropy loss

            output_loss1.backward()                     # Calculate the derivative with respect to each parameter
            optimizer.step()                    # Calculate the updated weights


        binary_model.eval()     # Go to evaluation mode

        val_sampler.set_epoch(epoch)

        val_epoch_loss = 0.0    # Accumulate the validation loss over this epoch

        with torch.no_grad():   # Disable any gradient calculation, accumulation, etc
            for img, lab in tqdm.tqdm(val_dl):
                img, lab = img.float().to(rank, non_blocking=True), lab.to(rank, non_blocking=True)

                output = binary_model(img)      # Make prediction

                output_loss1 = loss1(output, lab)             # Calculate crossentropy loss
                val_epoch_loss += output_loss1.item()         # Calculate per epoch loss

                pred = torch.argmax(output, dim=1)          # Get the prediction result

        scheduler.step(val_epoch_loss)        # Update scheduler, used for Reduce LR on plateau

    cleanup()

if __name__ == "__main__":
    world_size = 2  # Number of GPUs
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)


Can someone please suggest what I should do?

If your model is already running OOM on a single GPU with a batch size of 1, DDP won't help, since it creates a clone of the model on each device and executes the training in parallel.
In that case you might want to look into model sharding approaches, such as pipeline parallelism, etc.
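
As one concrete example of model sharding, here is a minimal sketch using PyTorch's FullyShardedDataParallel (FSDP), reusing the setup()/cleanup() helpers and the binary_model from your post. It is only meant to illustrate the idea, not a tested drop-in replacement for your script:

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def train_sharded(rank, world_size):
    setup(rank, world_size)                     # same process-group init as in your code
    model = FSDP(binary_model, device_id=rank)  # parameters are sharded across ranks instead of replicated
    # In practice you usually also pass an auto_wrap_policy so submodules are
    # sharded individually; otherwise the full parameter set is all-gathered
    # at once during the forward pass.
    # ... same samplers, dataloaders and training loop as with DDP, using `model` ...
    cleanup()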

Hi. I think there is some misunderstanding. My model runs fine on a single GPU with a batch size of 2. My aim in adding another GPU is to increase the batch size. But with two GPUs and DDP, I now cannot train the model even with a batch size of 1.
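
(For reference, my understanding is that with DDP each rank processes its own batch_size samples, so a per-GPU batch size of 2 on two GPUs would give an effective batch of 4 per optimizer step, which is the increase I am after.)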