DistributedDataParallel behaves weirdly

Hi,

To speed up my training I was looking into PyTorch's DistributedDataParallel, since the docs state that DataParallel has a lot of overhead that reduces speed. I tested a couple of hyperparameters and found weird behavior, which left me wondering whether I overlooked something.

I am running on a 64-bit Linux cluster node with 64 cores, 350+ GB of RAM and 4 Nvidia Tesla V100s (16 GB each). I tested the stable 1.5.0 version and the nightly 1.7.0.dev20200720, because I wanted to use automatic mixed precision as another speedup.

The model I was testing is a BERT model from the transformers library, with a single linear layer and a BCEWithLogitsLoss.

I tested three different training modes (all on a single machine): 1. a single GPU, 2. multi-GPU with DataParallel, 3. multi-GPU with DistributedDataParallel.
For each, I tested pin_memory and num_workers for the dataloader, and mixed precision where possible.

code for reference:

import os
from datetime import datetime
from argparse import ArgumentParser
import torch
import torch.multiprocessing as mp
import torch.distributed as dist
from transformers import AdamW, BertConfig

from prediction_module import path_vocab, path_raw_uniprot
from prediction_module.protein_datasets import ProteinBertLabeledDataset
from prediction_module.helpers import get_logger

logger = get_logger(__file__)


def train_model_dp(dataset, batch_size=4, n_steps=1000, num_workers=0, parallel=True, mixed_pres=False, pin_memory=False):
    from prediction_module.protein_models import ProteinBertForMultiLabel
    if mixed_pres:
        ProteinBertForMultiLabel.forward = torch.cuda.amp.autocast()(ProteinBertForMultiLabel.forward)
    torch.manual_seed(0)
    config = BertConfig(
        vocab_size=dataset.tokenizer.vocab_size,
        num_labels=dataset.num_labels,
        max_position_embeddings=dataset.input_size
    )
    model = ProteinBertForMultiLabel(config)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if torch.cuda.device_count() > 1 and parallel:
        batch_size = batch_size * torch.cuda.device_count()
        model = torch.nn.DataParallel(model)

    logger.debug(f"testing: {batch_size=} {num_workers=} {parallel=} {mixed_pres=} {pin_memory=}")
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             collate_fn=dataset.collate_fn,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=pin_memory)

    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=1e-5)  # create optimizer
    if mixed_pres:
        scaler = torch.cuda.amp.GradScaler()
    start = datetime.now()

    for epoch in range(1):  # a single pass is enough for this timing test
        for i, inputs in enumerate(dataloader):
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device, non_blocking=True)
            # zero the parameter gradients
            optimizer.zero_grad()
            if mixed_pres:
                with torch.cuda.amp.autocast():
                    outputs = model(**inputs)
                    loss = outputs[0]
                    loss = loss.mean()
                # Backward and optimize
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                # forward + backward + optimize
                outputs = model(**inputs)
                loss = outputs[0]
                loss = loss.mean()
                loss.backward()
                optimizer.step()
            if i >= n_steps:
                break
    logger.debug("Training complete in: %s. normalized by batch size: %s", str(datetime.now() - start), str((datetime.now() - start) / batch_size))

def train_start(rank, world_size, batch_size=4, mixed_pres=False, pin_memory=True, num_workers=0, n_steps=1000, epochs=1):
    from prediction_module.protein_models import ProteinBertForMultiLabel
    if mixed_pres:
        ProteinBertForMultiLabel.forward = torch.cuda.amp.autocast()(ProteinBertForMultiLabel.forward)
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.manual_seed(0)
    torch.cuda.set_device(rank)
    dataset = ProteinBertLabeledDataset(
        vocab=path_vocab,
        csv_path=os.path.join(path_raw_uniprot, "raw_data.csv"),
        h5_path=os.path.join(path_raw_uniprot, "metled_go_data.h5")
    )
    config = BertConfig(
        vocab_size=dataset.tokenizer.vocab_size,
        num_labels=dataset.num_labels,
        max_position_embeddings=dataset.input_size
    )
    model = ProteinBertForMultiLabel(config)
    model.cuda(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank,
                                                      find_unused_parameters=True)
    model.train()
    optimizer = AdamW(model.parameters(), lr=1e-5)  # create optimizer
    if mixed_pres:
        scaler = torch.cuda.amp.GradScaler()
    # Data loading code
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=num_workers,
                                               pin_memory=pin_memory,
                                               sampler=train_sampler,
                                               collate_fn=dataset.collate_fn)

    start = datetime.now()
    for epoch in range(epochs):
        for i, inputs in enumerate(train_loader):
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.cuda(rank, non_blocking=True)
            optimizer.zero_grad()
            if mixed_pres:
                with torch.cuda.amp.autocast():
                    outputs = model(**inputs)
                    loss = outputs[0]
                    loss = loss.mean()
                # Backward and optimize
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(**inputs)
                loss = outputs[0]
                loss = loss.mean()
                loss.backward()
                optimizer.step()
            if i >= n_steps:
                break
    if rank == 0:
        logger.debug("Training complete in: %s", str(datetime.now() - start))
    dist.destroy_process_group()


def train_model_ddp(world_size=4, mixed_pres=False, batch_size=4, pin_memory=False, num_workers=0, n_steps=1000):
    logger.debug(f"testing: {batch_size=} {num_workers=} {mixed_pres=} {pin_memory=}")
    mp.spawn(train_start,
             args=(world_size, batch_size, mixed_pres, pin_memory, num_workers, n_steps),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    try:
        from torch.cuda.amp import autocast
        mp_avail = True
    except ImportError:
        mp_avail = False
    parser = ArgumentParser()
    parser.add_argument("--test-dp", dest="test_dp", default=False, const=True, nargs="?")
    parser.add_argument("--test-ddp", dest="test_ddp", default=False, const=True, nargs="?")
    args = parser.parse_args()
    args_dict = vars(args)
    logger.debug("torch version: %s", torch.__version__)
    if args_dict["test_dp"]:
        dataset = ProteinBertLabeledDataset(
            vocab=path_vocab,
            csv_path=os.path.join(path_raw_uniprot, "raw_data.csv"),
            h5_path=os.path.join(path_raw_uniprot, "metled_go_data.h5")
        )
        logger.debug("testing single gpu")
        train_model_dp(dataset, parallel=False)
        train_model_dp(dataset, parallel=False)
        if mp_avail:
            train_model_dp(dataset, parallel=False, mixed_pres=True)
        train_model_dp(dataset, parallel=False, num_workers=8)
        train_model_dp(dataset, parallel=False, num_workers=16)
        train_model_dp(dataset, parallel=False, pin_memory=True)
        logger.debug("testing dp")
        train_model_dp(dataset)
        train_model_dp(dataset, num_workers=8)
        train_model_dp(dataset, num_workers=16)
        train_model_dp(dataset, pin_memory=True)
        if mp_avail:
            train_model_dp(dataset, mixed_pres=True)
    if args_dict["test_ddp"]:
        logger.debug("testing ddp")
        train_model_ddp()
        train_model_ddp(pin_memory=True)
        train_model_ddp(num_workers=8)
        train_model_ddp(num_workers=16)
        if mp_avail:
            train_model_ddp(mixed_pres=True)

The results:

testing single gpu
torch version: 1.5.0
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:48.407579. normalized by batch size: 0:00:42.101900
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:47.146963. normalized by batch size: 0:00:41.786745
testing: batch_size=4 num_workers=8 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:49.422436. normalized by batch size: 0:00:42.355613
testing: batch_size=4 num_workers=16 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:50.284026. normalized by batch size: 0:00:42.571010
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=False pin_memory=True
Training complete in: 0:02:47.878925. normalized by batch size: 0:00:41.969736
testing dp
testing: batch_size=16 num_workers=0 parallel=True mixed_pres=False pin_memory=False
Training complete in: 0:05:32.129513. normalized by batch size: 0:00:20.758095
testing: batch_size=16 num_workers=8 parallel=True mixed_pres=False pin_memory=False
Training complete in: 0:05:28.702392. normalized by batch size: 0:00:20.543900
testing: batch_size=16 num_workers=16 parallel=True mixed_pres=False pin_memory=False
Training complete in: 0:05:29.794879. normalized by batch size: 0:00:20.612181
testing: batch_size=16 num_workers=0 parallel=True mixed_pres=False pin_memory=True
Training complete in: 0:05:24.955569. normalized by batch size: 0:00:20.309724

torch version: 1.7.0.dev20200720
testing single gpu
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:50.061025. normalized by batch size: 0:00:42.515261
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:48.032688. normalized by batch size: 0:00:42.008176
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=True pin_memory=False
Training complete in: 0:01:54.984463. normalized by batch size: 0:00:28.746120
testing: batch_size=4 num_workers=8 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:50.344483. normalized by batch size: 0:00:42.586124
testing: batch_size=4 num_workers=16 parallel=False mixed_pres=False pin_memory=False
Training complete in: 0:02:51.148356. normalized by batch size: 0:00:42.787092
testing: batch_size=4 num_workers=0 parallel=False mixed_pres=False pin_memory=True
Training complete in: 0:02:48.677086. normalized by batch size: 0:00:42.169276
testing dp
testing: batch_size=16 num_workers=0 parallel=True mixed_pres=False pin_memory=False
Training complete in: 0:05:30.977989. normalized by batch size: 0:00:20.686125
testing: batch_size=16 num_workers=8 parallel=True mixed_pres=False pin_memory=False
Training complete in: 0:05:26.893676. normalized by batch size: 0:00:20.430856
testing: batch_size=16 num_workers=16 parallel=True mixed_pres=False pin_memory=False
Training complete in: 0:05:28.139827. normalized by batch size: 0:00:20.508740
testing: batch_size=16 num_workers=0 parallel=True mixed_pres=False pin_memory=True
Training complete in: 0:05:22.767213. normalized by batch size: 0:00:20.172952
testing: batch_size=16 num_workers=0 parallel=True mixed_pres=True pin_memory=False
Training complete in: 0:04:26.452442. normalized by batch size: 0:00:16.653278

torch version: 1.5.0
testing ddp
testing: batch_size=4 num_workers=0 mixed_pres=False pin_memory=False
Training complete in: 0:04:59.752312
testing: batch_size=4 num_workers=0 mixed_pres=False pin_memory=True
Training complete in: 0:04:59.236787
testing: batch_size=4 num_workers=8 mixed_pres=False pin_memory=False
Training complete in: 0:12:16.935697

torch version: 1.7.0.dev20200720
testing ddp
testing: batch_size=4 num_workers=0 mixed_pres=False pin_memory=False
Training complete in: 0:05:02.979028
testing: batch_size=4 num_workers=0 mixed_pres=False pin_memory=True
Training complete in: 0:05:03.088308
testing: batch_size=4 num_workers=8 mixed_pres=False pin_memory=False
Training complete in: 0:11:05.255453
testing: batch_size=4 num_workers=0 mixed_pres=True pin_memory=False
Training complete in: 0:05:10.881854

My interpretation
Training on a single GPU takes about 2:50 minutes for all parameter settings except mixed precision, which brings it down to around 2 minutes.
So perfect parallelization would mean that the same time is required with 4 GPUs if each GPU gets a mini-batch of size 4, correct?
DataParallel behaves very similarly across the hyperparameters: training takes around 5:25 minutes, except for mixed precision, which decreases it to about 4:25 minutes.

Now to DistributedDataParallel:
Increasing the number of workers seems to slow training down considerably.
Mixed precision has no effect on training speed (even though I observed on the GPUs that the required RAM decreased compared to not using it, and was similar to the RAM required for mixed precision with DataParallel).

This is my first time using PyTorch, so if I overlooked anything, please let me know. Otherwise I would be interested in what caused these effects.

Hello, let's go through your points one by one.

  1. DDP and DP are slow

My interpretation
Training on a single GPU takes about 2:50 minutes for all parameter settings except mixed precision, which brings it down to around 2 minutes.
So perfect parallelization would mean that the same time is required with 4 GPUs if each GPU gets a mini-batch of size 4, correct?
DataParallel behaves very similarly across the hyperparameters: training takes around 5:25 minutes, except for mixed precision, which decreases it to about 4:25 minutes.

Now to DistributedDataParallel:
Increasing the number of workers seems to slow training down considerably.
Mixed precision has no effect on training speed (even though I observed on the GPUs that the required RAM decreased compared to not using it, and was similar to the RAM required for mixed precision with DataParallel).

Your interpretation is correct. However, in Python, whether you use thread-based or process-based parallelism, you are faced with substantial overhead costs:

  1. For DP, it is mainly the GIL cost.
  2. For DDP, it is mainly inter-process communication cost.

The best way to increase efficiency is “batching”, because that part happens entirely in the C/C++/CUDA domain. So to really see the power of DDP, make sure your model is dealing with something on the order of 100*800*800*3 elements of data per step (about 100 frames of images); even a forward pass over that much data through a ResNet would probably take less than a second on powerful GPUs (e.g. your V100s). I am not sure how big your model in prediction_module.protein_models is, but if it is not large enough, then a batch of 4 per process (16/4 = 4) is probably too small, and the overhead of inter-process communication will make its annoying presence felt.
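For example, with the mp.spawn / train_start setup from the original post, raising the per-process batch size is just a matter of passing a bigger value (a rough sketch, not the author's exact call; the value that actually fits into the 16 GB cards has to be found empirically):

# Sketch only: each DDP process gets a larger per-process batch, so relatively
# more time is spent inside CUDA kernels and less in Python / IPC overhead.
# The effective global batch size is per_process_batch * world_size.
world_size = 4
per_process_batch = 16   # instead of 4
mp.spawn(train_start,
         args=(world_size, per_process_batch, False, False, 0, 1000),
         nprocs=world_size,
         join=True)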

More workers can indeed mean slower execution in Python, whether you are using threads or processes, unless the communication overhead is negligible (for threads it is the GIL, for processes it is repeated serialization and deserialization, inter-process synchronization, etc.).

The DistributedSampler setup in this case could also be a major overhead contributor: with multiple workers, the DataLoader internally uses _MultiProcessingDataLoaderIter, which fetches data from worker sub-processes through an inter-process queue. If your individual samples are small but very numerous, repeated serialization and deserialization is very likely to contribute a lot to the slowness, because CPU tensors first have to be moved into shared memory and then their handles serialized. There is no way to avoid this if you stick with the built-in multi-process data loading.
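A quick way to see how much the input pipeline alone costs (a sketch that reuses the train_loader and n_steps names from the training code above, so it is not standalone): iterate the DataLoader without touching the model at all and compare against the full step time.

# Time the DataLoader on its own: no model, no .cuda() copies, just fetching
# and collating batches.
start = datetime.now()
for i, batch in enumerate(train_loader):   # batch is fetched and then discarded
    if i >= n_steps:
        break
print("dataloader only:", datetime.now() - start)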

It is possible to customize your own data loading, for example by loading all data onto your GPUs at once (a V100 probably has enough memory to hold it, and you have several of them); however, it could take a considerable amount of time to debug such an implementation.
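The idea could look roughly like this (a sketch with synthetic fixed-length tensors as a stand-in for the real pre-tokenized data; none of the names below come from the author's code):

import torch

device = torch.device("cuda:0")
num_samples, seq_len, vocab_size, batch_size = 100_000, 512, 30, 4

# Pretend this is the whole pre-tokenized dataset, padded to a fixed length and
# resident on the GPU once and for all.
all_input_ids = torch.randint(0, vocab_size, (num_samples, seq_len),
                              dtype=torch.long, device=device)

for step in range(1000):
    # Build a mini-batch by indexing GPU tensors: no workers, no host-to-device
    # copies, no inter-process serialization.
    idx = torch.randint(0, num_samples, (batch_size,), device=device)
    batch = all_input_ids[idx]
    # ... forward / backward / optimizer.step() on batch as usual ...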

In summary:
  1. maybe increase your batch size,
  2. live with it, or
  3. make your own implementation.

Thanks for the answer!

I was just testing some bottlenecks in my code, and it indeed seems like the DistributedSampler is a major culprit when used with shuffle=True. This might be because the dataset I was using in the tests has a length of 320 million samples. It might also be the reason why I wasn't able to use bigger batch sizes, because about 10 GB of RAM seems to be occupied by the DistributedSampler.

So I decided to use the sampler on a single GPU and look at the effects.

code:

import torch
import os
from transformers import AdamW, BertConfig, TrainingArguments, Trainer
from datetime import datetime

from prediction_module import path_vocab, path_storage
from prediction_module.protein_datasets import ProteinBertMaskedLMDataset
from prediction_module.protein_models import ProteinBertForMaskedLM


def train_single(dist_sampler=False, n_steps=100, shuffle=False):
    rank = 0
    dataset = ProteinBertMaskedLMDataset(
        path_vocab, os.path.join(path_storage, "data", "uniparc", "uniparc_train_sorted.h5"),
    )
    config = BertConfig(
        vocab_size=dataset.tokenizer.vocab_size,
        max_position_embeddings=dataset.input_size
    )
    model = ProteinBertForMaskedLM(config)
    model.cuda(rank)
    model.train()

    optimizer = AdamW(model.parameters(), lr=1e-5)  # create optimizer
    # Data loading code

    sampler = torch.utils.data.DistributedSampler(dataset, 1, rank, shuffle=shuffle) if dist_sampler else None
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, pin_memory=False,
                                         sampler=sampler, collate_fn=dataset.collate_fn)
    print("start trainig")
    start = datetime.now()
    for epoch in range(1):
        for i, inputs in enumerate(loader):
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.cuda(rank, non_blocking=True)
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs[0]
            loss = loss.mean()
            loss.backward()
            optimizer.step()
            if i >= n_steps:
                break
    print("Training complete in:", str(datetime.now() - start))


if __name__ == "__main__":
    train_single()
    train_single(True)
    train_single(True, shuffle=True)

The results:

start training
Training complete in: 0:00:07.233262
start training
Training complete in: 0:00:19.662416
start training
Training complete in: 0:03:29.437496

Additionally, the RAM required on the GPU for the first two function calls is about 3.5 GB, and for the last one about 14 GB. I am not sure what is going on here, but that seems very weird.

I think the data loader might have cached something, resulting in those 10.5 GB of extra memory; I am not very familiar with its internal design, so this answer might be wrong.

Anyway, you have 350GB+ memory, why worry about that?

Because the memory is allocated on the GPU, and a random 10.5 GB allocation on the GPU is not that nice. (I edited the previous post to make that clearer.)

Weird, it seems that multiple processes have occupied your GPU. Could you please post the result of nvidia-smi?

Here is the output. During the first two runs it looks like this:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-PCIE...  Off  | 00000000:18:00.0 Off |                    0 |
| N/A   44C    P0    38W / 250W |   2857MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   43C    P0    27W / 250W |     12MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |     12MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-PCIE...  Off  | 00000000:AF:00.0 Off |                    0 |
| N/A   41C    P0    28W / 250W |     12MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0    284149      C   python                                      2845MiB |
+-----------------------------------------------------------------------------+

During the last run:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-PCIE...  Off  | 00000000:18:00.0 Off |                    0 |
| N/A   46C    P0    43W / 250W |  14527MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   42C    P0    27W / 250W |     12MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-PCIE...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |     12MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-PCIE...  Off  | 00000000:AF:00.0 Off |                    0 |
| N/A   41C    P0    28W / 250W |     12MiB / 16160MiB |      0%   E. Process |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0    284149      C   python                                     14515MiB |
+-----------------------------------------------------------------------------+

Hmmmmmmmmmmmmm, @mrshenli

Thanks @iffiX for covering distributed training questions!

@siheming I wonder if those are cached blocks. Can you print memory summary using torch.cuda.memory_summary?

When should I call it? As soon as the memory is filled, or after training? Or does it not matter?

Also, I wanted to come back to this question, because I have not seen a satisfying answer yet, nor understood why this might be the case:

It depends on when you want to inspect the memory usage. It prints the currently cached memory, allocated memory, etc. I would try printing it every few iterations.

Mixed precision has no effect on training speed (even though I observed on the GPUs that the required RAM decreased compared to not using it, and was similar to the RAM required for mixed precision with DataParallel).

Are you using PyTorch v1.6+? I see the DDP + AMP example is only available in the v1.6+ docs:

https://pytorch.org/docs/1.5.0/notes/amp_examples.html
https://pytorch.org/docs/master/notes/amp_examples.html

cc the author of torch.cuda.amp @mcarilli

I’ll get on that tomorrow and post the results.

Yes, I tested both 1.5 and 1.7, and saw a speedup when using 1.7 with AMP in both single-GPU mode and DataParallel mode, but not with DistributedDataParallel.

Should I @ him as well, or is yours sufficient?


I adjusted the code to print before, during, and after training:

    print(torch.cuda.memory_summary())
    sampler = torch.utils.data.DistributedSampler(dataset, 1, rank, shuffle=shuffle) if dist_sampler else None
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, pin_memory=False,
                                         sampler=sampler, collate_fn=dataset.collate_fn)
    print("start trainig")
    start = datetime.now()
    for epoch in range(1):
        for i, inputs in enumerate(loader):
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.cuda(rank, non_blocking=True)
            if i % 10 == 0:
                print(torch.cuda.memory_summary())
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = outputs[0]
            loss = loss.mean()
            loss.backward()
            optimizer.step()
            if i >= n_steps:
                break
    print("Training complete in:", str(datetime.now() - start))
    print(torch.cuda.memory_summary())

and then called the function three times again like before:


    train_single()
    train_single(dist_sampler=True)
    train_single(dist_sampler=True, shuffle=True)

This is what happens during the third function call (DistributedSampler with shuffle=True). The other outputs did not show any irregularities, so I'll leave them out (also due to the character limit…). The first output is before the dataloader is built, the second is after the first iteration, and the third after the tenth iteration.

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  340029 KB |    1692 MB |  350966 MB |  350634 MB |
|       from large pool |  339456 KB |    1677 MB |  321965 MB |  321633 MB |
|       from small pool |     573 KB |     133 MB |   29001 MB |   29001 MB |
|---------------------------------------------------------------------------|
| Active memory         |  340029 KB |    1692 MB |  350966 MB |  350634 MB |
|       from large pool |  339456 KB |    1677 MB |  321965 MB |  321633 MB |
|       from small pool |     573 KB |     133 MB |   29001 MB |   29001 MB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |    1842 MB |    1842 MB |    1842 MB |       0 B  |
|       from large pool |    1700 MB |    1700 MB |    1700 MB |       0 B  |
|       from small pool |     142 MB |     142 MB |     142 MB |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |   51138 KB |  215041 KB |  393608 MB |  393558 MB |
|       from large pool |   49664 KB |  186288 KB |  362929 MB |  362880 MB |
|       from small pool |    1474 KB |   36244 KB |   30679 MB |   30678 MB |
|---------------------------------------------------------------------------|
| Allocations           |     204    |    1077    |  251092    |  250888    |
|       from large pool |      75    |     459    |  126113    |  126038    |
|       from small pool |     129    |     756    |  124979    |  124850    |
|---------------------------------------------------------------------------|
| Active allocs         |     204    |    1077    |  251092    |  250888    |
|       from large pool |      75    |     459    |  126113    |  126038    |
|       from small pool |     129    |     756    |  124979    |  124850    |
|---------------------------------------------------------------------------|
| GPU reserved segments |     156    |     156    |     156    |       0    |
|       from large pool |      85    |      85    |      85    |       0    |
|       from small pool |      71    |      71    |      71    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      20    |     177    |  133360    |  133340    |
|       from large pool |      19    |      80    |   87375    |   87356    |
|       from small pool |       1    |      98    |   45985    |   45984    |
|===========================================================================|

start training
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  340079 KB |    1692 MB |  350966 MB |  350634 MB |
|       from large pool |  339456 KB |    1677 MB |  321965 MB |  321633 MB |
|       from small pool |     623 KB |     133 MB |   29001 MB |   29001 MB |
|---------------------------------------------------------------------------|
| Active memory         |  340079 KB |    1692 MB |  350966 MB |  350634 MB |
|       from large pool |  339456 KB |    1677 MB |  321965 MB |  321633 MB |
|       from small pool |     623 KB |     133 MB |   29001 MB |   29001 MB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |    1842 MB |    1842 MB |    1842 MB |       0 B  |
|       from large pool |    1700 MB |    1700 MB |    1700 MB |       0 B  |
|       from small pool |     142 MB |     142 MB |     142 MB |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |   51089 KB |  215041 KB |  393608 MB |  393558 MB |
|       from large pool |   49664 KB |  186288 KB |  362929 MB |  362880 MB |
|       from small pool |    1425 KB |   36244 KB |   30679 MB |   30678 MB |
|---------------------------------------------------------------------------|
| Allocations           |     207    |    1077    |  251095    |  250888    |
|       from large pool |      75    |     459    |  126113    |  126038    |
|       from small pool |     132    |     756    |  124982    |  124850    |
|---------------------------------------------------------------------------|
| Active allocs         |     207    |    1077    |  251095    |  250888    |
|       from large pool |      75    |     459    |  126113    |  126038    |
|       from small pool |     132    |     756    |  124982    |  124850    |
|---------------------------------------------------------------------------|
| GPU reserved segments |     156    |     156    |     156    |       0    |
|       from large pool |      85    |      85    |      85    |       0    |
|       from small pool |      71    |      71    |      71    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |      20    |     177    |  133360    |  133340    |
|       from large pool |      19    |      80    |   87375    |   87356    |
|       from small pool |       1    |      98    |   45985    |   45984    |
|===========================================================================|

|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    1328 MB |   11131 MB |  494742 MB |  493413 MB |
|       from large pool |    1326 MB |   11128 MB |  465663 MB |  464337 MB |
|       from small pool |       2 MB |     133 MB |   29078 MB |   29075 MB |
|---------------------------------------------------------------------------|
| Active memory         |    1328 MB |   11131 MB |  494742 MB |  493413 MB |
|       from large pool |    1326 MB |   11128 MB |  465663 MB |  464337 MB |
|       from small pool |       2 MB |     133 MB |   29078 MB |   29075 MB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   13646 MB |   13646 MB |   13646 MB |       0 B  |
|       from large pool |   13504 MB |   13504 MB |   13504 MB |       0 B  |
|       from small pool |     142 MB |     142 MB |     142 MB |       0 B  |
|---------------------------------------------------------------------------|
| Non-releasable memory |  158950 KB |    3717 MB |  474414 MB |  474259 MB |
|       from large pool |  157520 KB |    3717 MB |  443648 MB |  443494 MB |
|       from small pool |    1430 KB |      35 MB |   30766 MB |   30765 MB |
|---------------------------------------------------------------------------|
| Allocations           |     816    |    1077    |  265399    |  264583    |
|       from large pool |     297    |     496    |  134867    |  134570    |
|       from small pool |     519    |     756    |  130532    |  130013    |
|---------------------------------------------------------------------------|
| Active allocs         |     816    |    1077    |  265399    |  264583    |
|       from large pool |     297    |     496    |  134867    |  134570    |
|       from small pool |     519    |     756    |  130532    |  130013    |
|---------------------------------------------------------------------------|
| GPU reserved segments |     297    |     297    |     297    |       0    |
|       from large pool |     226    |     226    |     226    |       0    |
|       from small pool |      71    |      71    |      71    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |     109    |     191    |  140979    |  140870    |
|       from large pool |      74    |     154    |   92152    |   92078    |
|       from small pool |      35    |      98    |   48827    |   48792    |
|===========================================================================|

@mrshenli, I think I figured out what is going on. The DataLoader I am using has dynamic clipping, and the inputs are sorted by input length. So without shuffling, only small inputs are used (in the ballpark of shape (4, 100)), while with shuffling much larger inputs are used (around shape (4, 1000)). I would guess this could also explain some of the speed difference in training.
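A standalone way to sanity-check this effect (a sketch using a stock BertForMaskedLM from transformers rather than the ProteinBert model, so the absolute numbers will differ):

import torch
from transformers import BertConfig, BertForMaskedLM

def peak_mem_mib(seq_len, batch_size=4, vocab_size=30):
    # Peak GPU memory for one forward/backward pass at the given sequence length.
    torch.cuda.reset_peak_memory_stats()
    config = BertConfig(vocab_size=vocab_size, max_position_embeddings=seq_len)
    model = BertForMaskedLM(config).cuda().train()
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
    outputs = model(input_ids=input_ids, labels=input_ids)
    loss = outputs[0]
    loss.backward()
    return torch.cuda.max_memory_allocated() / 2 ** 20

for seq_len in (100, 1000):
    print(seq_len, f"{peak_mem_mib(seq_len):.0f} MiB")

Self-attention activations grow roughly quadratically with sequence length, so batches of shape (4, 1000) can easily need an order of magnitude more memory than (4, 100) ones.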

However, I am still slightly confused why a SequentialSampler is so much faster than a DistributedSampler with num_replicas=1 and shuffle=False.
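One way to narrow that down is to time just the samplers, with no model and no real data involved (a sketch; the dataset size is illustrative, and it relies on the observation that in the PyTorch versions used here DistributedSampler builds the whole epoch's index list before yielding anything, while SequentialSampler yields indices lazily):

from datetime import datetime
import torch
from torch.utils.data import Dataset, SequentialSampler, DistributedSampler

class IndexOnlyDataset(Dataset):
    # Dummy dataset: only its length matters to the samplers.
    def __init__(self, n):
        self.n = n
    def __len__(self):
        return self.n
    def __getitem__(self, idx):
        return idx

ds = IndexOnlyDataset(50_000_000)   # the real dataset has ~320 million samples
samplers = {
    "SequentialSampler": SequentialSampler(ds),
    "DistributedSampler": DistributedSampler(ds, num_replicas=1, rank=0, shuffle=False),
}
for name, sampler in samplers.items():
    start = datetime.now()
    for _ in zip(range(1000), iter(sampler)):   # only consume the first 1000 indices
        pass
    print(name, datetime.now() - start)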

So I guess only my question about amp is left.

If you’re strongly dataloader bound in the DDP case, any Amp speedup may be negligible/not observable.

One simple thing you can try is to not use the dataloader at all: in each DDP process, create a single dummy batch of data and use it through all the timing iterations.

First try it without amp, which gives an idea of how strongly dataloader-bound you are overall. For example, if switching to dummy data gives a 2X speedup right away, you know the dataloader is a big bottleneck.

Then try it with amp, and see if you observe a speedup relative to the dummy-data + no-amp case above, which gives an idea of whether Amp is working.
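A minimal sketch of such a dummy-batch timing run, written as a drop-in for the loop inside train_start above (it assumes the model, optimizer, scaler, rank, n_steps and mixed_pres variables already defined there; the input names and shapes are illustrative for a BERT-style model, and without labels outputs[0] is typically the logits, which is enough to drive a backward pass for timing):

# Build one synthetic batch directly on the GPU and reuse it every iteration,
# so the dataloader and host-to-device copies are completely out of the picture.
batch_size, seq_len, vocab_size = 4, 512, 30   # illustrative sizes
dummy_inputs = {
    "input_ids": torch.randint(0, vocab_size, (batch_size, seq_len),
                               device=f"cuda:{rank}"),
    "attention_mask": torch.ones(batch_size, seq_len, dtype=torch.long,
                                 device=f"cuda:{rank}"),
}
start = datetime.now()
for step in range(n_steps):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=mixed_pres):
        outputs = model(**dummy_inputs)
        loss = outputs[0].mean()
    if mixed_pres:
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
print("dummy-batch timing:", datetime.now() - start)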


Thanks for the suggestion, I will try that. Is there a rough number for how fast my dataloader should be, in seconds or relative to the model?

Also thanks to everyone for helping out and explaining!

Here are the results of the test. I tested the native speed of my model, then the speed with dummy data, and then with dummy data and Amp:

testing ddp
testing: model=mlm batch_size=4 num_workers=0 mixed_pres=False pin_memory=False use_dummy_data=False
Training complete in: 0:03:47.287354
model=mlm batch_size=4 num_workers=0 mixed_pres=False pin_memory=False use_dummy_data=True
Training complete in: 0:02:40.539612
testing: model=mlm batch_size=4 num_workers=0 mixed_pres=True pin_memory=False use_dummy_data=True
Training complete in: 0:02:41.635868

So while my dataloader seems to slow down training quite a bit, I still see no speedup using Amp.

I very lazily updated my training loop to:

    for epoch in range(epochs):
        if not use_dummy_data:
            for i, inputs in enumerate(train_loader):
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        inputs[k] = v.cuda(rank, non_blocking=True)
                optimizer.zero_grad()
                if mixed_pres:
                    with torch.cuda.amp.autocast():
                        outputs = model(**inputs)
                        loss = outputs[0]
                        loss = loss.mean()
                    # Backward and optimize
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    outputs = model(**inputs)
                    loss = outputs[0]
                    loss = loss.mean()
                    loss.backward()
                    optimizer.step()
                if i >= n_steps:
                    break
        else:
            # reuse a single (real) batch for all timing iterations
            inputs = next(iter(train_loader))
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.cuda(rank, non_blocking=True)
            for i in range(n_steps):
                optimizer.zero_grad()
                if mixed_pres:
                    with torch.cuda.amp.autocast():
                        outputs = model(**inputs)
                        loss = outputs[0]
                        loss = loss.mean()
                    # Backward and optimize
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    outputs = model(**inputs)
                    loss = outputs[0]
                    loss = loss.mean()
                    loss.backward()
                    optimizer.step()