Profiler doesn't capture GPU

Greetings,

I am profiling a distributed model across 3 machines, with one GPU per machine, and I am seeing some odd behavior.

This command makes the “Distributed” view appear, but the GPU is not recognized:

srun nsys profile \
    --gpu-metrics-device=0 \
    --sample=none \
    --cpuctxsw=none \
    --force-overwrite true \
    --nic-metrics=true \
    --trace=cuda,nvtx,cudnn \
    --output=logs/${EXP_NAME}/nsys_logs_%h \
torchrun \
--nnodes $NUM_NODES \
--nproc_per_node 1 \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
cifar10_with_resnset.py --epochs=10
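
(A side note on --trace=nvtx: it only produces named ranges if the code emits them. A minimal sketch of what I mean, not part of my training script:)

import torch

x = torch.randn(1024, 1024, device="cuda")
torch.cuda.nvtx.range_push("matmul")  # shows up as a named span on the nsys timeline
y = x @ x
torch.cuda.nvtx.range_pop()
torch.cuda.synchronize()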

This command, on the other hand, makes the “Distributed” view disappear, but the GPU is recognized:

torchrun \
--nnodes $NUM_NODES \
--nproc_per_node 1 \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
cifar10_with_resnset.py --epochs=10
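
(For what it's worth, a quick way to check that each node can see its GPU at all, independent of either profiler, is something along these lines:)

$ srun python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.get_device_name(0))"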

My PyTorch Code

import argparse

import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.cuda.amp import GradScaler, autocast
from torch.nn.parallel import DistributedDataParallel
from torch.utils import data
from torchvision import models

precisions = {
    "fp16": torch.float16,
    "fp32": torch.float32,
}


def train(train_sampler, epoch, device, train_loader, model, loss_criterion, optimizer, profiler, scaler, precision):
    train_sampler.set_epoch(epoch)  # reshuffle the per-rank shards each epoch
    model.train()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        profiler.step()  # advance the torch.profiler schedule once per batch
        inputs, targets = inputs.cuda(device), targets.cuda(device)
        optimizer.zero_grad()

        with autocast(dtype=precision):
            outputs = model(inputs)
            loss = loss_criterion(outputs, targets)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        if batch_idx % 10 == 0:
            current_samples = batch_idx * len(inputs)
            total_samples = len(train_sampler)
            progress = 100.0 * current_samples / total_samples
            print(f'Epoch: {epoch} [{current_samples}/{total_samples} ({progress:.0f}%)]\tLoss: {loss.item():.6f}')


def validate(epoch, device, validation_loader, model, loss_criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in validation_loader:
            inputs, targets = inputs.cuda(device), targets.cuda(device)
            outputs = model(inputs)
            test_loss += loss_criterion(outputs, targets).item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()

    test_loss /= len(validation_loader.dataset)
    accuracy = 100. * correct / len(validation_loader.dataset)
    print(f'\nValidation. Epoch: {epoch}, Loss: {test_loss:.4f}, Accuracy: ({accuracy:.2f}%)\n')


def cleanup():
    dist.destroy_process_group()


def get_data(rank, world_size, args):
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    train_sampler = data.distributed.DistributedSampler(train_set, num_replicas=world_size, rank=rank)
    train_loader = data.DataLoader(train_set, sampler=train_sampler, batch_size=args.batch_size, num_workers=1,
                                   persistent_workers=True)

    validation_set = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
    validation_loader = data.DataLoader(validation_set, batch_size=args.batch_size, num_workers=1,
                                        persistent_workers=True)

    return train_sampler, train_loader, validation_loader


def main(args, rank, world_size):
    torch.manual_seed(0)
    train_sampler, train_loader, validation_loader = get_data(rank, world_size, args)

    device = torch.device("cuda:0")  # one process per node, so the local GPU is always cuda:0
    print(f"Rank {rank}: Using CUDA device {device}")

    model = models.resnet18(weights=None, num_classes=10)
    model.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    model = model.cuda(device)

    ddp_model = DistributedDataParallel(model)
    scaler = GradScaler()

    loss_criterion = nn.CrossEntropyLoss().cuda(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    print(torch.distributed.is_nccl_available())
    print(torch.distributed.get_backend_config())

    # Capture CPU ops and CUDA kernels; traces are written for the
    # TensorBoard profiler plugin by tensorboard_trace_handler.
    with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
            # skip 1 step, warm up for 1, then record 10 consecutive steps, once
            schedule=torch.profiler.schedule(wait=1, warmup=1, active=10, repeat=1),
            on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./logs/{args.exp_name}'),
            record_shapes=True,
            profile_memory=True,
            with_stack=True,
    ) as profiler:
        for epoch in range(args.epochs):
            train(train_sampler, epoch, device, train_loader, ddp_model, loss_criterion, optimizer, profiler, scaler,
                  precisions[args.precision])
            validate(epoch, device, validation_loader, ddp_model, loss_criterion)
            scheduler.step()
        cleanup()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch-size", type=int, default=512)
    parser.add_argument("--model", type=str, default="resnet18")
    parser.add_argument("--precision", type=str, default="fp32")
    parser.add_argument("--exp-name", type=str, default="test")
    args = parser.parse_args()

    # distributed
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()

    main(args, rank, world_size)
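
(The “Distributed” view I keep mentioning is the one in TensorBoard's PyTorch profiler plugin; I inspect the traces written by tensorboard_trace_handler roughly like this:)

$ pip install torch_tb_profiler
$ tensorboard --logdir ./logs/<exp_name>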

System Details:

>>> torch.__version__
'2.3.1+cu121'

>>> torch.cuda.is_available()
True

$ nsys -v
NVIDIA Nsight Systems version 2023.4.4.54-234433681190v0


$ nvidia-smi
Tue Jun 11 00:19:48 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla T4                       Off |   00000000:81:00.0 Off |                    0 |
| N/A   41C    P0             27W /   70W |       0MiB /  15360MiB |      8%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Update: this answer helped me get the GPU recognized by the profiler. I simply stopped loading the external CUDA toolkit and relied on the one that ships with PyTorch. The downside is that I was previously getting the “Distributed” view, and now it is not showing.
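
(By “external cuda” I mean the toolkit module provided by the cluster. On a system that uses environment modules, the change looks roughly like this; the module name is hypothetical:)

$ module unload cuda/12.1   # hypothetical module name; drop the cluster-provided toolkit
(PyTorch 2.3.1+cu121 then uses the CUDA runtime libraries bundled with the wheel.)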