DDP -- WARNING:urllib3.connectionpool:Retrying

Update: I am not sure if this is an error due to mlflow, the Azure cluster, or PyTorch. Please check my issue follow-up here: [BUG] extra ghost runs due to failed connection retries · Issue #8238 · mlflow/mlflow · GitHub. In summary, this problem doesn't happen if I use single-node multi-GPU for DDP.

Any thoughts on how to fix this warning? I am also not 100% sure about the correctness of this DDP code for training and evaluating CIFAR10 with a pre-trained ResNet50 network. For example, I am not 100% sure about the correctness of the all_reduce here:

# Average the test accuracy across all processes

correct = torch.tensor(correct, dtype=torch.int8)
correct = correct.to(device)
torch.distributed.all_reduce(correct, op=torch.distributed.ReduceOp.SUM)
total = torch.tensor(total, dtype=torch.int8)
total = total.to(device)
torch.distributed.all_reduce(total, op=torch.distributed.ReduceOp.SUM)
test_accuracy = 100 * correct / total
test_accuracy /= world_size

Also, what about the loss and running loss? Do we need to use all_reduce for them as well?

Here's the full script:

import time
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import os
import datetime

import configparser
import logging
import argparse

from PIL import Image

import ssl
ssl._create_default_https_context = ssl._create_unverified_context


start_time = time.time()


# Set the seed for reproducibility
torch.manual_seed(42)

# Set up the data loading parameters
batch_size = 128
num_epochs = 100
num_workers = 4
pin_memory = True

# Get the world size and rank to determine the process group
world_size = int(os.environ['WORLD_SIZE'])
world_rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])

is_distributed = world_size > 1

if is_distributed:
    batch_size = batch_size // world_size
    batch_size = max(batch_size, 1)

# Set the backend to NCCL for distributed training
dist.init_process_group(backend="nccl",
                        init_method="env://",
                        world_size=world_size,
                        rank=world_rank)

# Set the device to the current local rank
torch.cuda.set_device(local_rank)
device = torch.device('cuda', local_rank)

dist.barrier()

# Define the transforms for the dataset
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
])

# Load the CIFAR-10 dataset

data_root = './data_' + str(world_rank)
train_dataset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform_train)
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, num_replicas=world_size, rank=world_rank)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=(train_sampler is None), num_workers=num_workers, pin_memory=pin_memory, sampler=train_sampler)

test_dataset = torchvision.datasets.CIFAR10(root=data_root, train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

# Define the ResNet50 model
model = torchvision.models.resnet50(pretrained=True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)

# Move the model to the GPU
model = model.to(device)

# Wrap the model with DistributedDataParallel
if is_distributed:
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model for the specified number of epochs
for epoch in range(num_epochs):
    running_loss = 0.0
    train_sampler.set_epoch(epoch) ### why is this line necessary??
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()

    print('[Epoch %d] loss: %.3f' % (epoch + 1, running_loss))

    # Log the loss and running loss as MLFlow metrics
    mlflow.log_metric("loss", loss.item())
    mlflow.log_metric("running loss", running_loss)

dist.barrier()
# Save the trained model
if world_rank == 0:
    checkpoints_path = "train_checkpoints"
    os.makedirs(checkpoints_path, exist_ok=True)
    torch.save(model.state_dict(), '{}/{}-{}.pth'.format(checkpoints_path, 'resnet50_cifar10', world_rank))
    mlflow.pytorch.log_model(model, "resnet50_cifar10_{}.pth".format(world_rank))
    # mlflow.log_artifact('{}/{}-{}.pth'.format(checkpoints_path, 'resnet50_cifar10', world_rank), artifact_path="model_state_dict")

# Evaluate the model on the test set and save inference on 6 random images
correct = 0
total = 0
with torch.no_grad():
    fig, axs = plt.subplots(2, 3, figsize=(8, 6), dpi=100)
    axs = axs.flatten()
    count = 0
    for data in test_loader:
        if count == 6:
            break
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Save the inference on the 6 random images
        if count < 6:
            image = np.transpose(inputs[0].cpu().numpy(), (1, 2, 0))
            confidence = torch.softmax(outputs, dim=1)[0][predicted[0]].cpu().numpy()
            class_name = test_dataset.classes[predicted[0]]
            axs[count].imshow(image)
            axs[count].set_title(f'Class: {class_name}\nConfidence: {confidence:.2f}')
            axs[count].axis('off')
            count += 1

# Average the test accuracy across all processes

correct = torch.tensor(correct, dtype=torch.int8)
correct = correct.to(device)
torch.distributed.all_reduce(correct, op=torch.distributed.ReduceOp.SUM)
total = torch.tensor(total, dtype=torch.int8)
total = total.to(device)
torch.distributed.all_reduce(total, op=torch.distributed.ReduceOp.SUM)
test_accuracy = 100 * correct / total
test_accuracy /= world_size

print('Test accuracy: %.2f %%' % test_accuracy)

# Save the plot with the 6 random images and their predicted classes and prediction confidence
test_img_file_name = 'test_images_' + str(world_rank) + '.png'
plt.savefig(test_img_file_name)

# Log the test accuracy and elapsed time to MLflow
mlflow.log_metric("test accuracy", test_accuracy)

end_time = time.time()
elapsed_time = end_time - start_time
print('Elapsed time: ', elapsed_time)
mlflow.log_metric("elapsed time", elapsed_time)

# Save the plot with the 6 random images and their predicted classes and prediction confidence as an artifact in MLflow
image = Image.open(test_img_file_name)
image = image.convert('RGBA')
image_buffer = np.array(image)
image_buffer = image_buffer[:, :, [2, 1, 0, 3]]
image_buffer = np.ascontiguousarray(image_buffer)
artifact_file_name = "inference_on_test_images_" + str(world_rank) + ".png"
mlflow.log_image(image_buffer, artifact_file=artifact_file_name)

# End the MLflow run
if mlflow.active_run():
    mlflow.end_run()

dist.destroy_process_group()

Here's part of my pipeline.yaml:

    resources:
      # instance_count: 1 # number of nodes
      instance_count: 4
    distribution:
      type: pytorch
      # process_count_per_instance: 1 # number of gpus
      process_count_per_instance: 4

    # NOTE: set env var if needed
    environment_variables:
      NCCL_DEBUG: "INFO" # adjusts the level of info from NCCL tests

      # NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml" # Use specific topology file for A100

      # NCCL_IB_PCI_RELAXED_ORDERING: "1" # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
      NCCL_IB_DISABLE: "1" # force disable infiniband (if set to "1")
      # NCCL_NET_PLUGIN: "none" # to force NET/Plugin off (no rdma/sharp plugin at all)
      # NCCL_NET: "Socket" # to force node-to-node comm to use Socket (slow)
      NCCL_SOCKET_IFNAME: "eth0" # to force Socket comm to use eth0 (use NCCL_NET=Socket)
      # NCCL_SOCKET_IFNAME: "lo"

      # UCX_IB_PCI_RELAXED_ORDERING: "on"
      # UCX_TLS: "tcp"
      # UCX_NET_DEVICES: "eth0" # if you have Error: Failed to resolve UCX endpoint...

      CUDA_DEVICE_ORDER: "PCI_BUS_ID" # ordering of gpus  # do we need to uncomment this? why?

      TORCH_DISTRIBUTED_DEBUG: "DETAIL"

And here are the newly produced results:

b022059e50144c35858c014326950bf2000000:38:38 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0004-0000-3130-444531303244/pci0004:00/0004:00:00.0/../max_link_speed, ignoring
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0004-0000-3130-444531303244/pci0004:00/0004:00:00.0/../max_link_width, ignoring
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Topology detection: network path /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/000d3ae3-7594-000d-3ae3-7594000d3ae3 is not a PCI device (vmbus). Attaching to first CPU
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Attribute coll of node net not found
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO === System : maxWidth 5.0 totalWidth 12.0 ===
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO CPU/0 (1/1/1)
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO + PCI[5000.0] - NIC/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO                 + NET[5.0] - NET/0 (0/0/5.000000)
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO + PCI[12.0] - GPU/100000 (0)
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO + PCI[12.0] - GPU/200000 (1)
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO + PCI[12.0] - GPU/300000 (2)
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO + PCI[12.0] - GPU/400000 (3)
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO ==========================================
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO GPU/100000 :GPU/100000 (0/5000.000000/LOC) GPU/200000 (2/12.000000/PHB) GPU/300000 (2/12.000000/PHB) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO GPU/200000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (0/5000.000000/LOC) GPU/300000 (2/12.000000/PHB) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO GPU/300000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (2/12.000000/PHB) GPU/300000 (0/5000.000000/LOC) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO GPU/400000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (2/12.000000/PHB) GPU/300000 (2/12.000000/PHB) GPU/400000 (0/5000.000000/LOC) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO NET/0 :GPU/100000 (3/5.000000/PHB) GPU/200000 (3/5.000000/PHB) GPU/300000 (3/5.000000/PHB) GPU/400000 (3/5.000000/PHB) CPU/0 (2/5.000000/PHB) NET/0 (0/5000.000000/LOC) 
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Pattern 4, crossNic 0, nChannels 1, speed 5.000000/5.000000, type PHB/PHB, sameChannels 1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO  0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Pattern 1, crossNic 0, nChannels 1, speed 6.000000/5.000000, type PHB/PHB, sameChannels 1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO  0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Pattern 3, crossNic 0, nChannels 0, speed 0.000000/0.000000, type NVL/PIX, sameChannels 1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Tree 0 : -1 -> 0 -> 1/8/-1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Tree 1 : 4 -> 0 -> 1/-1/-1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 00/02 :    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 01/02 :    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Ring 00 : 15 -> 0 -> 1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Ring 01 : 15 -> 0 -> 1
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] 1/-1/-1->0->4
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Setting affinity for GPU 0 to 0fff
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 00 : 15[400000] -> 0[100000] [receive] via NET/Socket/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 01 : 15[400000] -> 0[100000] [receive] via NET/Socket/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 00 : 0[100000] -> 1[200000] via direct shared memory
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 01 : 0[100000] -> 1[200000] via direct shared memory
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Connected all rings
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 01 : 0[100000] -> 4[100000] [send] via NET/Socket/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 00 : 8[100000] -> 0[100000] [receive] via NET/Socket/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 00 : 0[100000] -> 8[100000] [send] via NET/Socket/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Channel 01 : 4[100000] -> 0[100000] [receive] via NET/Socket/0
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO Connected all trees
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 8/8/512
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer
b022059e50144c35858c014326950bf2000000:38:205 [0] NCCL INFO comm 0x14dab4001240 rank 0 nranks 16 cudaDev 0 busId 100000 - Init COMPLETE
b022059e50144c35858c014326950bf2000000:38:38 [0] NCCL INFO Launch mode Parallel
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data_0/cifar-10-python.tar.gz

  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 512000/170498071 [00:00<00:33, 5064354.10it/s]
  2%|▏         | 3909632/170498071 [00:00<00:07, 21994007.18it/s]
...
 97%|█████████▋| 165796864/170498071 [00:04<00:00, 58049136.87it/s]
170499072it [00:04, 38095858.41it/s]
Extracting ./data_0/cifar-10-python.tar.gz to ./data_0
Files already downloaded and verified

[Epoch 1] loss: 572.919
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 2] loss: 456.929
[Epoch 3] loss: 371.837
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 4] loss: 378.591
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 5] loss: 328.165
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 6] loss: 317.507
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 7] loss: 286.207
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 8] loss: 303.266
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 9] loss: 274.409
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 10] loss: 286.861
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 11] loss: 257.423
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 12] loss: 269.233
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 13] loss: 254.974
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 14] loss: 258.292
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 15] loss: 235.025
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 16] loss: 249.847
[Epoch 17] loss: 238.604
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 18] loss: 215.929
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 19] loss: 210.027

WARNING:urllib3.connectionpool:Retrying (Retry(total=3, connect=4, read=4, redirect=5, status=5)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x14db37800790>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 82] loss: 75.608
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
[Epoch 83] loss: 73.162

b022059e50144c35858c014326950bf2000000:38:209 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.0.0.4<52726>
b022059e50144c35858c014326950bf2000000:38:209 [0] NCCL INFO include/socket.h:445 -> 2
b022059e50144c35858c014326950bf2000000:38:209 [0] NCCL INFO include/socket.h:457 -> 2
b022059e50144c35858c014326950bf2000000:38:209 [0] NCCL INFO bootstrap.cc:229 -> 2

b022059e50144c35858c014326950bf2000000:38:209 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 126)

b022059e50144c35858c014326950bf2000000:38:209 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory'
b022059e50144c35858c014326950bf2000000:38:209 [0] NCCL INFO bootstrap.cc:231 -> 1

(the three lines above repeat another dozen times)

[Epoch 99] loss: 63.380
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
2023/05/17 09:13:35 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpc4iktykx/model/data, flavor: pytorch), fall back to return ['torch==1.11.0', 'cloudpickle==2.0.0']. Set logging level to DEBUG to see the full traceback.
[Epoch 100] loss: 55.532
Test accuracy: -inf %
Elapsed time:  49817.131615400314

Here’s the inference:
[Screenshot from 2023-05-17 08-09-20: inference on the test images]

As you can see, I have a warning that could have caused some problems:

[Epoch 99] loss: 63.380
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v1.0/subscriptions/number/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/log-metric
2023/05/17 09:13:35 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpc4iktykx/model/data, flavor: pytorch), fall back to return ['torch==1.11.0', 'cloudpickle==2.0.0']. Set logging level to DEBUG to see the full traceback.

Any thoughts on how to fix this warning?

As mentioned in the message, that particular warning is coming from mlflow. It seems like it's doing an API call to report some metrics, so it doesn't seem particularly relevant to the training itself though.
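One mitigation you could try (my suggestion, not something mlflow has confirmed as the fix) is to log metrics from rank 0 only, so a single process talks to the tracking server instead of all 16. A minimal sketch, reusing the world_rank, loss, and running_loss variables from your script:

# Sketch: only rank 0 reports to the MLflow tracking server, cutting the
# number of concurrent log-metric API calls from world_size down to 1.
if world_rank == 0:
    mlflow.log_metric("loss", loss.item())
    mlflow.log_metric("running loss", running_loss)

This won't fix a flaky network, but it exercises the retry path far less often.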

I am also not 100% sure about the correctness of this DDP code for training and evaluating CIFAR10 with a pre-trained ResNet50 network.

For test accuracy, all_reducing correct and all_reducing total sounds fine. You can also just calculate the accuracy locally, all_reduce it, and divide by the world size, which would reduce it to one all_reduce communication.
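A minimal sketch of that variant, assuming the correct, total, device, and world_size variables from your script (note the float dtype; int8 overflows for counts anywhere near 10000):

# Sketch: compute accuracy locally, then average it across ranks with a
# single all_reduce. A float tensor avoids integer overflow in the counts.
local_acc = torch.tensor(100.0 * correct / total, dtype=torch.float32, device=device)
dist.all_reduce(local_acc, op=dist.ReduceOp.SUM)
test_accuracy = local_acc.item() / world_size

Averaging per-rank accuracies matches the exact global accuracy only when every rank evaluates the same number of samples.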

Also, what about the loss and running loss? Do we need to use all_reduce for them as well?

Loss in DDP is calculated locally; gradients are aggregated in DDP via all_reduce (this is handled for you). You likely do not need to all_reduce the loss unless your algorithm does something special.
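If you do want a cluster-wide average loss purely for logging, a small sketch along the same lines (again assuming the running_loss, device, and world_size variables from the script):

# Optional sketch: average the epoch's running loss across ranks for
# reporting only. Gradient synchronization is already handled by DDP.
loss_t = torch.tensor(running_loss, dtype=torch.float32, device=device)
dist.all_reduce(loss_t, op=dist.ReduceOp.SUM)
avg_running_loss = loss_t.item() / world_size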


The mlflow GitHub has confirmed this issue happens on Azure. :frowning: Not sure what I should do now!

Do you know what the fix is for:

2023/05/23 15:58:00 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpkdzv7ohy/model/data, flavor: pytorch), fall back to return ['torch==1.13.0', 'cloudpickle==2.2.0']. Set logging level to DEBUG to see the full traceback.
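One thing I might try (my own guess, I haven't verified that it fixes this) is to pass the requirements explicitly so mlflow can skip the inference step entirely; mlflow.pytorch.log_model accepts a pip_requirements argument:

# Sketch: supply an explicit pip_requirements list so mlflow does not have
# to infer one (the versions here are just the ones from the warning).
mlflow.pytorch.log_model(
    model,
    "resnet50_cifar10_{}.pth".format(world_rank),
    pip_requirements=["torch==1.13.0", "cloudpickle==2.2.0"],
)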

I am getting this error again here:

NCCL version is:  (2, 14, 3)
System information: Linux #36~20.04.1-Ubuntu SMP Tue Dec 6 17:00:26 UTC 2022
Python version: 3.8.10
MLflow version: 2.3.2
MLflow module location: /usr/local/lib/python3.8/dist-packages/mlflow/__init__.py
Tracking URI: URI
Registry URI: URI
MLflow environment variables: 
  MLFLOW_DISABLE_ENV_MANAGER_CONDA_WARNING: True
  MLFLOW_EXPERIMENT_ID: 03bf0c01-34b3-4b8f-9713-b744f0350832
  MLFLOW_EXPERIMENT_NAME: dev_CIFAR10_DDP_train_test2
  MLFLOW_RUN_ID: 4e3ace3d-ffc1-4c59-be7f-593f9fe4fb42
  MLFLOW_TRACKING_TOKEN: token
  MLFLOW_TRACKING_URI: URI
MLflow dependencies: 
  Flask: 2.3.2
  Jinja2: 3.1.2
  alembic: 1.11.1
  click: 8.1.3
  cloudpickle: 2.2.0
  databricks-cli: 0.17.7
  docker: 6.1.2
  entrypoints: 0.4
  gitpython: 3.1.31
  gunicorn: 20.1.0
  importlib-metadata: 5.1.0
  markdown: 3.4.1
  matplotlib: 3.5.2
  numpy: 1.22.2
  packaging: 22.0
  pandas: 1.5.2
  protobuf: 3.20.1
  pyarrow: 9.0.0
  pytz: 2022.6
  pyyaml: 6.0
  querystring-parser: 1.2.4
  requests: 2.28.1
  scikit-learn: 0.24.2
  scipy: 1.6.3
  sqlalchemy: 2.0.15
  sqlparse: 0.4.4
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO Bootstrap : Using eth0:10.0.0.4<0>
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO NET/Plugin: Loaded net plugin NCCL RDMA Plugin (v5)
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO NET/Plugin: Loaded coll plugin SHARP (v5)
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO cudaDriverVersion 11040
NCCL version 2.14.3+cuda11.7
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO P2P plugin IBext
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NET/IB : No device found.
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NET/Socket : Using [0]eth0:10.0.0.4<0>
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Using network Socket
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0001-0000-3130-444531303244/pci0001:00/0001:00:00.0/../max_link_speed, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0001-0000-3130-444531303244/pci0001:00/0001:00:00.0/../max_link_width, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0002-0000-3130-444531303244/pci0002:00/0002:00:00.0/../max_link_speed, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0002-0000-3130-444531303244/pci0002:00/0002:00:00.0/../max_link_width, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0003-0000-3130-444531303244/pci0003:00/0003:00:00.0/../max_link_speed, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0003-0000-3130-444531303244/pci0003:00/0003:00:00.0/../max_link_width, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0004-0000-3130-444531303244/pci0004:00/0004:00:00.0/../max_link_speed, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0004-0000-3130-444531303244/pci0004:00/0004:00:00.0/../max_link_width, ignoring
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Topology detection: network path /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/000d3ada-c617-000d-3ada-c617000d3ada is not a PCI device (vmbus). Attaching to first CPU
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO === System : maxBw 5.0 totalBw 12.0 ===
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO CPU/0 (1/1/1)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO + PCI[5000.0] - NIC/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO                 + NET[5.0] - NET/0 (0/0/5.000000)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO + PCI[12.0] - GPU/100000 (0)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO + PCI[12.0] - GPU/200000 (1)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO + PCI[12.0] - GPU/300000 (2)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO + PCI[12.0] - GPU/400000 (3)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO ==========================================
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO GPU/100000 :GPU/100000 (0/5000.000000/LOC) GPU/200000 (2/12.000000/PHB) GPU/300000 (2/12.000000/PHB) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO GPU/200000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (0/5000.000000/LOC) GPU/300000 (2/12.000000/PHB) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO GPU/300000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (2/12.000000/PHB) GPU/300000 (0/5000.000000/LOC) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO GPU/400000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (2/12.000000/PHB) GPU/300000 (2/12.000000/PHB) GPU/400000 (0/5000.000000/LOC) CPU/0 (1/12.000000/PHB) NET/0 (3/5.000000/PHB) 
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NET/0 :GPU/100000 (3/5.000000/PHB) GPU/200000 (3/5.000000/PHB) GPU/300000 (3/5.000000/PHB) GPU/400000 (3/5.000000/PHB) CPU/0 (2/5.000000/PHB) NET/0 (0/5000.000000/LOC) 
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Setting affinity for GPU 0 to 0fff
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Pattern 4, crossNic 0, nChannels 1, bw 5.000000/5.000000, type PHB/PHB, sameChannels 1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO  0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Pattern 1, crossNic 0, nChannels 1, bw 6.000000/5.000000, type PHB/PHB, sameChannels 1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO  0 : NET/0 GPU/0 GPU/1 GPU/2 GPU/3 NET/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Pattern 3, crossNic 0, nChannels 0, bw 0.000000/0.000000, type NVL/PIX, sameChannels 1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Tree 0 : -1 -> 0 -> 1/8/-1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Tree 1 : 4 -> 0 -> 1/-1/-1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 00/02 :    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 01/02 :    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Ring 00 : 15 -> 0 -> 1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Ring 01 : 15 -> 0 -> 1
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] 1/-1/-1->0->4
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 00/0 : 15[400000] -> 0[100000] [receive] via NET/Socket/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 01/0 : 15[400000] -> 0[100000] [receive] via NET/Socket/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 00 : 0[100000] -> 1[200000] via SHM/direct/direct
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 01 : 0[100000] -> 1[200000] via SHM/direct/direct
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Connected all rings
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 01/0 : 0[100000] -> 4[100000] [send] via NET/Socket/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 00/0 : 8[100000] -> 0[100000] [receive] via NET/Socket/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 00/0 : 0[100000] -> 8[100000] [send] via NET/Socket/0
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Channel 01/0 : 4[100000] -> 0[100000] [receive] via NET/Socket/0
World size: 16
local rank is 0 and world rank is 0
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data_0/cifar-10-python.tar.gz

  0%|          | 0/170498071 [00:00<?, ?it/s]
(download progress output truncated)
100%|██████████| 170498071/170498071 [00:27<00:00, 6250071.81it/s]
Extracting ./data_0/cifar-10-python.tar.gz to ./data_0
Files already downloaded and verified

/usr/local/lib/python3.8/dist-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
  warnings.warn(
/usr/local/lib/python3.8/dist-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights.
  warnings.warn(msg)
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO Connected all trees
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO NCCL_P2P_PXN_LEVEL set by environment to 0.
c2e1a9b253bb44159e96b7b822016fac000000:38:214 [0] NCCL INFO comm 0x2aecc7e0 rank 0 nranks 16 cudaDev 0 busId 100000 - Init COMPLETE
[Epoch 1] loss: 552.524
[Epoch 2] loss: 432.076
[Epoch 3] loss: 368.724
[Epoch 4] loss: 382.397
[Epoch 5] loss: 334.612
[Epoch 6] loss: 328.634
[Epoch 7] loss: 284.240
[Epoch 8] loss: 333.176
[Epoch 9] loss: 318.735
2023/05/23 15:58:00 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpkdzv7ohy/model/data, flavor: pytorch), fall back to return ['torch==1.13.0', 'cloudpickle==2.2.0']. Set logging level to DEBUG to see the full traceback.
[Epoch 10] loss: 292.394
Traceback (most recent call last):
  File "train.py", line 162, in <module>
    outputs = model(inputs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 1034, in forward
    self._sync_buffers()
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 1621, in _sync_buffers
    self._sync_module_buffers(authoritative_rank)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 1625, in _sync_module_buffers
    self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 1646, in _default_broadcast_coalesced
    self._distributed_broadcast_coalesced(
  File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 1562, in _distributed_broadcast_coalesced
    dist._broadcast_coalesced(
RuntimeError: ProcessGroupWrapper: Monitored Barrier encountered error running collective: CollectiveFingerPrint(OpType=BROADCAST, TensorShape=[53120], TensorDtypes=Float, TensorDeviceTypes=TensorOptions(dtype=float (default), device=cuda, layout=Strided (default), requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))). Error: 
[../third_party/gloo/gloo/transport/tcp/pair.cc:598] Connection closed by peer [10.0.0.4]:48299
c2e1a9b253bb44159e96b7b822016fac000000:38:222 [0] NCCL INFO [Service thread] Connection closed by localRank 0
c2e1a9b253bb44159e96b7b822016fac000000:38:38 [0] NCCL INFO comm 0x2aecc7e0 rank 0 nranks 16 cudaDev 0 busId 100000 - Abort COMPLETE