Incorrect results for loss and accuracy in DDP

I am fine-tuning a pretrained ResNet50 on CIFAR-10 with DistributedDataParallel (DDP), using the script below. The loss, running_loss, and test accuracy it reports come out wrong. How do I compute and log these values correctly under DDP? (My current guess at the missing cross-rank aggregation is sketched at the end of the post.)

import time
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import os
import datetime

import configparser
import logging
import argparse

from PIL import Image

import ssl
ssl._create_default_https_context = ssl._create_unverified_context


start_time = time.time()

torch.backends.cudnn.benchmark=False
torch.backends.cudnn.deterministic=True


print("NCCL version is: ", torch.cuda.nccl.version())
print("MLflow version:", mlflow.__version__)
print("Tracking URI:", mlflow.get_tracking_uri())
print("Artifact URI:", mlflow.get_artifact_uri())

# Set the seed for reproducibility
torch.manual_seed(42)

# Set up the data loading parameters
batch_size = 128
num_epochs = 10
num_workers = 4
pin_memory = True

# Get the world size and rank to determine the process group
world_size = int(os.environ['WORLD_SIZE'])
world_rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])

print("World size:", world_size)
print("local rank is {} and world rank is {}".format(local_rank, world_rank))

is_distributed = world_size > 1

if is_distributed:
    batch_size = batch_size // world_size
    batch_size = max(batch_size, 1)

# Set the backend to NCCL for distributed training
dist.init_process_group(backend="nccl",
                        init_method="env://",
                        world_size=world_size,
                        rank=world_rank)

# Set the device to the current local rank
torch.cuda.set_device(local_rank)
device = torch.device('cuda', local_rank)

dist.barrier()

# Define the transforms for the dataset
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
])

# Load the CIFAR-10 dataset

data_root = './data_' + str(world_rank)
train_dataset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform_train)
train_sampler = torch.utils.data.distributed.DistributedSampler(dataset=train_dataset, num_replicas=world_size, rank=world_rank, shuffle=True) if is_distributed else None
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=(train_sampler is None), num_workers=num_workers, pin_memory=pin_memory, sampler=train_sampler)

test_dataset = torchvision.datasets.CIFAR10(root=data_root, train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

# Define the ResNet50 model
model = torchvision.models.resnet50(pretrained=True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)

# Move the model to the GPU
model = model.to(device)

# Wrap the model with DistributedDataParallel
if is_distributed:
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model for the specified number of epochs
for epoch in range(num_epochs):
    running_loss = 0.0
    if train_sampler is not None:
        train_sampler.set_epoch(epoch)  ### why is this line necessary??
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()

    print('[Epoch %d] loss: %.3f' % (epoch + 1, running_loss))
    if world_rank == 0:
        # Log the loss and running loss as MLFlow metrics
        mlflow.log_metric("loss", loss.item())
        mlflow.log_metric("running loss", running_loss)

dist.barrier()
# Save the trained model
if world_rank == 0:
    checkpoints_path = "train_checkpoints"
    os.makedirs(checkpoints_path, exist_ok=True)
    torch.save(model.state_dict(), '{}/{}-{}.pth'.format(checkpoints_path, 'resnet50_cifar10', world_rank))
    mlflow.pytorch.log_model(model, "resnet50_cifar10_{}.pth".format(world_rank))
    # mlflow.log_artifact('{}/{}-{}.pth'.format(checkpoints_path, 'resnet50_cifar10', world_rank), artifact_path="model_state_dict")

# Evaluate the model on the test set and save inference on 6 random images
correct = 0
total = 0
with torch.no_grad():
    fig, axs = plt.subplots(2, 3, figsize=(8, 6), dpi=100)
    axs = axs.flatten()
    count = 0
    for data in test_loader:
        if count == 6:
            break
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Save the inference on the 6 random images
        if count < 6:
            image = np.transpose(inputs[0].cpu().numpy(), (1, 2, 0))
            confidence = torch.softmax(outputs, dim=1)[0][predicted[0]].cpu().numpy()
            class_name = test_dataset.classes[predicted[0]]
            axs[count].imshow(image)
            axs[count].set_title(f'Class: {class_name}\nConfidence: {confidence:.2f}')
            axs[count].axis('off')
            count += 1

# Average the test accuracy across all processes

correct = torch.tensor(correct, dtype=torch.int8)
correct = correct.to(device)
torch.distributed.all_reduce(correct, op=torch.distributed.ReduceOp.SUM)
total = torch.tensor(total, dtype=torch.int8)
total = total.to(device)
torch.distributed.all_reduce(total, op=torch.distributed.ReduceOp.SUM)
test_accuracy = 100 * correct / total
test_accuracy /= world_size

print('Test accuracy: %.2f %%' % test_accuracy)

# Save the plot with the 6 random images and their predicted classes and prediction confidence
test_img_file_name = 'test_images_' + str(world_rank) + '.png'
plt.savefig(test_img_file_name)

# Log the test accuracy and elapsed time to MLflow
if world_rank == 0:
    mlflow.log_metric("test accuracy", test_accuracy)

end_time = time.time()
elapsed_time = end_time - start_time
print('Elapsed time: ', elapsed_time)
if world_rank == 0:
    mlflow.log_metric("elapsed time", elapsed_time)

# Save the plot with the 6 random images and their predicted classes and prediction confidence as an artifact in MLflow
image = Image.open(test_img_file_name)
image = image.convert('RGBA')
image_buffer = np.array(image)
image_buffer = image_buffer[:, :, [2, 1, 0, 3]]
image_buffer = np.ascontiguousarray(image_buffer)
artifact_file_name = "inference_on_test_images_" + str(world_rank) + ".png"
mlflow.log_image(image_buffer, artifact_file=artifact_file_name)

# End the MLflow run
if mlflow.active_run():
    mlflow.end_run()

dist.destroy_process_group()

I get the output below, and the reported values are wrong:

Values reported by world_rank 2:

c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO Bootstrap : Using eth0:10.0.0.4<0>
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO Plugin Path : /opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO P2P plugin IBext
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NCCL_IB_PCI_RELAXED_ORDERING set by environment to 1.
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NET/IB : No device found.
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO NET/Socket : Using [0]eth0:10.0.0.4<0>
c2b637b5f8b249148c78a16c718b30fb000000:43:43 [2] NCCL INFO Using network Socket
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0001-0000-3130-444531303244/pci0001:00/0001:00:00.0/../max_link_speed, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0001-0000-3130-444531303244/pci0001:00/0001:00:00.0/../max_link_width, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0002-0000-3130-444531303244/pci0002:00/0002:00:00.0/../max_link_speed, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0002-0000-3130-444531303244/pci0002:00/0002:00:00.0/../max_link_width, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0003-0000-3130-444531303244/pci0003:00/0003:00:00.0/../max_link_speed, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0003-0000-3130-444531303244/pci0003:00/0003:00:00.0/../max_link_width, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0004-0000-3130-444531303244/pci0004:00/0004:00:00.0/../max_link_speed, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection : could not read /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/47505500-0004-0000-3130-444531303244/pci0004:00/0004:00:00.0/../max_link_width, ignoring
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Topology detection: network path /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A03:00/device:07/VMBUS:01/6045bd78-a988-6045-bd78-a9886045bd78 is not a PCI device (vmbus). Attaching to first CPU
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO KV Convert to int : could not find value of '' in dictionary, falling back to 60
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Attribute coll of node net not found
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO === System : maxWidth 12.0 totalWidth 12.0 ===
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO CPU/0 (1/1/1)
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO + PCI[5000.0] - NIC/0
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO + PCI[12.0] - GPU/100000 (0)
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO + PCI[12.0] - GPU/200000 (1)
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO + PCI[12.0] - GPU/300000 (2)
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO + PCI[12.0] - GPU/400000 (3)
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO ==========================================
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO GPU/100000 :GPU/100000 (0/5000.000000/LOC) GPU/200000 (2/12.000000/PHB) GPU/300000 (2/12.000000/PHB) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) 
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO GPU/200000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (0/5000.000000/LOC) GPU/300000 (2/12.000000/PHB) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) 
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO GPU/300000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (2/12.000000/PHB) GPU/300000 (0/5000.000000/LOC) GPU/400000 (2/12.000000/PHB) CPU/0 (1/12.000000/PHB) 
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO GPU/400000 :GPU/100000 (2/12.000000/PHB) GPU/200000 (2/12.000000/PHB) GPU/300000 (2/12.000000/PHB) GPU/400000 (0/5000.000000/LOC) CPU/0 (1/12.000000/PHB) 
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Pattern 4, crossNic 0, nChannels 1, speed 10.000000/10.000000, type PHB/PIX, sameChannels 1
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO  0 : GPU/0 GPU/1 GPU/2 GPU/3
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Pattern 1, crossNic 0, nChannels 1, speed 10.000000/10.000000, type PHB/PIX, sameChannels 1
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO  0 : GPU/0 GPU/1 GPU/2 GPU/3
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Pattern 3, crossNic 0, nChannels 1, speed 10.000000/10.000000, type PHB/PIX, sameChannels 1
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO  0 : GPU/0 GPU/1 GPU/2 GPU/3
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Ring 00 : 1 -> 2 -> 3
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Ring 01 : 1 -> 2 -> 3
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Setting affinity for GPU 2 to 0fff
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Channel 00 : 2[300000] -> 3[400000] via direct shared memory
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Channel 01 : 2[300000] -> 3[400000] via direct shared memory
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Connected all rings
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Channel 00 : 2[300000] -> 1[200000] via direct shared memory
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Channel 01 : 2[300000] -> 1[200000] via direct shared memory
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO Connected all trees
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
c2b637b5f8b249148c78a16c718b30fb000000:43:215 [2] NCCL INFO comm 0x148cb0001240 rank 2 nranks 4 cudaDev 2 busId 300000 - Init COMPLETE
MLflow version: 2.3.2
Tracking URI: azureml:URI
Artifact URI: azureml:URI
World size: 4
local rank is 2 and world rank is 2
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data_2/cifar-10-python.tar.gz

170499072it [00:07, 23576583.80it/s]
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
Extracting ./data_2/cifar-10-python.tar.gz to ./data_2
Files already downloaded and verified

100%|██████████| 97.8M/97.8M [00:00<00:00, 130MB/s]
[Epoch 1] loss: 436.641
[Epoch 2] loss: 348.526
[Epoch 3] loss: 282.781
[Epoch 4] loss: 260.624
[Epoch 5] loss: 244.643
[Epoch 6] loss: 223.634
[Epoch 7] loss: 223.708
[Epoch 8] loss: 210.343
[Epoch 9] loss: 206.331
[Epoch 10] loss: 189.489
[Epoch 11] loss: 206.787
[Epoch 12] loss: 172.065
[Epoch 13] loss: 178.300
[Epoch 14] loss: 163.425
[Epoch 15] loss: 194.983
[Epoch 16] loss: 191.386
[Epoch 17] loss: 159.500
[Epoch 18] loss: 155.034
[Epoch 19] loss: 191.392
[Epoch 20] loss: 155.070
[Epoch 21] loss: 142.805
[Epoch 22] loss: 137.545
[Epoch 23] loss: 131.892
[Epoch 24] loss: 139.402
[Epoch 25] loss: 152.963
[Epoch 26] loss: 127.944
[Epoch 27] loss: 118.531
[Epoch 28] loss: 118.559
[Epoch 29] loss: 99.298
[Epoch 30] loss: 107.345
[Epoch 31] loss: 131.125
[Epoch 32] loss: 121.398
[Epoch 33] loss: 106.694
[Epoch 34] loss: 124.426
[Epoch 35] loss: 93.513
[Epoch 36] loss: 98.293
[Epoch 37] loss: 96.281
[Epoch 38] loss: 166.631
[Epoch 39] loss: 106.873
[Epoch 40] loss: 87.979
[Epoch 41] loss: 83.213
[Epoch 42] loss: 87.652
[Epoch 43] loss: 85.583
[Epoch 44] loss: 81.077
[Epoch 45] loss: 76.026
[Epoch 46] loss: 78.866
[Epoch 47] loss: 88.943
[Epoch 48] loss: 78.715
[Epoch 49] loss: 80.896
[Epoch 50] loss: 73.867
[Epoch 51] loss: 70.433
[Epoch 52] loss: 70.222
[Epoch 53] loss: 68.983
[Epoch 54] loss: 74.410
[Epoch 55] loss: 66.356
[Epoch 56] loss: 68.507
[Epoch 57] loss: 67.405
[Epoch 58] loss: 63.152
[Epoch 59] loss: 63.353
[Epoch 60] loss: 59.653
[Epoch 61] loss: 58.498
[Epoch 62] loss: 57.691
[Epoch 63] loss: 55.397
[Epoch 64] loss: 53.880
[Epoch 65] loss: 54.870
[Epoch 66] loss: 50.526
[Epoch 67] loss: 61.435
[Epoch 68] loss: 57.063
[Epoch 69] loss: 52.585

c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.0.0.4<32820>
c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] NCCL INFO include/socket.h:445 -> 2
c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] NCCL INFO include/socket.h:457 -> 2
c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] NCCL INFO bootstrap.cc:229 -> 2

c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 73)

c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory'
c2b637b5f8b249148c78a16c718b30fb000000:43:220 [2] NCCL INFO bootstrap.cc:231 -> 1

(the "[Rem Allocator] Allocation failed" / "Cuda failure 'out of memory'" warning block above repeats about a dozen more times)
[Epoch 70] loss: 54.840
[Epoch 71] loss: 48.762
[Epoch 72] loss: 44.449
[Epoch 73] loss: 50.534
[Epoch 74] loss: 48.013
[Epoch 75] loss: 47.643
[Epoch 76] loss: 43.079
[Epoch 77] loss: 48.345
[Epoch 78] loss: 45.675
[Epoch 79] loss: 44.037
[Epoch 80] loss: 44.107
[Epoch 81] loss: 42.572
[Epoch 82] loss: 44.492
[Epoch 83] loss: 38.794
[Epoch 84] loss: 41.004
[Epoch 85] loss: 43.954
[Epoch 86] loss: 39.967
[Epoch 87] loss: 38.832
[Epoch 88] loss: 49.026
[Epoch 89] loss: 41.993
[Epoch 90] loss: 40.688
[Epoch 91] loss: 37.602
[Epoch 92] loss: 36.748
[Epoch 93] loss: 39.834
[Epoch 94] loss: 36.769
[Epoch 95] loss: 33.398
[Epoch 96] loss: 39.392
[Epoch 97] loss: 36.343
[Epoch 98] loss: 35.395
[Epoch 99] loss: 32.032
[Epoch 100] loss: 31.387
WARNING:urllib3.connectionpool:Retrying (Retry(total=4, connect=5, read=4, redirect=5, status=5)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='eastus2.api.azureml.ms', port=443): Read timed out. (read timeout=120)")': /mlflow/v2.0/subscriptions/9be1367a-bcc9-4275-8b3d-a0469f4119fa/resourceGroups/some_name/providers/Microsoft.MachineLearningServices/workspaces/some_name/api/2.0/mlflow/runs/get?run_uuid=fb2256b5-01e2-441c-b08a-33434e225b34&run_id=fb2256b5-01e2-441c-b08a-33434e225b34
Test accuracy: inf %
Elapsed time:  5352.941336393356
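
What I think is missing is an explicit cross-rank reduction of the metrics before they are printed and logged, but I am not sure this is the right approach. Below is a rough, untested sketch of what I have in mind; it is only a fragment and reuses the names from the script above (the data loader, model, optimizer, device, world_size/world_rank, and the correct/total counters), not a complete program.

import torch
import torch.distributed as dist

# Training loss: accumulate the per-rank mean batch loss, then average across ranks.
running_loss = 0.0
num_batches = 0
for inputs, labels in train_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    num_batches += 1

epoch_loss = torch.tensor(running_loss / num_batches, device=device)
dist.all_reduce(epoch_loss, op=dist.ReduceOp.SUM)    # sum of per-rank mean losses
epoch_loss = epoch_loss.item() / world_size          # global mean across ranks
if world_rank == 0:
    mlflow.log_metric("train loss", epoch_loss, step=epoch)

# Test accuracy: sum the raw counts across ranks, then divide once.
correct_t = torch.tensor(correct, dtype=torch.long, device=device)  # int64 to avoid overflow
total_t = torch.tensor(total, dtype=torch.long, device=device)
dist.all_reduce(correct_t, op=dist.ReduceOp.SUM)
dist.all_reduce(total_t, op=dist.ReduceOp.SUM)
test_accuracy = 100.0 * correct_t.item() / total_t.item()  # no extra division by world_size

Is this the right way to aggregate these values under DDP?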