Distributed app example is crashing with an error
I have a PC with two RTX 2070s on which a stand-alone CUDA application runs just fine, but the distributed (DataParallel) example fails:
python3 ex1.py
Let’s use 2 GPUs!
/home/nonroot/.local/lib/python3.9/site-packages/torch/nn/modules/linear.py:125: UserWarning: Attempting to run cuBLAS, but there was no current CUDA context! Attempting to set the primary context… (Triggered internally at …/aten/src/ATen/cuda/CublasHandlePool.cpp:135.)
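By a stand-alone CUDA application I mean a single-process, single-GPU workload, roughly like this simplified sketch (not the exact code):

import torch

# Run a cuBLAS matmul on each GPU individually, one card at a time.
for i in range(torch.cuda.device_count()):
    x = torch.randn(1000, 1000, device=f"cuda:{i}")
    y = x @ x
    print(f"cuda:{i}", torch.cuda.get_device_name(i), float(y.sum()))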
URL and sample code from the PyTorch tutorial itself:
# https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
import sys
sys.path.append('..')
#from classes import *
DEBUG = 0
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Parameters and DataLoaders
input_size = 1000
output_size = 10
batch_size = 1000
data_size = 60000
if not torch.cuda.is_available():
    print("GPU is not detected.")
    quit(1)
device = torch.device("cuda:0")
class RandomDataset(Dataset):
    DEBUG = 0
    DEBUGL2 = 0

    def __init__(self, size, length):
        if self.DEBUG:
            print("RandomDataset.__init__(size=", size, "length: ", length, ")")
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        if self.DEBUGL2:
            print("RandomDataset.__getitem__(index=", index, ")")
        return self.data[index]

    def __len__(self):
        if self.DEBUG:
            print("RandomDataset.__len__() returning self.len: ", self.len)
        return len(self.data)

# Create random data set: input_size = 1k, data_size = 60k, batch_size = 1k.
rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)
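# NOTE: Model comes from the commented-out `classes` import above; for a
# self-contained repro, the definition from the linked tutorial (with its
# print guarded by DEBUG to match the rest of this script) is:
class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        if DEBUG:
            print("\tIn Model: input size", input.size(),
                  "output size", output.size())
        return output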
model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
model.to(device)

for data in rand_loader:
    input = data.to(device)
    output = model(input)
    if DEBUG:
        print("Outside: input size", input.size(), "output_size", output.size())
CUDA environment:
nvidia-smi
Mon Nov 25 22:02:10 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01 Driver Version: 565.57.01 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 2070 Off | 00000000:01:00.0 On | N/A |
| 41% 33C P8 6W / 185W | 531MiB / 8192MiB | 4% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 2070 ... Off | 00000000:02:00.0 Off | N/A |
| 41% 28C P8 4W / 215W | 2MiB / 8192MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 4019 G /usr/libexec/Xorg 127MiB |
| 0 N/A N/A 5580 G /usr/bin/gnome-shell 48MiB |
| 0 N/A N/A 6805 G /usr/lib64/firefox/firefox 23MiB |
| 0 N/A N/A 8665 G /usr/lib64/firefox/firefox 191MiB |
| 0 N/A N/A 9447 G /usr/lib64/firefox/firefox 136MiB |
+-----------------------------------------------------------------------------------------+
[guyen@localhost 1-dataparallellism]$ sudo dkms status
nvidia/565.57.01, 5.14.0-407.el9.x86_64, x86_64: installed
sudo yum list installed | grep -i cuda | grep -i toolkit
cuda-toolkit-12-2.x86_64 12.2.2-1 @cuda-rhel9-x86_64
cuda-toolkit-12-2-config-common.noarch 12.2.140-1 @cuda-rhel9-x86_64
cuda-toolkit-12-config-common.noarch 12.6.77-1 @cuda-rhel9-x86_64
cuda-toolkit-config-common.noarch 12.6.77-1 @cuda-rhel9-x86_64
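For completeness, the PyTorch side of the environment can be checked with something like this (a minimal sketch):

import torch

# Report the PyTorch build, the CUDA runtime it was compiled against, and the visible GPUs.
print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
print("visible GPUs:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"  cuda:{i}:", torch.cuda.get_device_name(i))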