Hi,
I'm a beginner with PyTorch. My code runs fine on my own computer, but now I'm working on a machine with 2 GPUs and I want to use them as efficiently as possible. When I try to run the code there, it fails with:

```
RuntimeError: Caught RuntimeError in replica 0 on device 0.
```

From what I've read on this forum, the usual cause is that some tensors are on the CPU while others are on the GPU, but I can't see what I'm doing wrong here.
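Is this even the right way to check where things actually live? (The names come from my code further down.)

```python
# my attempt at checking device placement (clf and train_loader are defined below)
print(next(clf.model.parameters()).device)  # device of the model weights
xb, yb = next(iter(train_loader))           # one batch from the training loader
print(xb.device, yb.device)                 # device of the inputs and targets
```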
This is the code for my model:
```python
from typing import Dict

import torch
import torch.nn as nn
import tqdm
from torch.utils.data import DataLoader


class NeuralNetworkClassifier:
    def __init__(self, model, criterion, optimizer, optimizer_config: dict, experiment) -> None:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.optimizer = optimizer(self.model.parameters(), **optimizer_config)
        self.criterion = criterion
        self.experiment = experiment
        self.hyper_params = optimizer_config
        self._start_epoch = 0
        self.hyper_params["epochs"] = self._start_epoch
        self.__num_classes = None

        self._is_parallel = False
        if torch.cuda.device_count() > 1:
            self.model = nn.DataParallel(self.model)
            self._is_parallel = True

            notice = "Running on {} GPUs.".format(torch.cuda.device_count())
            print("\033[33m" + notice + "\033[0m")

    def fit(self, loader: Dict[str, DataLoader], epochs: int, checkpoint_path: str = None, validation: bool = True) -> None:
        len_of_train_dataset = len(loader["train"].dataset)
        epochs = epochs + self._start_epoch

        self.hyper_params["epochs"] = epochs
        self.hyper_params["batch_size"] = loader["train"].batch_size
        self.hyper_params["train_ds_size"] = len_of_train_dataset
        if validation:
            len_of_val_dataset = len(loader["val"].dataset)
            self.hyper_params["val_ds_size"] = len_of_val_dataset
        self.experiment.log_parameters(self.hyper_params)

        for epoch in range(self._start_epoch, epochs):
            if checkpoint_path is not None and epoch % 100 == 0:
                self.save_to_file(checkpoint_path)

            with self.experiment.train():
                correct = 0.0
                total = 0.0

                self.model.train()
                pbar = tqdm.tqdm(total=len_of_train_dataset)
                for x, y in loader["train"]:
                    b_size = y.shape[0]
                    total += y.shape[0]
                    x = x.to(self.device) if isinstance(x, torch.Tensor) else [i.to(self.device) for i in x]
                    y = y.to(self.device)

                    pbar.set_description(
                        "\033[36m" + "Training" + "\033[0m" + " - Epochs: {:03d}/{:03d}".format(epoch + 1, epochs)
                    )
                    pbar.update(b_size)

                    self.optimizer.zero_grad()
                    outputs = self.model(x)
                    loss = self.criterion(outputs, y)
                    loss.backward()
                    self.optimizer.step()

                    _, predicted = torch.max(outputs, 1)
                    correct += (predicted == y).sum().float().cpu().item()

                    self.experiment.log_metric("loss", loss.cpu().item(), step=epoch)
                    self.experiment.log_metric("accuracy", float(correct / total), step=epoch)

            if validation:
                with self.experiment.validate():
                    with torch.no_grad():
                        val_correct = 0.0
                        val_total = 0.0

                        self.model.eval()
                        for x_val, y_val in loader["val"]:
                            val_total += y_val.shape[0]
                            x_val = x_val.to(self.device) if isinstance(x_val, torch.Tensor) else [i_val.to(self.device) for i_val in x_val]
                            y_val = y_val.to(self.device)

                            val_output = self.model(x_val)
                            val_loss = self.criterion(val_output, y_val)
                            _, val_pred = torch.max(val_output, 1)
                            val_correct += (val_pred == y_val).sum().float().cpu().item()

                            self.experiment.log_metric("loss", val_loss.cpu().item(), step=epoch)
                            self.experiment.log_metric("accuracy", float(val_correct / val_total), step=epoch)

            pbar.close()
```
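If I understand `nn.DataParallel` correctly, it replicates the model on each GPU and splits each input batch along dim 0, so my batches of 128 should become two chunks of 64. Would a standalone test with random inputs, like this sketch (shapes taken from my real data below), help isolate whether the problem is the model or my data pipeline?

```python
# standalone sanity check I had in mind: random inputs with my real shapes
# (batch=128, seq_len=10, in_feature=37); constructor arguments as in my code below
model = nn.DataParallel(SAnD(37, 10, 32, 32, 10, 6)).cuda()
x = torch.randn(128, 10, 37, device="cuda")  # DataParallel should split this into 2 x 64
out = model(x)
print(out.shape)  # I would expect a (128, 10) output for 10 classes
```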
This is the code I use to run the model:
```python
in_feature = 37
seq_len = 10
n_heads = 32
factor = 32
num_class = 10
num_layers = 6

clf = NeuralNetworkClassifier(
    SAnD(in_feature, seq_len, n_heads, factor, num_class, num_layers),
    nn.CrossEntropyLoss(),
    optim.Adam,
    optimizer_config={"lr": 1e-5, "betas": (0.9, 0.98), "eps": 4e-09, "weight_decay": 5e-4},
    experiment=teste
)

# data is a 3D array with shape (6765, 10, 37)
# feature_target is an array of length 6765
x_train, x_test, y_train, y_test = train_test_split(data, feature_target, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2

train_dataset = torch.from_numpy(x_train)
val_dataset = torch.from_numpy(x_val)
test_dataset = torch.from_numpy(x_test)
y_train = torch.from_numpy(y_train)
y_val = torch.from_numpy(y_val)
y_test = torch.from_numpy(y_test)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = train_dataset.to(device, dtype=torch.float)
val_dataset = val_dataset.to(device, dtype=torch.float)
test_dataset = test_dataset.to(device, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.long, device=device)
y_val = torch.tensor(y_val, dtype=torch.long, device=device)
y_test = torch.tensor(y_test, dtype=torch.long, device=device)

train_ds = TensorDataset(train_dataset, y_train)
val_ds = TensorDataset(val_dataset, y_val)
test_ds = TensorDataset(test_dataset, y_test)

train_loader = DataLoader(train_ds, batch_size=128)
val_loader = DataLoader(val_ds, batch_size=128)
test_loader = DataLoader(test_ds, batch_size=128)

# training network
clf.fit(
    {"train": train_loader, "val": val_loader},
    epochs=100
)
```
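One thing I wondered about while writing this: I move the whole dataset to the GPU before building the `TensorDataset`, even though `fit()` already calls `.to(self.device)` on every batch. Would keeping the tensors on the CPU and letting the loop do the transfer, roughly like this, be the more standard pattern? (`pin_memory` is just something I saw recommended in the docs.)

```python
# alternative I considered: keep the datasets on the CPU, let fit() move each batch
train_ds = TensorDataset(torch.from_numpy(x_train).float(), torch.from_numpy(y_train).long())
val_ds = TensorDataset(torch.from_numpy(x_val).float(), torch.from_numpy(y_val).long())

# pin_memory=True should speed up the per-batch host-to-GPU copies
train_loader = DataLoader(train_ds, batch_size=128, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=128, pin_memory=True)
```

Anyway, here is the full traceback I get: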
```
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/model.py", line 49, in forward
    x = self.encoder(x)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/model.py", line 25, in forward
    x = l(x)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/modules.py", line 84, in forward
    x = self.attention(x)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/modules.py", line 45, in forward
    output, self.attn_weights = self.layer(src, src, src)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/activation.py", line 980, in forward
    return F.multi_head_attention_forward(
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/functional.py", line 4633, in multi_head_attention_forward
    q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/functional.py", line 1753, in linear
    return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
```
What am I doing wrong?

Also, since I want to get the most out of my 2 GPUs, should I be using `DistributedDataParallel` instead? If so, do I need to make major changes to my code?
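From the DDP tutorial, my understanding is that the minimal version would look roughly like the skeleton below (one process per GPU, with a `DistributedSampler` so each process sees its own shard; `SAnD` and `train_ds` are from my code above). This is just my reading of the tutorial, not tested code, so please correct me if I got it wrong:

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def run(rank, world_size):
    # one process per GPU; rank identifies this process / GPU
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

    model = DDP(SAnD(37, 10, 32, 32, 10, 6).to(rank), device_ids=[rank])

    # each process gets a different shard of the training data
    sampler = DistributedSampler(train_ds, num_replicas=world_size, rank=rank)
    loader = DataLoader(train_ds, batch_size=128, sampler=sampler)

    # ... same training loop as in fit(), moving each batch to `rank` ...

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(run, args=(world_size,), nprocs=world_size)
```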