Problems running my code on a machine with 2 GPUs, using DataParallel

I’m a beginner with PyTorch. My code runs on my own computer without problems, but I’m now using a machine with 2 GPUs and I want to use them in the best way possible. When I try to run the code, it gives an error:

RuntimeError: Caught RuntimeError in replica 0 on device 0.

From what I’ve read in this forum, there might be a problem with some things being on the CPU and others being on the GPU, but I can’t understand what I’m doing wrong here.

This is the code from my model:

class NeuralNetworkClassifier:
    """Train and validate a classification model, logging metrics to an experiment.

    The wrapper owns the model, the loss function and the optimizer. The model
    is moved to CUDA when it is available and wrapped in ``nn.DataParallel``
    when more than one GPU is visible.

    The ``experiment`` object must provide ``train()`` and ``validate()``
    context managers and a ``log_metric(name, value, step=...)`` method; these
    are the only members this class calls on it.
    """

    def __init__(self, model, criterion, optimizer, optimizer_config: dict, experiment) -> None:
        """Build the optimizer for ``model`` and record the hyper-parameters.

        Args:
            model: The network to train.
            criterion: Callable computing the loss as ``criterion(outputs, targets)``.
            optimizer: Optimizer class (not an instance); it is called as
                ``optimizer(model.parameters(), **optimizer_config)``.
            optimizer_config: Keyword arguments for the optimizer. The dict is
                copied and never modified.
            experiment: Metric logger (see the class docstring).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Move the model first so the optimizer is built from the parameters
        # that live on the target device.
        self.model = model.to(self.device)
        self.optimizer = optimizer(self.model.parameters(), **optimizer_config)
        self.criterion = criterion
        self.experiment = experiment

        # Copy the config: bookkeeping keys such as "epochs" are added below and
        # must not leak into the dict the caller passed in.
        self.hyper_params = dict(optimizer_config)
        self._start_epoch = 0
        self.hyper_params["epochs"] = self._start_epoch
        self.__num_classes = None
        self._is_parallel = False

        if torch.cuda.device_count() > 1:
            self.model = nn.DataParallel(self.model)
            self._is_parallel = True

            notice = "Running on {} GPUs.".format(torch.cuda.device_count())
            print("\033[33m" + notice + "\033[0m")

    def _to_device(self, batch):
        """Move a tensor, or every tensor in an iterable of tensors, to ``self.device``."""
        if isinstance(batch, torch.Tensor):
            return batch.to(self.device)
        return [item.to(self.device) for item in batch]

    def _save_checkpoint(self, checkpoint_path: str, epoch: int) -> None:
        """Write the model and optimizer state to ``checkpoint_path``.

        NOTE(review): the original body of the checkpoint branch in ``fit`` was
        not present in the source; this is a minimal reconstruction. Confirm the
        expected checkpoint layout before relying on it.
        """
        # Unwrap DataParallel so the saved keys carry no "module." prefix.
        network = self.model.module if self._is_parallel else self.model
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": network.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
            },
            checkpoint_path,
        )

    def fit(self, loader: Dict[str, DataLoader], epochs: int, checkpoint_path: str = None, validation: bool = True) -> None:
        """Train the model, optionally validating after every epoch.

        Args:
            loader: Mapping with a ``"train"`` DataLoader and, when
                ``validation`` is true, a ``"val"`` DataLoader. Each batch is
                an ``(x, y)`` pair; ``x`` is a tensor or an iterable of tensors.
            epochs: Number of epochs to run, counted from ``self._start_epoch``.
            checkpoint_path: When given, a checkpoint is written there at every
                epoch whose index is a multiple of 100 (including epoch 0).
            validation: Whether to evaluate on ``loader["val"]`` each epoch.
        """
        len_of_train_dataset = len(loader["train"].dataset)
        epochs = epochs + self._start_epoch

        self.hyper_params["epochs"] = epochs
        self.hyper_params["batch_size"] = loader["train"].batch_size
        self.hyper_params["train_ds_size"] = len_of_train_dataset

        if validation:
            len_of_val_dataset = len(loader["val"].dataset)
            self.hyper_params["val_ds_size"] = len_of_val_dataset

        for epoch in range(self._start_epoch, epochs):
            if checkpoint_path is not None and epoch % 100 == 0:
                self._save_checkpoint(checkpoint_path, epoch)

            with self.experiment.train():
                correct = 0.0
                total = 0.0

                # Enable training-only behaviour (e.g. dropout) for this phase.
                self.model.train()
                with tqdm.tqdm(total=len_of_train_dataset) as pbar:
                    for x, y in loader["train"]:
                        b_size = y.shape[0]
                        total += y.shape[0]
                        x = self._to_device(x)
                        y = y.to(self.device)

                        pbar.set_description(
                            "\033[36m" + "Training" + "\033[0m" + " - Epochs: {:03d}/{:03d}".format(epoch+1, epochs)
                        )
                        pbar.update(b_size)

                        # Gradients accumulate by default, so clear them per batch.
                        self.optimizer.zero_grad()
                        outputs = self.model(x)
                        loss = self.criterion(outputs, y)
                        loss.backward()
                        self.optimizer.step()

                        _, predicted = torch.max(outputs, 1)
                        correct += (predicted == y).sum().float().cpu().item()

                        self.experiment.log_metric("loss", loss.cpu().item(), step=epoch)
                        self.experiment.log_metric("accuracy", float(correct / total), step=epoch)

            if validation:
                with self.experiment.validate(), torch.no_grad():
                    val_correct = 0.0
                    val_total = 0.0

                    # Switch off training-only behaviour while evaluating.
                    self.model.eval()
                    for x_val, y_val in loader["val"]:
                        val_total += y_val.shape[0]
                        x_val = self._to_device(x_val)
                        y_val = y_val.to(self.device)

                        val_output = self.model(x_val)
                        val_loss = self.criterion(val_output, y_val)
                        _, val_pred = torch.max(val_output, 1)
                        val_correct += (val_pred == y_val).sum().float().cpu().item()

                        self.experiment.log_metric("loss", val_loss.cpu().item(), step=epoch)
                        self.experiment.log_metric("accuracy", float(val_correct / val_total), step=epoch)


This is the code I use to run the model and the error I get in the end:

# Driver script: builds the classifier, splits the data into train/val/test
# sets, wraps them in DataLoaders and starts training.

# Arguments passed positionally to the SAnD constructor below.
in_feature = 37
seq_len = 10
n_heads = 32
factor = 32
num_class = 10
num_layers = 6

clf = NeuralNetworkClassifier(
    SAnD(in_feature, seq_len, n_heads, factor, num_class, num_layers),
    optim.Adam, optimizer_config={"lr": 1e-5, "betas": (0.9, 0.98), "eps": 4e-09, "weight_decay": 5e-4},
# NOTE(review): this call is never closed, and the class signature is
# (model, criterion, optimizer, optimizer_config, experiment) while only the
# model, optim.Adam and optimizer_config are passed here. The criterion and
# experiment arguments appear to have been lost from this snippet.

# data is a 3D array with shape (6765, 10, 37)
# feature_target is an array of length 6765

# Hold out 20% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(data, feature_target, test_size=0.2)

# Split a validation set off the remaining training data.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

# Convert the feature arrays to tensors; nothing here moves them to `device`.
val_dataset = torch.from_numpy(x_val)
test_dataset = torch.from_numpy(x_test)
train_dataset = torch.from_numpy(x_train)

# Convert the label arrays to tensors.
y_train = torch.from_numpy(y_train)
y_val = torch.from_numpy(y_val)
y_test = torch.from_numpy(y_test)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Re-create the labels as int64 tensors on `device`.
# NOTE(review): the labels end up on `device` while the feature tensors above
# are left where torch.from_numpy created them, so each dataset pairs tensors
# from different devices when CUDA is available. The inputs are already
# tensors here, so torch.tensor() copies them; presumably a plain dtype/device
# conversion was intended. Confirm.
y_train = torch.tensor(y_train, dtype=torch.long, device=device)
y_val = torch.tensor(y_val, dtype=torch.long, device=device)
y_test = torch.tensor(y_test, dtype=torch.long, device=device)

# Pair features with labels.
train_ds = TensorDataset(train_dataset, y_train)
val_ds = TensorDataset(val_dataset, y_val)
test_ds = TensorDataset(test_dataset, y_test)

# Batches of 128 samples, in dataset order (no shuffle argument is given).
train_loader = DataLoader(train_ds, batch_size=128)
val_loader = DataLoader(val_ds, batch_size=128)
test_loader = DataLoader(test_ds, batch_size=128)

# training network
# NOTE(review): the head of this call (presumably `clf.fit(`) and its remaining
# arguments, such as the epoch count, are missing from the snippet.
    {"train": train_loader,
     "val": val_loader},
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/parallel/", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/", line 49, in forward
    x = self.encoder(x)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/", line 25, in forward
    x = l(x)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/", line 84, in forward
    x = self.attention(x)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/Desktop/SAnD/core/", line 45, in forward
    output, self.attn_weights = self.layer(src, src, src)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/modules/", line 980, in forward
    return F.multi_head_attention_forward(
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/", line 4633, in multi_head_attention_forward
    q, k, v = linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
  File "/home/pdomingues/anaconda3/lib/python3.8/site-packages/torch/nn/", line 1753, in linear
    return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

What am I doing wrong?
Also, since I want to get the most out of my 2 GPUs, should I be using DistributedDataParallel instead? If so, do I need to make major changes to my code?

The cuBLAS error might be raised if you are running out of memory, or if cuBLAS is indeed running into an internal error.
If you have made sure that there is enough memory and reducing the batch size doesn’t solve it, you could create the cuBLAS logs so that we can check them.

Yes, you should use DDP for the best performance. This tutorial gives you an overview.