RuntimeError: arguments are located on different GPUs at /pytorch/aten/src/THC/generic/THCTensorIndex.cu:403

Hi there,

I have a PyTorch BERT model that was originally trained on a single GPU. I have now moved it to 4 GPUs because of memory issues, but after the move I get the error below during the validation part:

RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/_utils.py", line 395, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ec2-user/SageMaker/Anecdotes/model.py", line 117, in forward
    inputs_embeds=None,
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 727, in forward
    input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/transformers/modeling_bert.py", line 174, in forward
    inputs_embeds = self.word_embeddings(input_ids)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/sparse.py", line 114, in forward
    self.norm_type, self.scale_grad_by_freq, self.sparse)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/functional.py", line 1724, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: arguments are located on different GPUs at /pytorch/aten/src/THC/generic/THCTensorIndex.cu:403

My model.py is as below:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import MultiheadAttention, EmbeddingBag, CrossEntropyLoss, MultiLabelSoftMarginLoss, BCEWithLogitsLoss
from transformers import BertPreTrainedModel, BertModel, BertTokenizer, AdamW

from utils import LABEL_NAME


class ReviewClassification(BertPreTrainedModel):
    def __init__(self, config,
                 add_agent_text, agent_text_heads):
        """
        :param config: BERT configuration; can set options such as output_attentions and output_hidden_states
        :param add_agent_text: whether to use the agent text feature, and how.
                It can have three options: None, "concat" and "attention"
        :param agent_text_heads: number of heads in the agent attention mechanism. Only used if add_agent_text is set to
                "attention"
        """
        super().__init__(config)
        # self.num_labels = 2
        self.add_agent_text = add_agent_text

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        embedding_size = config.hidden_size

        if self.add_agent_text == "concat":
            embedding_size = 2 * embedding_size
        elif self.add_agent_text == "attention":
            self.agent_attention = nn.MultiheadAttention(embedding_size, num_heads=agent_text_heads)
        else:
            # don't use the information in Agent text
            pass

        self.classifier = nn.Linear(embedding_size, 1) # self.classifier = nn.Linear(embedding_size, len(LABEL_NAME)) # bias: If set to False, the layer will not learn an additive bias
        self.init_weights()

        print(
            """            
            add agent text         :{}
            agent text multi-head  :{}
            """.format(self.add_agent_text, agent_text_heads)
        )

    def forward(
            self,
            review_input_ids=None,
            review_attention_mask=None,
            review_token_type_ids=None,
            agent_input_ids=None,
            agent_attention_mask=None,
            agent_token_type_ids=None,
            labels=None,
    ):
        """
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """

        review_outputs = self.bert(
            review_input_ids,
            attention_mask=review_attention_mask,
            token_type_ids=review_token_type_ids,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
        )
        if self.add_agent_text is not None:
            # means that self.add_agent_text is "concat" or "attention"
            # TODO: we can try that agent_outputs do not share the same parameter
            agent_outputs = self.bert(
                agent_input_ids,
                attention_mask=agent_attention_mask,
                token_type_ids=agent_token_type_ids,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
            )

        if self.add_agent_text == "attention":
            review_hidden_states = review_outputs[0].transpose(0, 1)  # before trans: (bs, seq_len, hidden_size)
            agent_hidden_states = agent_outputs[0].mean(axis=1).unsqueeze(dim=0)  # (1, batch_size, hidden_size)

            attn_output, _ = self.agent_attention(agent_hidden_states, review_hidden_states, review_hidden_states)
            feature = attn_output.squeeze()  # (batch_size, hidden_size)
        else:
            feature = review_outputs[1]  # pooled [CLS] output: (batch_size, hidden_size)

        if self.add_agent_text == "concat":
            feature = torch.cat([feature, agent_outputs[1]], axis=1)
       

        # nn.CrossEntropyLoss applies F.log_softmax and nn.NLLLoss internally on your input,
        # so you should pass the raw logits to it.

        # torch.nn.functional.binary_cross_entropy takes logistic sigmoid values as inputs
        # torch.nn.functional.binary_cross_entropy_with_logits takes logits as inputs
        # torch.nn.functional.cross_entropy takes logits as inputs (performs log_softmax internally)
        # torch.nn.functional.nll_loss is like cross_entropy but takes log-probabilities (log-softmax) values as inputs

        # CrossEntropyLoss takes prediction logits (size: (N,D)) and target labels (size: (N,)) 
        # CrossEntropyLoss expects logits i.e whereas BCELoss expects probability value
        logits = self.classifier(feature).squeeze()

        outputs = (logits,)  # + outputs[2:]  # add hidden states and attention if they are here


        if labels is not None:
            ##### original
            # loss_fct = MultiLabelSoftMarginLoss()
            # loss = loss_fct(logits, labels)
            # outputs = (loss,) + outputs
            #### Version 1 try
            # pos_weight = dataset.label_proportion.iloc[0]/dataset.label_proportion.iloc[1]

            # Version 1.1 for weight
            # weight = torch.tensor([0.101521, 0.898479]) # hard code from entire training dataset
            # pos_weight = weight[labels.data.view(-1).long()].view_as(labels)
            # Version 1.2 for weight
            pos_weight=torch.tensor(1)
            # Version 1.3 for weight
            #weight = torch.tensor([1.0, 8.85]) # hard code from entire training dataset
            #pos_weight = weight[labels.data.view(-1).long()].view_as(labels)
            
            loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight).cuda() 
            loss = loss_fct(logits, labels)
            # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
            outputs = (loss,) + outputs
            ### Version 2 try
            # loss_fct = nn.CrossEntropyLoss()
            # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            # outputs = (loss,) + outputs

        return outputs  # (loss, logits, hidden_states, attentions)

And my train_valid_test.py, which handles the training, validation, and test process, is as below:

import time
import pickle
from path import Path
import numpy as np
import pandas as pd

from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
import torch
import torch.nn as nn

from utils import LABEL_NAME, isnotebook, set_seed, format_time

if isnotebook():
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm


set_seed(seed=228)

def model_train(model, train_data_loader, valid_data_loader, test_data_loader,
                logger, optimizer, scheduler, num_epochs, seed, out_dir):
    # move model to gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    num_gpus = torch.cuda.device_count()
    logger.info("Let's use {} GPUs!".format(num_gpus))

    # Set the seed value all over the place to make this reproducible.
#     set_seed(seed=seed)

    # We'll store a number of quantities such as training and validation loss,
    # validation accuracy, and timings.
    training_stats = []
    print_interval = 100

    # Measure the total training time for the whole run.
    total_t0 = time.time()
    batch_size = train_data_loader.batch_size
    num_batch = len(train_data_loader)
    best_f1_score = {
        "weighted": 0,
        "averaged": 0
    }
    best_test_f1_score = 0

    # For each epoch...
    for epoch_i in range(0, num_epochs):

        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.
        logger.info("")
        logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_epochs))
        logger.info('Training...')

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Measure how long the training epoch takes.
        t_train = time.time()

        model.train()

        # For each batch of training data...
        for step, batch in tqdm(enumerate(train_data_loader), desc="Training Iteration", total=num_batch):
            # Progress update every 100 batches.
            if step % print_interval == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t_train)
                avg_train_loss = total_train_loss / print_interval

                # Report progress.
                logger.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:.3e} | loss {:5.3f} | Elapsed {:s}'.format(
                    epoch_i+1, step, num_batch, scheduler.get_last_lr()[0], avg_train_loss, elapsed)
                )
                total_train_loss = 0
                training_stats.append(
                    {
                        'epoch': epoch_i + 1,
                        'step': step,
                        'train loss': avg_train_loss,
                    }
                )

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains four pytorch tensors:
            #   "input_ids"
            #   "attention_mask"
            #   "token_type_ids"
            #   "binarized_labels"

            b_review_input_ids = batch["review_input_ids"].to(device)
            b_review_attention_mask = batch["review_attention_mask"].to(device)
            b_review_token_type_ids = batch["review_token_type_ids"].to(device)
            b_agent_input_ids = batch["agent_input_ids"].to(device)
            b_agent_attention_mask = batch["agent_attention_mask"].to(device)
            b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)

            b_binarized_label = batch["binarized_label"].to(device)

            model.zero_grad()
            (loss, _) = model(review_input_ids=b_review_input_ids,
                              review_attention_mask=b_review_attention_mask,
                              review_token_type_ids=b_review_token_type_ids,
                              agent_input_ids=b_agent_input_ids,
                              agent_attention_mask=b_agent_attention_mask,
                              agent_token_type_ids=b_agent_token_type_ids,

                              labels=b_binarized_label
                              )

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.

            if num_gpus > 1:
                total_train_loss += loss.mean().item()
                loss.mean().backward()  # use loss.mean().backward() instead of loss.backward() for multiple gpu trainings
            else:
                total_train_loss += loss.item()
                loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            scheduler.step()
        # End of training epoch

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t_train)

        logger.info("")
        logger.info("  Training epoch took: {:s}".format(training_time))

        # evaluate the model after one epoch.

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        logger.info("")
        logger.info("Validating...")

        t_valid = time.time()
        model.eval()
        ave_valid_loss, valid_f1_table, cm_table, f1_score = model_validate(model=model, data_loader=valid_data_loader)
        # Measure how long this epoch took.
        validation_time = format_time(time.time() - t_valid)

        logger.info("")
        logger.info('| loss {:5.3f} | Elapsed {:s}'.format(ave_valid_loss, validation_time))
        logger.info("  \n{:s}".format(valid_f1_table.to_string()))
        logger.info("")
        logger.info("  \n{:s}".format(cm_table.to_string()))

        # need to store the best model
        for key in best_f1_score.keys():
            if best_f1_score[key] < f1_score[key]:
                # remove the old model:
                file_list = [f for f in out_dir.files() if f.name.endswith(".pt") and f.name.startswith(key)]
                for f in file_list:
                    Path.remove(f)
                model_file = out_dir.joinpath('{:s}_epoch_{:02d}-f1_{:.3f}.pt'.format(
                    key, epoch_i + 1, f1_score[key])
                )
                best_f1_score[key] = f1_score[key]
                if num_gpus > 1:
                    torch.save(model.module.state_dict(), model_file)
                else:
                    torch.save(model.state_dict(), model_file)

        # ========================================
        #               Test
        # ========================================
        logger.info("")
        logger.info("Testing...")

        result_df = model_test(model=model, data_loader=test_data_loader)
    
        y_true = np.array(result_df["review_label"], dtype=np.bool) # This part may need double check
        y_pred = result_df["Probability"] > 0.5

        report = classification_report(y_true, y_pred, output_dict=True)
        metrics_df = pd.DataFrame(report).transpose()

        metrics_df = metrics_df.sort_index()

        weighted_f1_score = metrics_df.loc['weighted avg', 'f1-score']
        averaged_f1_score = metrics_df.loc['macro avg', 'f1-score']

        best_test_f1_score = metrics_df.loc['weighted avg', 'f1-score'] \
            if best_test_f1_score < metrics_df.loc['weighted avg', 'f1-score'] else best_test_f1_score

        metrics_df = metrics_df.astype(float).round(3)

        # Calculate confusion matrix
        tn, fp, fn, tp  = confusion_matrix(y_true, y_pred).ravel()
        cm_df = pd.DataFrame(columns = ['Predicted No', 'Predicted Yes'],  
                       index = ['Actual No', 'Actual Yes']) 
        # adding rows to an empty  
        # dataframe at existing index 
        cm_df.loc['Actual No'] = [tn,fp] 
        cm_df.loc['Actual Yes'] = [fn,tp]
        
        logger.info("use model: {} batch / {} step".format(epoch_i + 1, step))
        logger.info("\n" + "=" * 50)
        logger.info("\n" + metrics_df.to_string())
        logger.info("\n" + "=" * 50)
        logger.info("\n" + cm_df.to_string())
        logger.info("best test F1 score: {}".format(best_test_f1_score))
        logger.info("\n" + "=" * 50)
        # Below is to save the result files
#         result_filename = "result_df_epoch_" + str(epoch_i + 1) + ".xlsx"
#         result_df.to_excel(out_dir.joinpath(result_filename), index=False)

    logger.info("")
    logger.info("Training complete!")
    logger.info("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

    # Save training_stats to csv file
    pd.DataFrame(training_stats).to_csv(out_dir.joinpath("model_train.log"), index=False)
    return model, optimizer, scheduler


def model_validate(model, data_loader):
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    label_prop = data_loader.dataset.dataset.label_prop()

    total_valid_loss = 0

    batch_size = data_loader.batch_size
    num_batch = len(data_loader)

    y_pred, y_true = [], []

    # Evaluate data
    for step, batch in tqdm(enumerate(data_loader), desc="Validation...", total=num_batch):
        b_review_input_ids = batch["review_input_ids"].to(device)
        b_review_attention_mask = batch["review_attention_mask"].to(device)
        b_review_token_type_ids = batch["review_token_type_ids"].to(device)
        b_agent_input_ids = batch["agent_input_ids"].to(device)
        b_agent_attention_mask = batch["agent_attention_mask"].to(device)
        b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)

        b_binarized_label = batch["binarized_label"].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            (loss, logits,) = model(review_input_ids=b_review_input_ids,
                                    review_attention_mask=b_review_attention_mask,
                                    review_token_type_ids=b_review_token_type_ids,
                                    agent_input_ids=b_agent_input_ids,
                                    agent_attention_mask=b_agent_attention_mask,
                                    agent_token_type_ids=b_agent_token_type_ids,

                                    labels=b_binarized_label)

        total_valid_loss += loss.item()
        ### The sigmoid function is used for the two-class logistic regression, 
        ### whereas the softmax function is used for the multiclass logistic regression
        
        # Version 1
        # numpy_probas = logits.detach().cpu().numpy()
        # y_pred.extend(np.argmax(numpy_probas, axis=1).flatten())
        # y_true.extend(b_binarized_label.cpu().numpy())

        # Version 2
        # transfored_logits = F.log_softmax(logits,dim=1)
        # numpy_probas = transfored_logits.detach().cpu().numpy()
        # y_pred.extend(np.argmax(numpy_probas, axis=1).flatten())
        # y_true.extend(b_binarized_label.cpu().numpy())

        # Version 3
        # transfored_logits = torch.sigmoid(logits)
        # numpy_probas = transfored_logits.detach().cpu().numpy()
        # y_pred.extend(np.argmax(numpy_probas, axis=1).flatten())
        # y_true.extend(b_binarized_label.cpu().numpy())

        # New version - for num_label = 1
        transfored_logits = torch.sigmoid(logits)
        numpy_probas = transfored_logits.detach().cpu().numpy()
        y_pred.extend(numpy_probas)
        y_true.extend(b_binarized_label.cpu().numpy())
        
    # End of an epoch of validation

    # put model to train mode again.
    model.train()

    ave_loss = total_valid_loss / (num_batch * batch_size)

    y_pred = np.array(y_pred)
    y_pred[y_pred < 0.5] = 0
    y_pred[y_pred >= 0.5] = 1
    
    # Below is in case the input and target are not the same data format
    y_pred = np.array(y_pred, dtype=np.bool)
    y_true = np.array(y_true, dtype=np.bool)
    
    
    # compute the various f1 score for each label
    report = classification_report(y_true, y_pred, output_dict=True)
    metrics_df = pd.DataFrame(report).transpose()
    # metrics_df = pd.DataFrame(0, index=LABEL_NAME, columns=["Precision", "Recall", "F1","support"])
    # metrics_df.Precision = precision_recall_fscore_support(y_true, y_pred)[0]
    # metrics_df.Recall = precision_recall_fscore_support(y_true, y_pred)[1]
    # metrics_df.F1 = precision_recall_fscore_support(y_true, y_pred)[2]
    # metrics_df.support = precision_recall_fscore_support(y_true, y_pred)[3]

    # y_pred = np.array(y_pred)
    # y_pred[y_pred < 0] = 0
    # y_pred[y_pred > 0] = 1
    # y_pred = np.array(y_pred, dtype=np.bool)
    # y_true = np.array(y_true, dtype=np.bool)

    # metrics_df = pd.DataFrame(0, index=LABEL_NAME, columns=["Precision", "Recall", "F1"], dtype=np.float)
    # # or_y_pred = np.zeros(y_pred.shape[0], dtype=np.bool)
    # # or_y_true = np.zeros(y_true.shape[0], dtype=np.bool)
    # for i in range(len(LABEL_NAME)):
    #     metrics_df.iloc[i] = precision_recall_fscore_support(
    #         y_true=y_true[:, i], y_pred=y_pred[:, i], average='binary', zero_division=0)[0:3]

        # or_y_pred = or_y_pred | y_pred[:, i]
        # or_y_true = or_y_true | y_true[:, i]

    metrics_df = metrics_df.sort_index()
    # metrics_df.loc['Weighted Average'] = metrics_df.transpose().dot(label_prop)
    # metrics_df.loc['Average'] = metrics_df.mean()

    # metrics_df.loc['Weighted Average', 'F1'] = 2 / (1/metrics_df.loc['Weighted Average', "Recall"] +
    #                                                 1/metrics_df.loc['Weighted Average', "Precision"])
    # metrics_df.loc['Average', 'F1'] = 2 / (1/metrics_df.loc['Average', "Recall"] +
    #                                        1/metrics_df.loc['Average', "Precision"])

    weighted_f1_score = metrics_df.loc['weighted avg', 'f1-score']
    averaged_f1_score = metrics_df.loc['macro avg', 'f1-score']

    # Calculate confusion matrix
    tn, fp, fn, tp  = confusion_matrix(y_true, y_pred).ravel()
    cm_df = pd.DataFrame(columns = ['Predicted No', 'Predicted Yes'],  
                   index = ['Actual No', 'Actual Yes']) 
    # adding rows to an empty  
    # dataframe at existing index 
    cm_df.loc['Actual No'] = [tn,fp] 
    cm_df.loc['Actual Yes'] = [fn,tp]

    # pooled_f1_score = f1_score(y_pred=or_y_pred, y_true=or_y_true)

    return ave_loss, metrics_df, cm_df,{
        "weighted": weighted_f1_score,
        "averaged": averaged_f1_score,
    }


def model_test(model, data_loader):
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    num_batch = len(data_loader)
    # Below need to modify if change the input
    review_id, review_label, hmd_text, head_cust_text = [], [], [], []
    agent = []
    pred_logits = []

    # Evaluate data
    for step, batch in tqdm(enumerate(data_loader), desc="Inference...", total=num_batch):
        if "anecdote_lead_final" in batch.keys():
            review_label.extend(batch["anecdote_lead_final"])
        review_id.extend(batch["_id"].tolist())
        hmd_text.extend(batch["hmd_comments"])
        head_cust_text.extend(batch["head_cust"])
        agent.extend(batch["new_transcript_agent"])

        b_review_input_ids = batch["review_input_ids"].to(device)
        b_review_attention_mask = batch["review_attention_mask"].to(device)
        b_review_token_type_ids = batch["review_token_type_ids"].to(device)
        b_agent_input_ids = batch["agent_input_ids"].to(device)
        b_agent_attention_mask = batch["agent_attention_mask"].to(device)
        b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)


        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            (logits,) = model(review_input_ids=b_review_input_ids,
                              review_token_type_ids=b_review_token_type_ids,
                              review_attention_mask=b_review_attention_mask,
                              agent_input_ids=b_agent_input_ids,
                              agent_token_type_ids=b_agent_token_type_ids,
                              agent_attention_mask=b_agent_attention_mask
                              )

        if logits.detach().cpu().numpy().size == 1:
            pred_logits.extend(logits.detach().cpu().numpy().reshape(1,))  
        else:
            pred_logits.extend(logits.detach().cpu().numpy())
            
    # End of an epoch of validation
    # put model to train mode again.
    model.train()
    pred_logits = np.array(pred_logits)
    pred_prob = np.exp(pred_logits)
    pred_prob = pred_prob / (1 + pred_prob)
    pred_label = pred_prob.copy()
    pred_label[pred_label < 0.5] = 0
    pred_label[pred_label >= 0.5] = 1
    # compute the f1 score for each tags
    d = {'Probability':pred_prob,'Anecdotes Prediction':pred_label}
    pred_df = pd.DataFrame(d, columns=['Probability','Anecdotes Prediction'])
    result_df = pd.DataFrame(
        {
            "review_id": review_id,
            "hmd_text": hmd_text,
            "head_cust_text": head_cust_text,
            "agent": agent
        }
    )
    if len(review_label) != 0:
        result_df["review_label"] =  [x.item() for x in review_label] 
    return pd.concat([result_df, pred_df], axis=1).set_index("review_id")

Can anyone help me figure out how to fix this issue? Thank you so much!

Apparently an nn.Embedding layer is pushed to the wrong device.
Based on the posted code I cannot see where the issue might be, so could you try to narrow down the error by removing working parts of the code?
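For example, a quick way to narrow it down (just a temporary debugging sketch, assuming you add it at the top of your forward in model.py; self.bert.embeddings.word_embeddings is the usual attribute path in transformers' BertModel, and review_input_ids is taken from your forward signature):

# temporary debug prints at the top of ReviewClassification.forward
print("word_embeddings weight device:", self.bert.embeddings.word_embeddings.weight.device)
print("review_input_ids device      :", review_input_ids.device)

Under nn.DataParallel both should print the same cuda:N inside each replica; if they differ, that points to the layer that ended up on the wrong device.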

The loss_fct in the forward method might create issues, since you are pushing the pos_weight to the default device in:

loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight).cuda() 

Use .to(outputs.device) instead.

Hi ptrblck,

Thanks so much for your response.

I tried to change the loss_fct part, but it gives me this error:

AttributeError: 'tuple' object has no attribute 'device'

The error message shows up in my model_train function.
This is a function I define that runs the train/validate/test steps. I can see from the progress bar that the training part finishes without any issues. The problem starts with the validation part, which calls this line:

ave_valid_loss, valid_f1_table, cm_table, f1_score = model_validate(model=model, data_loader=valid_data_loader)

And the model_validate function is defined as below:

def model_validate(model, data_loader):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    label_prop = data_loader.dataset.dataset.label_prop()

    total_valid_loss = 0

    batch_size = data_loader.batch_size
    num_batch = len(data_loader)

    y_pred, y_true = [], []

    # Evaluate data
    for step, batch in tqdm(enumerate(data_loader), desc="Validation...", total=num_batch):
        b_review_input_ids = batch["review_input_ids"].to(device)
        b_review_attention_mask = batch["review_attention_mask"].to(device)
        b_review_token_type_ids = batch["review_token_type_ids"].to(device)
        b_agent_input_ids = batch["agent_input_ids"].to(device)
        b_agent_attention_mask = batch["agent_attention_mask"].to(device)
        b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)

        b_binarized_label = batch["binarized_label"].to(device)

        with torch.no_grad():
            (loss, logits,) = model(review_input_ids=b_review_input_ids,
                                    review_attention_mask=b_review_attention_mask,
                                    review_token_type_ids=b_review_token_type_ids,
                                    agent_input_ids=b_agent_input_ids,
                                    agent_attention_mask=b_agent_attention_mask,
                                    agent_token_type_ids=b_agent_token_type_ids,

                                    labels=b_binarized_label)

        total_valid_loss += loss.item()

        transfored_logits = torch.sigmoid(logits)
        numpy_probas = transfored_logits.detach().cpu().numpy()
        y_pred.extend(numpy_probas)
        y_true.extend(b_binarized_label.cpu().numpy())
        
    # End of an epoch of validation

    # put model to train mode again.
    model.train()

    ave_loss = total_valid_loss / (num_batch * batch_size)

    y_pred = np.array(y_pred)
    y_pred[y_pred < 0.5] = 0
    y_pred[y_pred >= 0.5] = 1
    
    # Below is in case the input and target are not the same data format
    y_pred = np.array(y_pred, dtype=np.bool)
    y_true = np.array(y_true, dtype=np.bool)
    
    # compute the various f1 score for each label
    report = classification_report(y_true, y_pred, output_dict=True)
    metrics_df = pd.DataFrame(report).transpose()
    metrics_df = metrics_df.sort_index()

    weighted_f1_score = metrics_df.loc['weighted avg', 'f1-score']
    averaged_f1_score = metrics_df.loc['macro avg', 'f1-score']

    # Calculate confusion matrix
    tn, fp, fn, tp  = confusion_matrix(y_true, y_pred).ravel()
    cm_df = pd.DataFrame(columns = ['Predicted No', 'Predicted Yes'],  
                   index = ['Actual No', 'Actual Yes']) 

    cm_df.loc['Actual No'] = [tn,fp] 
    cm_df.loc['Actual Yes'] = [fn,tp]

    return ave_loss, metrics_df, cm_df,{
        "weighted": weighted_f1_score,
        "averaged": averaged_f1_score,
    }

The traceback:

RuntimeError                              Traceback (most recent call last)
<ipython-input-14-7ca45d5cff01> in <module>
      7 model_train(model=model, train_data_loader=data_loader["train"], valid_data_loader=data_loader["valid"],
      8             test_data_loader=data_loader["test"], optimizer=optimizer, scheduler=scheduler,
----> 9             num_epochs=args.num_epochs, seed=args.seed, logger=logger, out_dir=out_dir)

~/SageMaker/Anecdotes/train_valid_test.py in model_train(model, train_data_loader, valid_data_loader, test_data_loader, logger, optimizer, scheduler, num_epochs, seed, out_dir)
    162         t_valid = time.time()
    163         model.eval()
--> 164         ave_valid_loss, valid_f1_table, cm_table, f1_score = model_validate(model=model, data_loader=valid_data_loader)
    165         # Measure how long this epoch took.
    166         validation_time = format_time(time.time() - t_valid)

~/SageMaker/Anecdotes/train_valid_test.py in model_validate(model, data_loader)
    280                                     agent_token_type_ids=b_agent_token_type_ids,
    281 
--> 282                                     labels=b_binarized_label)
    283 
    284         if num_gpus > 1:

~/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

~/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    153             return self.module(*inputs[0], **kwargs[0])
    154         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 155         outputs = self.parallel_apply(replicas, inputs, kwargs)
    156         return self.gather(outputs, self.output_device)
    157 

~/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
    163 
    164     def parallel_apply(self, replicas, inputs, kwargs):
--> 165         return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
    166 
    167     def gather(self, outputs, output_device):

~/anaconda3/envs/python3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
     83         output = results[i]
     84         if isinstance(output, ExceptionWrapper):
---> 85             output.reraise()
     86         outputs.append(output)
     87     return outputs

~/anaconda3/envs/python3/lib/python3.6/site-packages/torch/_utils.py in reraise(self)
    393             # (https://bugs.python.org/issue2651), so we work around it.
    394             msg = KeyErrorMessage(msg)
--> 395         raise self.exc_type(msg)

Thank you again for your help!

I missed that outputs is a tuple, so use outputs[0].device instead, or the device of any other tensor that was already pushed to the right device.
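
For reference, a minimal sketch of what that could look like inside the posted forward (reusing the names logits, labels, and outputs from the model.py above; adapt as needed):

# inside ReviewClassification.forward, in the `if labels is not None:` branch
pos_weight = torch.tensor(1.0).to(outputs[0].device)    # outputs[0] is the logits tensor on this replica's device
loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)  # no .cuda(), which would pin the weight to the default GPU
loss = loss_fct(logits, labels)
outputs = (loss,) + outputs

This way each DataParallel replica builds the loss on its own device instead of always on the default one.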