Tensors Are at Different CUDA Devices

I am using a BERT model based on: https://github.com/huggingface/pytorch-pretrained-BERT

import torch
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel

class MyBertBasedModel(BertPreTrainedModel):
    """BERT model for classification.
    This module is composed of the BERT model with an LSTM and a linear layer
    on top of the sequence output.
    """

    def __init__(self, config, num_labels):
        super(MyBertBasedModel, self).__init__(config)
        self.num_labels = num_labels
        self.bidirectional = False
        self.bi_dim = 2 if self.bidirectional else 1
        self.bert = BertModel(config)
        self.lstm = torch.nn.LSTM(config.hidden_size, config.hidden_size, bidirectional=self.bidirectional)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        # fixed initial hidden/cell state for the LSTM, created once at construction
        # time on the default CUDA device (the 128 is hard-coded)
        self.hidden = (torch.zeros(self.bi_dim, 128, config.hidden_size, device=torch.device('cuda')),
                       torch.zeros(self.bi_dim, 128, config.hidden_size, device=torch.device('cuda')))
        self.apply(self.init_bert_weights)


    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        # with the LSTM on top
        lstm_out, _ = self.lstm(sequence_output, self.hidden)
        logits = self.classifier(lstm_out)
        # without the LSTM (classifier directly on the BERT output):
        # logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = MyLoss()  # custom loss, defined elsewhere
            # keep only the positions covered by the attention mask
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)[active_loss]
            active_labels = labels.view(-1, self.num_labels)[active_loss]
            loss = loss_fct(active_logits, active_labels)
            return loss
        else:
            return logits

I am using .to("cuda") in order to move all four inputs to the GPU:

loss = model(input_ids, segment_ids, input_mask, label_ids)
Note that label_ids is of type torch.float while the others are torch.long (maybe this causes the problem?).
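
For reference, the inputs are moved roughly like this before the call (a minimal sketch; the tensor names match the snippet above, and the batching code is omitted):

device = torch.device("cuda")

# move each batch tensor to the GPU before the forward pass
input_ids = input_ids.to(device)      # torch.long
segment_ids = segment_ids.to(device)  # torch.long
input_mask = input_mask.to(device)    # torch.long
label_ids = label_ids.to(device)      # torch.float

loss = model(input_ids, segment_ids, input_mask, label_ids)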

And I get this error:
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:1 and hidden tensor at cuda:0

When I explicitly specify .to("cuda:0") for all inputs, I get the same error.

When I don't use parallelization (torch.nn.DataParallel(model)), I get a CUDA out of memory error:
RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 10.92 GiB total capacity; 5.61 GiB already allocated; 35.50 MiB free; 122.32 MiB cached)
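
For context, when parallelization is on, the model is wrapped roughly like this (a minimal sketch; the checkpoint name "bert-base-uncased" and the num_labels value are assumptions, since the actual setup code is not shown above):

num_labels = 2  # example value
model = MyBertBasedModel.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.to(torch.device("cuda"))

# DataParallel replicates the module on every visible GPU and scatters each
# input batch along dim 0, so the replicas run on cuda:0, cuda:1, ...
model = torch.nn.DataParallel(model)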

The full log:

Traceback (most recent call last):
  File "/home/nlp/noalu/sync_to_pycharm/BERT-NER/main.py", line 655, in <module>
    main()
  File "/home/nlp/noalu/sync_to_pycharm/BERT-NER/main.py", line 548, in main
    loss = model(input_ids, segment_ids, input_mask, label_ids)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 152, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 162, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in parallel_apply
    raise output
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 59, in _worker
    output = module(*input, **kwargs)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/nlp/noalu/sync_to_pycharm/BERT-NER/bert.py", line 54, in forward
    lstm_out, _ = self.lstm(sequence_output, self.hidden)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/modules/rnn.py", line 559, in forward
    return self.forward_tensor(input, hx)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/modules/rnn.py", line 539, in forward_tensor
    output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
  File "/home/nlp/noalu/anaconda2/envs/py36/lib/python3.6/site-packages/torch/nn/modules/rnn.py", line 522, in forward_impl
    self.dropout, self.training, self.bidirectional, self.batch_first)
RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:1 and hidden tensor at cuda:0

Process finished with exit code 1

Solution by Yanai Elazar:

You can define an environment variable like this:
CUDA_VISIBLE_DEVICES=1

This way, only that GPU is visible to the running program, and you won't leak onto other GPUs. In the code you then just run on a single GPU, without specifying a particular device.
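
A minimal sketch of the two usual ways to set it (the script name main.py is taken from the traceback above):

# Option 1: from the shell, for a single run
#   CUDA_VISIBLE_DEVICES=1 python main.py

# Option 2: at the very top of main.py, before torch initializes CUDA
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
print(torch.cuda.device_count())  # prints 1; "cuda" now refers to the selected GPU

With only one device visible, the torch.nn.DataParallel wrapper can simply be dropped and the model moved to "cuda".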