Hi, I implemented a simple model-parallel extension of Hugging Face's re-implementation of T5 (transformers v3.0.1). Basically, I have 4 GPUs and a 12-layer model, and I want to run 3 layers on each GPU and finally move the results back to GPU 0. However, the computation fails in the backward pass with an "Expected all tensors to be on the same device…" error. Is this a torch bug? I am using torch version 1.6.0.
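To make the split concrete, here is a rough sketch of the layer-to-GPU mapping I am after (illustration only; the layer count and number of splits are hardcoded to match my setup):

import torch

num_layers = 12
model_parallel_splits = 4
layers_per_gpu = num_layers // model_parallel_splits  # = 3
for i in range(num_layers):
    device = torch.device(f"cuda:{i // layers_per_gpu}")
    print(i, device)  # layers 0-2 -> cuda:0, 3-5 -> cuda:1, 6-8 -> cuda:2, 9-11 -> cuda:3
# After the last layer, the hidden states are moved back to cuda:0 for the rest of the model.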
Here is the error log; note that I print the GPU index before running each transformer layer:
2020-10-29 11:51:50,242 INFO root: Epoch 0
0
0
0
1
1
1
2
2
2
3
3
3
2020-10-29 11:51:59,424 INFO root: Finished after 0.15302967230478923 minutes.
2020-10-29 11:51:59,424 ERROR root: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0!
Exception raised from compute_types at /pytorch/aten/src/ATen/native/TensorIterator.cpp:230 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x2b2a860d71e2 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: at::TensorIterator::compute_types(at::TensorIteratorConfig const&) + 0xd26 (0x2b2a3bf1c316 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #2: at::TensorIterator::build(at::TensorIteratorConfig&) + 0x6b (0x2b2a3bf1efeb in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::TensorIterator::TensorIterator(at::TensorIteratorConfig&) + 0xdd (0x2b2a3bf1f65d in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::TensorIterator::binary_op(at::Tensor&, at::Tensor const&, at::Tensor const&, bool) + 0x14a (0x2b2a3bf1f80a in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #5: at::native::add_out(at::Tensor&, at::Tensor const&, at::Tensor const&, c10::Scalar) + 0x33 (0x2b2a3bc5db23 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0xf1df52 (0x2b2a4c177f52 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0x2e9fad8 (0x2b2a3df38ad8 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x3377258 (0x2b2a3e410258 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #9: torch::autograd::AccumulateGrad::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x38a (0x2b2a3e411aaa in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x3375bb7 (0x2b2a3e40ebb7 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x2b2a3e40a400 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x2b2a3e40afa1 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x2b2a3e403119 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x2b2a39ff94ba in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xc70f (0x2b2a3ae9170f in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #16: <unknown function> + 0x7ea5 (0x2b2a379b3ea5 in /lib64/libpthread.so.0)
frame #17: clone + 0x6d (0x2b2a37cc696d in /lib64/libc.so.6)
2020-10-29 11:51:59,428 ERROR root: Traceback (most recent call last):
File "fit_qa/scripts/reader/run_generative_train_fuse_dec_multigpu.py", line 97, in <module>
r = framework.fit()
File "/home/maskeduser/workaround/QA/fit_qa/scripts/reader/reader/generative_reader_trainer_fusedec.py", line 164, in fit
scheduler=scheduler)
File "/home/maskeduser/workaround/QA/fit_qa/scripts/reader/reader/generative_reader_trainer_fusedec.py", line 264, in train_epoch
raise e
File "/home/maskeduser/workaround/QA/fit_qa/scripts/reader/reader/generative_reader_trainer_fusedec.py", line 234, in train_epoch
loss.backward()
File "/home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/tensor.py", line 185, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/autograd/__init__.py", line 127, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0!
Exception raised from compute_types at /pytorch/aten/src/ATen/native/TensorIterator.cpp:230 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x2b2a860d71e2 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: at::TensorIterator::compute_types(at::TensorIteratorConfig const&) + 0xd26 (0x2b2a3bf1c316 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #2: at::TensorIterator::build(at::TensorIteratorConfig&) + 0x6b (0x2b2a3bf1efeb in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::TensorIterator::TensorIterator(at::TensorIteratorConfig&) + 0xdd (0x2b2a3bf1f65d in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::TensorIterator::binary_op(at::Tensor&, at::Tensor const&, at::Tensor const&, bool) + 0x14a (0x2b2a3bf1f80a in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #5: at::native::add_out(at::Tensor&, at::Tensor const&, at::Tensor const&, c10::Scalar) + 0x33 (0x2b2a3bc5db23 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0xf1df52 (0x2b2a4c177f52 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0x2e9fad8 (0x2b2a3df38ad8 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x3377258 (0x2b2a3e410258 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #9: torch::autograd::AccumulateGrad::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x38a (0x2b2a3e411aaa in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x3375bb7 (0x2b2a3e40ebb7 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x2b2a3e40a400 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x2b2a3e40afa1 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x2b2a3e403119 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x2b2a39ff94ba in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xc70f (0x2b2a3ae9170f in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #16: <unknown function> + 0x7ea5 (0x2b2a379b3ea5 in /lib64/libpthread.so.0)
frame #17: clone + 0x6d (0x2b2a37cc696d in /lib64/libc.so.6)
File "fit_qa/scripts/reader/run_generative_train_fuse_dec_multigpu.py", line 101, in <module>
raise be
File "fit_qa/scripts/reader/run_generative_train_fuse_dec_multigpu.py", line 97, in <module>
r = framework.fit()
File "/home/maskeduser/workaround/QA/fit_qa/scripts/reader/reader/generative_reader_trainer_fusedec.py", line 164, in fit
scheduler=scheduler)
File "/home/maskeduser/workaround/QA/fit_qa/scripts/reader/reader/generative_reader_trainer_fusedec.py", line 264, in train_epoch
raise e
File "/home/maskeduser/workaround/QA/fit_qa/scripts/reader/reader/generative_reader_trainer_fusedec.py", line 234, in train_epoch
loss.backward()
File "/home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/tensor.py", line 185, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/autograd/__init__.py", line 127, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0!
Exception raised from compute_types at /pytorch/aten/src/ATen/native/TensorIterator.cpp:230 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x2b2a860d71e2 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: at::TensorIterator::compute_types(at::TensorIteratorConfig const&) + 0xd26 (0x2b2a3bf1c316 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #2: at::TensorIterator::build(at::TensorIteratorConfig&) + 0x6b (0x2b2a3bf1efeb in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #3: at::TensorIterator::TensorIterator(at::TensorIteratorConfig&) + 0xdd (0x2b2a3bf1f65d in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #4: at::TensorIterator::binary_op(at::Tensor&, at::Tensor const&, at::Tensor const&, bool) + 0x14a (0x2b2a3bf1f80a in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #5: at::native::add_out(at::Tensor&, at::Tensor const&, at::Tensor const&, c10::Scalar) + 0x33 (0x2b2a3bc5db23 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0xf1df52 (0x2b2a4c177f52 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0x2e9fad8 (0x2b2a3df38ad8 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0x3377258 (0x2b2a3e410258 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #9: torch::autograd::AccumulateGrad::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x38a (0x2b2a3e411aaa in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0x3375bb7 (0x2b2a3e40ebb7 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x2b2a3e40a400 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x2b2a3e40afa1 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x2b2a3e403119 in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x2b2a39ff94ba in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xc70f (0x2b2a3ae9170f in /home/maskeduser/miniconda3/envs/qaenv/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #16: <unknown function> + 0x7ea5 (0x2b2a379b3ea5 in /lib64/libpthread.so.0)
frame #17: clone + 0x6d (0x2b2a37cc696d in /lib64/libc.so.6)
0%| | 0/78424 [00:11<?, ?it/s]
Original code is here: https://github.com/huggingface/transformers/blob/fedabcd1545839798004b2b468f191ec2244442f/src/transformers/modeling_t5.py#L734
My model-parallel re-implementation is below (see the lines tagged with # MP; those are the only lines I added):
@staticmethod
def _decode_modelparallel(
    self,
    model_parallel_splits,
    input_ids=None,
    attention_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    inputs_embeds=None,
    head_mask=None,
    past_key_value_states=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
):
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        if self.is_decoder:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

    if inputs_embeds is None:
        assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
        inputs_embeds = self.embed_tokens(input_ids)

    batch_size, seq_length = input_shape

    if past_key_value_states is not None:
        assert seq_length == 1, "Input shape is {}, but should be {} when using past_key_value_states".format(
            input_shape, (batch_size, 1)
        )
        # required mask seq length can be calculated via length of past
        # key value states and seq_length = 1 for the last token
        mask_seq_length = past_key_value_states[0][0].shape[2] + seq_length
    else:
        mask_seq_length = seq_length

    if attention_mask is None:
        attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device)
    if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
        encoder_seq_length = encoder_hidden_states.shape[1]
        encoder_attention_mask = torch.ones(
            batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long
        )

    # initialize past_key_value_states with `None` if past does not exist
    if past_key_value_states is None:
        past_key_value_states = [None] * len(self.block)

    # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    # ourselves in which case we just need to make it broadcastable to all heads.
    extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device)

    if self.is_decoder and encoder_attention_mask is not None:
        encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    head_mask = self.get_head_mask(head_mask, self.config.num_layers)
    present_key_value_states = ()
    all_hidden_states = ()
    all_attentions = ()
    position_bias = None
    encoder_decoder_position_bias = None

    hidden_states = self.dropout(inputs_embeds)

    # MP ###
    total_layers = len(self.block)
    layers_per_gpu = total_layers // model_parallel_splits
    ########
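    # MP note (added for clarity): with 12 decoder layers and model_parallel_splits == 4 as in my setup,
    # layers_per_gpu == 3, so layer i runs on cuda:{i // 3}: layers 0-2 on cuda:0, 3-5 on cuda:1,
    # 6-8 on cuda:2, 9-11 on cuda:3.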
    for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # MP ####
        # Move the current layer and every tensor it consumes onto the GPU that owns this slice of layers.
        current_gpu_idx = i // layers_per_gpu
        if current_gpu_idx > 0:
            layer_module = layer_module.to(torch.device(f"cuda:{current_gpu_idx}"))
            hidden_states = hidden_states.to(torch.device(f"cuda:{current_gpu_idx}"))
            extended_attention_mask = extended_attention_mask.to(torch.device(f"cuda:{current_gpu_idx}"))
            position_bias = position_bias.to(torch.device(f"cuda:{current_gpu_idx}"))
            encoder_hidden_states = encoder_hidden_states.to(torch.device(f"cuda:{current_gpu_idx}"))
            encoder_extended_attention_mask = encoder_extended_attention_mask.to(
                torch.device(f"cuda:{current_gpu_idx}"))
            encoder_decoder_position_bias = encoder_decoder_position_bias.to(
                torch.device(f"cuda:{current_gpu_idx}"))
            if past_key_value_state is not None:
                past_key_value_state = past_key_value_state.to(torch.device(f"cuda:{current_gpu_idx}"))
        #########

        print(layer_module.layer[0].layer_norm.weight.get_device())
        layer_outputs = layer_module(
            hidden_states,
            attention_mask=extended_attention_mask,
            position_bias=position_bias,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            encoder_decoder_position_bias=encoder_decoder_position_bias,
            head_mask=head_mask[i],
            past_key_value_state=past_key_value_state,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # layer_outputs is a tuple with:
        # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
        hidden_states, present_key_value_state = layer_outputs[:2]

        if i == 0:
            # We share the position biases between the layers - the first layer stores them
            # layer_outputs = hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
            position_bias = layer_outputs[3 if output_attentions else 2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 3]
        # append next layer key value states
        present_key_value_states = present_key_value_states + (present_key_value_state,)

        if output_attentions:
            all_attentions = all_attentions + (layer_outputs[2],)  # We keep only self-attention weights for now
    hidden_states = hidden_states.to(torch.device("cuda:0"))  # MP
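    # MP note (added for clarity): from here on, everything stays on cuda:0 (final layer norm, dropout,
    # and the rest of the model, which was never moved off cuda:0, consumes these hidden states).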
    hidden_states = self.final_layer_norm(hidden_states)
    hidden_states = self.dropout(hidden_states)

    # Add last layer
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    outputs = (hidden_states,)
    if use_cache is True:
        assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
        outputs = outputs + (present_key_value_states,)
    if output_hidden_states:
        outputs = outputs + (all_hidden_states,)
    if output_attentions:
        outputs = outputs + (all_attentions,)
    return outputs  # last-layer hidden state, (presents,) (all hidden states), (all attentions)