Hello, I tried to load a model that is larger than a single GPU's memory, so I split its layers into four parts across my 4 GPUs. After that, I sent a query to the model and got an error on the line
x = embed_tokens(input_ids)
My source code is below:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the tokenizer and the Gemma model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="google/gemma-7b-it", token='hf_xxxxxx')
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it")
# Assume the model's layers live in model.model.layers
layers = list(model.model.layers.children())
# Assign the model's layers to different GPUs
part1 = torch.nn.Sequential(*layers[:len(layers)//4]).to('cuda:0')                  # first quarter of the layers to GPU 0
part2 = torch.nn.Sequential(*layers[len(layers)//4:len(layers)//2]).to('cuda:1')    # second quarter to GPU 1
part3 = torch.nn.Sequential(*layers[len(layers)//2:3*len(layers)//4]).to('cuda:2')  # third quarter to GPU 2
part4 = torch.nn.Sequential(*layers[3*len(layers)//4:]).to('cuda:3')                # last quarter to GPU 3
# Handle the embedding layer and the output layers
embed_tokens = model.model.embed_tokens.to('cuda:0')  # embedding layer on GPU 0
norm = model.model.norm.to('cuda:3')                  # final normalization layer on GPU 3
lm_head = model.lm_head.to('cuda:3')                  # output (LM head) layer on GPU 3
# Prepare the input data
input_text = "What are the macronutrients, and what roles do they play in the human body?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to('cuda:0')
# Forward pass: run each part of the model on its GPU in turn, moving the tensor between GPUs at each step
with torch.no_grad():
    # Embedding lookup on GPU 0
    x = embed_tokens(input_ids)
    # First quarter of the layers on GPU 0
    x = part1(x)
    # Move the activations from GPU 0 to GPU 1
    x = x.to('cuda:1')
    # Second quarter of the layers on GPU 1
    x = part2(x)
    # Move the activations from GPU 1 to GPU 2
    x = x.to('cuda:2')
    # Third quarter of the layers on GPU 2
    x = part3(x)
    # Move the activations from GPU 2 to GPU 3
    x = x.to('cuda:3')
    # Last quarter of the layers on GPU 3
    x = part4(x)
    # Final normalization and output head on GPU 3
    x = norm(x)
    output = lm_head(x)
# Convert the output back to text
generated_text = tokenizer.decode(output.argmax(dim=-1)[0], skip_special_tokens=True)
print(f"Generated text:\n{generated_text}")
I would really appreciate some help with this! Thanks in advance.