I am new to transformer architecture and torch. I am trying to build a model that will generate answers to questions. For simplicity, I am using three sample questions and answers in a pandas dataframe. After finishing the training, it shows an error. I have tried changing many parts of the code, but due to my limited knowledge, I have been stuck with this error for the last two days. The code and errors are provided below. Please help me solve the error.
Code::
pip install -U torchtext==0.6
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import random
import numpy as np
import pandas as pd
data = {
'Question': ['What is the capital of France?', 'Who wrote Hamlet?', 'What is the meaning of life?'],
'Answer': ['Paris', 'William Shakespeare', '42']
}
df = pd.DataFrame(data)
df['Question'] = df['Question'].apply(lambda x: x.lower())
import spacy
nlp = spacy.load('en_core_web_sm')
question_field = Field(tokenize=lambda x: [tok.text for tok in nlp(x)], init_token='<sos>', eos_token='<eos>', lower=True, include_lengths=False, use_vocab=True)
answer_field = Field(tokenize=lambda x: [tok.text for tok in nlp(x)], init_token='<sos>', eos_token='<eos>', lower=True, include_lengths=False, use_vocab=True)
fields = [('Question', question_field), ('Answer', answer_field)]
examples = [torchtext.data.Example.fromlist([df['Question'][i], df['Answer'][i]], fields) for i in range(df.shape[0])]
question_field.build_vocab(df['Question'], min_freq=2)
answer_field.build_vocab(df['Answer'], min_freq=2)
dataset = torchtext.data.Dataset(examples, fields)
train_data, valid_data = dataset.split(split_ratio=0.8)
class TransformerModel(nn.Module):
def __init__(self, input_dim, output_dim, n_layers, n_heads, hidden_dim, dropout):
super().__init__()
self.embedding = nn.Embedding(input_dim, hidden_dim)
self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(hidden_dim, n_heads, hidden_dim, dropout), n_layers)
self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(hidden_dim, n_heads, hidden_dim, dropout), n_layers)
self.fc_out = nn.Linear(hidden_dim, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, src, trg):
src = self.embedding(src)
trg = self.embedding(trg)
src = self.dropout(src)
trg = self.dropout(trg)
encoder_output = self.encoder(src)
decoder_output = self.decoder(trg, encoder_output)
output = self.fc_out(decoder_output)
return output
input_dim = len(question_field.vocab)
output_dim = len(answer_field.vocab)
n_layers = 6
n_heads = 4
hidden_dim = 512
dropout = 0.1
model = TransformerModel(input_dim, output_dim, n_layers, n_heads, hidden_dim, dropout)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64
train_iterator, valid_iterator = BucketIterator.splits(
(train_data, valid_data),
batch_size=BATCH_SIZE,
sort_within_batch=True,
sort_key=lambda x: len(x.Question),
device=device
)
def train(model, iterator, optimizer, criterion):
model.train()
epoch_loss = 0
for batch in iterator:
src = batch.Question
trg = batch.Answer
optimizer.zero_grad()
output = model(src, trg)
output_dim = output.shape[-1]
output = output.view(-1, output_dim)
trg = trg.transpose(0, 1)
trg = torch.tensor([answer_field.vocab.stoi[token.item()] for token in trg.reshape(-1)], dtype=torch.long, device=device)
loss = criterion(output, trg)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
return epoch_loss / len(iterator)
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
train_loss = train(model, train_iterator, optimizer, criterion)
print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}')
# Generate responses
def generate_response(model, question_field, answer_field, question):
model.eval()
tokenized_question = question_field.tokenize(question)
tokenized_question = [question_field.init_token] + tokenized_question + [question_field.eos_token]
numerical_question = [question_field.vocab.stoi[token] for token in tokenized_question]
src_tensor = torch.LongTensor(numerical_question).unsqueeze(0).to(device)
trg_tensor = torch.zeros(100).long().unsqueeze(0).to(device)
with torch.no_grad():
output = model(src_tensor, trg_tensor)
output = output.squeeze(0)
generated_answer = [answer_field.vocab.itos[idx] for idx in output.argmax(1).cpu().numpy()]
generated_answer = generated_answer[1:] # Remove <sos>
if '<eos>' in generated_answer:
generated_answer = generated_answer[:generated_answer.index('<eos>')]
return ' '.join(generated_answer)
# Example usage
question = "What is the capital of France?"
response = generate_response(model, question_field, answer_field, question)
print("Response:", response)
Error::
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-129-bb346f3fdb29> in <cell line: 26>()
24 # Example usage
25 question = "What is the capital of France?"
---> 26 response = generate_response(model, question_field, answer_field, question)
27 print("Response:", response)
14 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in multi_head_attention_forward(query, key, value, embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight, out_proj_bias, training, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight, q_proj_weight, k_proj_weight, v_proj_weight, static_k, static_v, average_attn_weights, is_causal)
5380 q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
5381 if static_k is None:
-> 5382 k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
5383 else:
5384 # TODO finish disentangling control flow so we don't do in-projections when statics are passed
RuntimeError: shape '[1, 400, 128]' is invalid for input of size 4608